diff --git a/.changeset/odysseysbench-eval-suite.md b/.changeset/odysseysbench-eval-suite.md new file mode 100644 index 0000000000..6d9b5c3f35 --- /dev/null +++ b/.changeset/odysseysbench-eval-suite.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand-evals": minor +--- + +Add OdysseysBench as a supported agent benchmark in the evals CLI. OdysseysBench is a 200-task web-agent benchmark (45 easy / 46 medium / 109 hard); each task ships a weighted rubric that is baked into the verifier's `precomputed_rubric` format so process + outcome are scored against the published criteria. Run with `--eval-name agent/odysseysbench` (or the `external_agent_benchmarks` category); supports `EVAL_ODYSSEYSBENCH_LIMIT`, `EVAL_ODYSSEYSBENCH_SAMPLE`, `EVAL_ODYSSEYSBENCH_LEVEL`, and `EVAL_ODYSSEYSBENCH_IDS`. diff --git a/packages/evals/cli-legacy.ts b/packages/evals/cli-legacy.ts index baf907c7ac..fb4602ae73 100644 --- a/packages/evals/cli-legacy.ts +++ b/packages/evals/cli-legacy.ts @@ -100,6 +100,7 @@ const CATEGORY_OVERRIDES: Record = { "agent/webvoyager": ["external_agent_benchmarks"], "agent/onlineMind2Web": ["external_agent_benchmarks"], "agent/webtailbench": ["external_agent_benchmarks"], + "agent/odysseysbench": ["external_agent_benchmarks"], }; /** @@ -681,6 +682,7 @@ function handleRun(args: string[]): void { osworld: "agent/osworld", onlineMind2Web: "agent/onlineMind2Web", webtailbench: "agent/webtailbench", + odysseysbench: "agent/odysseysbench", }; evalName = benchmarkMap[benchmarkName]; diff --git a/packages/evals/datasets/odysseysbench/OdysseysBench_data.jsonl b/packages/evals/datasets/odysseysbench/OdysseysBench_data.jsonl new file mode 100644 index 0000000000..46d48b6f0d --- /dev/null +++ b/packages/evals/datasets/odysseysbench/OdysseysBench_data.jsonl @@ -0,0 +1,200 @@ +{"task_id":"440ed7f388a2a4528a8d9fb75f83e11f934b5b5d","confirmed_task":"I’m putting together a small TV watchlist and want to anchor it around The Pitt first, so please go to Hulu and open the actual show 
page for The Pitt to confirm what service it’s on, then leave that tab open so I can see the listing myself. Once you’ve confirmed that, use Wikipedia to look up the TV series Ponies and pull the main cast names from the series page so I can compare who’s in a different show; if the cast is listed on the page, open the Ponies article itself and keep that tab available too just so I can glance at it. Then round out the watchlist with something older by going to Memory Alpha and finding the entry for Amok Time, and grab the key details from that page including which Star Trek series it belongs to, the season and episode number, and the original air date. Please give me everything back in one concise summary with the streaming service for The Pitt, the Ponies cast list, and the Amok Time details, and keep the Hulu and Memory Alpha pages open in separate tabs so I have visual proof.","website":"https://www.google.com","level":"easy","reference_length":4,"categories":["Arts & Entertainment > Streaming & Online TV","Reference Materials > Dictionaries and Encyclopedias"],"precomputed_rubric":{"items":[{"criterion":"The agent opens the Hulu page for “The Pitt,” confirms the service as Hulu, and preserves the page as visual proof.","description":"The agent opens the Hulu page for “The Pitt,” confirms the service as Hulu, and preserves the page as visual proof.\n\nHow a grader verifies this: A grader can see the Hulu title page for “The Pitt” open in a browser tab and the final answer states the service is Hulu.","max_points":300},{"criterion":"The agent extracts the main cast members for the TV series “Ponies” from the Wikipedia series page exactly as shown there.","description":"The agent extracts the main cast members for the TV series “Ponies” from the Wikipedia series page exactly as shown there.\n\nHow a grader verifies this: A grader can inspect the open Wikipedia article for “Ponies” and compare the listed cast names against the final 
answer.","max_points":300},{"criterion":"The agent reports the Memory Alpha details for “Amok Time,” including the Star Trek series, season and episode number, and original air date.","description":"The agent reports the Memory Alpha details for “Amok Time,” including the Star Trek series, season and episode number, and original air date.\n\nHow a grader verifies this: A grader can view the open Memory Alpha page for “Amok Time” and confirm the final answer includes the correct series title, season/episode notation, and air date.","max_points":250},{"criterion":"The agent provides one concise combined summary covering the Hulu service for The Pitt, the Ponies cast list, and the Amok Time episode details.","description":"The agent provides one concise combined summary covering the Hulu service for The Pitt, the Ponies cast list, and the Amok Time episode details.\n\nHow a grader verifies this: The final response is a single concise summary containing all three requested result groups without omitting any required field.","max_points":150}]}} +{"task_id":"2cb0ed2a5df6053c6c982a5c5d436d25e006370f","confirmed_task":"I’m putting together a really simple Baltimore event night plan and just want the official pages open so I know I’m starting from the right places. Please go to the official Please Find Your Seat site and grab the homepage URL for me, because that’s the event platform I want to use as the anchor for the night. Then find the official Pier 5 Hotel Baltimore website URL so I have a nearby hotel option tied to the same outing, and leave that hotel page open in its own tab so I can look at it afterward. After that, open The Capital Grille’s official homepage and note the restaurant brand name exactly as it appears there, since I want a recognizable dinner option to mention alongside the hotel and event plan. 
Keep the key pages open in separate tabs and give me a short planning summary with the Please Find Your Seat homepage URL, the Pier 5 Hotel Baltimore official URL, and The Capital Grille homepage URL plus the brand name shown on the page.","website":"https://www.google.com","level":"easy","reference_length":4,"categories":["Community and Society > Community and Society - Other","Travel and Tourism > Accommodation and Hotels","Food and Drink > Restaurants and Delivery"],"precomputed_rubric":{"items":[{"criterion":"Provide the official homepage URL for the Please Find Your Seat event platform.","description":"Provide the official homepage URL for the Please Find Your Seat event platform.\n\nHow a grader verifies this: Grader confirms the returned URL is the official Please Find Your Seat homepage and matches the page opened in the browser.","max_points":270},{"criterion":"Provide the official website URL for Pier 5 Hotel Baltimore and keep the hotel page open in its own tab.","description":"Provide the official website URL for Pier 5 Hotel Baltimore and keep the hotel page open in its own tab.\n\nHow a grader verifies this: Grader confirms the returned URL is the official Pier 5 Hotel Baltimore site and that the browser shows the hotel page open.","max_points":280},{"criterion":"Provide The Capital Grille official homepage URL and identify the restaurant brand name shown on the homepage.","description":"Provide The Capital Grille official homepage URL and identify the restaurant brand name shown on the homepage.\n\nHow a grader verifies this: Grader confirms the URL is The Capital Grille official homepage and the reported brand name matches visible text on the page.","max_points":280},{"criterion":"Return all four findings together in a short planning summary covering the event platform, hotel, dinner option.","description":"Return all four findings together in a short planning summary covering the event platform, hotel, dinner option.\n\nHow a grader verifies this: 
Grader confirms the final response includes the Please Find Your Seat URL, Pier 5 Hotel Baltimore URL, The Capital Grille URL plus brand name.","max_points":170}]}} +{"task_id":"082aa17f3e88c3ce10796244e3677c5643dd19c9","confirmed_task":"I’m setting up a new kitchen and want one of the first things I make to be roasted Brussels sprouts, so could you start on Google and find me a recipe that clearly uses both Parmesan and balsamic vinegar, then open the actual recipe page and note the title, oven temperature, and cook time because I want to make sure the cookware I buy fits that kind of roasting setup. Once you’ve got that recipe open, head to Le Creuset and look for a light green Dutch oven, and specifically check whether the 5.5 qt size is offered in that color so I can see if it would work for recipes in that range; please open the product page itself and leave it open so I can look at the color and size options on the page. While you’re at it, I’m also sorting out kitchen appliances before I start cooking, so go to YouTube and find a practical video about whether a dishwasher installation needs an air gap, open the video page, start playing it, and tell me the main decision points like when an air gap is required, when a high loop is used instead, and why the air gap exists in the first place. 
Please keep the recipe tab, the Le Creuset product tab, and the YouTube video tab open in separate tabs so I can compare everything visually afterward.","website":"https://www.google.com","level":"easy","reference_length":4,"categories":["Food and Drink > Cooking and Recipes","Ecommerce & Shopping > Ecommerce and Shopping - Other","Home and Garden > Home Improvement and Maintenance"],"precomputed_rubric":{"items":[{"criterion":"A roasted Brussels sprouts recipe was found and the actual recipe page shows both Parmesan and balsamic vinegar, along with the recipe title, oven temperature, and cook time.","description":"A roasted Brussels sprouts recipe was found and the actual recipe page shows both Parmesan and balsamic vinegar, along with the recipe title, oven temperature, and cook time.\n\nHow a grader verifies this: Grader can confirm the open recipe page contains the recipe title and visible recipe details, and the ingredients or recipe text visibly includes Parmesan and balsamic vinegar.","max_points":350},{"criterion":"A light green Le Creuset Dutch oven product page was opened and the agent correctly confirmed whether the 5.5 qt size is available in that color, including product name and color.","description":"A light green Le Creuset Dutch oven product page was opened and the agent correctly confirmed whether the 5.5 qt size is available in that color, including product name and color.\n\nHow a grader verifies this: Grader can inspect the open Le Creuset product page and see the product name, the selected or referenced light green color, and visible size options or product details indicating whether 5.5 qt is available.","max_points":300},{"criterion":"A relevant YouTube video about dishwasher air gaps was opened and played, and the summary captures when an air gap is required, the high loop alternative, and why an air gap is used.","description":"A relevant YouTube video about dishwasher air gaps was opened and played, and the summary captures when an air 
gap is required, the high loop alternative, and why an air gap is used.\n\nHow a grader verifies this: Grader can confirm the YouTube video page is open with playback started and that the reported summary matches the topic and covers requirement conditions, alternatives, and purpose.","max_points":250},{"criterion":"The browser state preserves visual proof by keeping the recipe page, Le Creuset product page, and YouTube video page open in separate tabs.","description":"The browser state preserves visual proof by keeping the recipe page, Le Creuset product page, and YouTube video page open in separate tabs.\n\nHow a grader verifies this: Grader can inspect the browser tab bar and confirm all three relevant pages remain open for side-by-side review.","max_points":100}]}} +{"task_id":"1211fbaa646424ab75869c0379431d5d049d2c9b","confirmed_task":"I’m trying to build a Father’s Day gift shortlist for someone who likes practical tools but also wears casual, outdoorsy stuff, so can you help me compare a few very different options in the browser and keep the promising pages open in separate tabs so I can look at them afterward? Start on Home Depot and pull up the exact product page for Milwaukee model 48-11-2450, then note the current price because I want to use that battery as the practical baseline gift. After that, go to Tecovas and find one men’s cowboy boot made of real leather that actually looks like it could still work with casual athletic wear, not just full western styling, and open the actual product page so I can see the photos and color choices. Then check Foot Locker for the Nike Ja 3 and tell me the current price and whether it comes in under $60, since I’m trying to see if that’s the budget footwear option compared with the Tecovas boot. 
Once you’ve seen those prices, go to lululemon’s men’s We Made Too Much section and pick one breathable men’s athletic item that would pair well with whichever footwear option seems more realistic based on the earlier pricing, and leave that product page open too. Finally, on Old Navy, find one men's jogger that looks comfortable and affordable just so I have a separate apparel reference point for overall gift-value shopping, and open the actual listing page. At the end, give me a concise shortlist with each item’s name, price, link, and a quick note on why it fits the overall gift plan.","website":"https://www.google.com","level":"easy","reference_length":5,"categories":["Ecommerce & Shopping > Ecommerce and Shopping - Other","Lifestyle > Fashion and Apparel"],"precomputed_rubric":{"items":[{"criterion":"The exact Home Depot product page for Milwaukee model 48-11-2450 is opened and the current listed price is reported.","description":"The exact Home Depot product page for Milwaukee model 48-11-2450 is opened and the current listed price is reported.\n\nHow a grader verifies this: Grader can confirm the Home Depot tab shows model 48-11-2450 on the product page and that the response includes the matching price and link.","max_points":200},{"criterion":"A Tecovas men's cowboy boot made of real leather is selected from its actual product page, with the product name, current price, and page link reported.","description":"A Tecovas men's cowboy boot made of real leather is selected from its actual product page, with the product name, current price, and page link reported.\n\nHow a grader verifies this: Grader can confirm the Tecovas tab is a men's boot product page indicating leather construction, and that the response includes the boot name, its listed price, and the product page link.","max_points":200},{"criterion":"The Foot Locker Nike Ja 3 product page is used to report the current price and explicitly state whether it is under $60.","description":"The Foot Locker 
Nike Ja 3 product page is used to report the current price and explicitly state whether it is under $60.\n\nHow a grader verifies this: Grader can confirm the Foot Locker tab shows a Nike Ja 3 listing and that the response includes the visible price plus a correct under-$60 judgment.","max_points":200},{"criterion":"One men’s breathable lululemon We Made Too Much item is chosen with its name and sale price, and the choice is aligned with whichever footwear option seems more realistic from earlier pricing.","description":"One men’s breathable lululemon We Made Too Much item is chosen with its name and sale price, and the choice is aligned with whichever footwear option seems more realistic from earlier pricing.\n\nHow a grader verifies this: Grader can confirm the lululemon tab is in Men’s We Made Too Much or a product reached from it, shows a sale price, and the final note references the earlier Tecovas vs. Nike Ja 3 price comparison.","max_points":250},{"criterion":"One Old Navy men's jogger is selected from its product page with the product name and current price reported.","description":"One Old Navy men's jogger is selected from its product page with the product name and current price reported.\n\nHow a grader verifies this: Grader can confirm the Old Navy tab is an actual men's jogger product listing and that the response includes the matching name, price, and link.","max_points":150}]}} +{"task_id":"156e2acc95361db4145f5bc313abb63807750089","confirmed_task":"I’m helping a veteran’s family pull together one practical benefits note while they wait on a few moving pieces, and I want it grounded in the actual pages so I can double-check things later. Please start on DFAS and find the exact form a surviving spouse would use to begin an SBP annuity claim, then open the actual form or claim page so you can confirm the form name and number and leave that DFAS page open as proof. 
Once you have that survivor-benefit piece, go to Reddit and look through VA benefits_claims discussions about getting more detail on an in-progress VA claim than the normal VA.gov tracker shows, especially references to the benefits_claims API endpoint or similar methods people are using, and open the most useful discussion thread in its own tab so I can see the comments myself. After that, use Google to get to the IRS guidance on cash gifts and figure out whether receiving money from relatives is taxable, who would be responsible for any gift tax reporting, and when a gift tax return is required, because the family may need temporary help while waiting on benefits; please open the actual IRS page, not just a summary site. Then use Google again to find a reliable source explaining what happens to SNAP benefits during a U.S. federal government shutdown, so we can understand whether food assistance usually continues, whether there are timing risks or exceptions, and what the practical takeaway is for monthly planning; open the source page you rely on so I can verify it. 
In the end, give me one short plain-language brief that starts with the SBP form name and number, then covers the VA claim-status tip from Reddit, then the IRS cash-gift guidance, and ends with the SNAP shutdown impact.","website":"https://www.google.com","level":"easy","reference_length":5,"categories":["Law and Government > Government","Finance > Insurance"],"precomputed_rubric":{"items":[{"criterion":"Identify the exact DFAS form needed to start an SBP spouse annuity claim, including the correct form name and form number.","description":"Identify the exact DFAS form needed to start an SBP spouse annuity claim, including the correct form name and form number.\n\nHow a grader verifies this: Grader can confirm the response matches the DFAS page left open showing the SBP spouse annuity claim form details.","max_points":280},{"criterion":"Summarize a Reddit-sourced method for getting additional status information on an in-progress VA claim beyond the standard tracker, including what extra details the benefits_claims API endpoint or similar approach can reveal.","description":"Summarize a Reddit-sourced method for getting additional status information on an in-progress VA claim beyond the standard tracker, including what extra details the benefits_claims API endpoint or similar approach can reveal.\n\nHow a grader verifies this: Grader can inspect the open Reddit discussion tab and verify the summary reflects the thread’s discussion of added claim-status detail beyond the normal tracker.","max_points":240},{"criterion":"Summarize official IRS guidance on cash gifts, including whether the recipient owes tax, whether the giver may have gift-tax responsibility, and when a gift tax return is required.","description":"Summarize official IRS guidance on cash gifts, including whether the recipient owes tax, whether the giver may have gift-tax responsibility, and when a gift tax return is required.\n\nHow a grader verifies this: Grader can compare the response against the 
open IRS page reached via Google and confirm the tax responsibility and filing-threshold explanation is accurate.","max_points":220},{"criterion":"Summarize reliable information on how a U.S. federal government shutdown affects SNAP benefits, including whether benefits generally continue and any exceptions, delays, or timing risks.","description":"Summarize reliable information on how a U.S. federal government shutdown affects SNAP benefits, including whether benefits generally continue and any exceptions, delays, or timing risks.\n\nHow a grader verifies this: Grader can review the open source page found through Google and confirm the answer accurately reflects continuity of benefits and any caveats.","max_points":160},{"criterion":"Return the findings as one short plain-language brief that begins with the SBP form name/number and then covers the VA claim-status tip, IRS cash-gift guidance, and SNAP shutdown impact in that order.","description":"Return the findings as one short plain-language brief that begins with the SBP form name/number and then covers the VA claim-status tip, IRS cash-gift guidance, and SNAP shutdown impact in that order.\n\nHow a grader verifies this: Grader can check the final response structure, ordering, and brevity against the requested sequence and confirm all four topics are included.","max_points":100}]}} +{"task_id":"0ab48db6076089bbcf42047d162009a50eb9ca50","confirmed_task":"I’m trying to put together a simple, semi-healthy meal prep menu for the week, mostly so I can batch-cook lunches and have a couple of make-ahead desserts ready too. Could you start on Google and search for an easy Instant Pot lentils recipe, then open the actual recipe page and pull out the basics I’d need for meal prep: what type of lentils it uses, the water-to-lentil ratio, and the cook time? Please leave that recipe tab open so I can glance at it later. 
Then go to Ambitious Kitchen and find a healthy turkey chili recipe that includes extra vegetables, because I want something hearty to pair with the lentils for lunches; open the recipe itself and note the exact recipe title plus at least two added vegetables from the ingredients list, and keep that page open in its own tab too so I can compare the two recipes side by side. After that, on Feel Good Foodie, look up recipes for halva, chia pudding, and pecan bars, open the actual recipe page for each one in separate tabs, and grab the page URLs so I have the real recipe links. Once you’ve seen all three, pick the two dessert options that seem most practical for make-ahead prep and tell me which two you’d choose. At the end, give me one concise meal prep summary that explains the lentil base, the turkey chili pairing, and the two dessert picks with their Feel Good Foodie recipe URLs.","website":"https://www.google.com","level":"easy","reference_length":4,"categories":["Food and Drink > Cooking and Recipes"],"precomputed_rubric":{"items":[{"criterion":"Identify an easy Instant Pot lentils recipe found via Google and capture the lentil type, water ratio, and cook time from the opened recipe page.","description":"Identify an easy Instant Pot lentils recipe found via Google and capture the lentil type, water ratio, and cook time from the opened recipe page.\n\nHow a grader verifies this: Grader can confirm a Google search was used, an actual recipe page was opened and left available, and the reported lentil type, ratio, and cook time match visible recipe instructions.","max_points":280},{"criterion":"Find a healthy turkey chili recipe on Ambitious Kitchen, open the recipe page, and record the exact recipe title plus at least two vegetables from the ingredients list.","description":"Find a healthy turkey chili recipe on Ambitious Kitchen, open the recipe page, and record the exact recipe title plus at least two vegetables from the ingredients list.\n\nHow a grader verifies 
this: Grader can verify the browser is on an Ambitious Kitchen recipe page, the title matches the visible page title, and at least two reported vegetables appear in the ingredients list.","max_points":240},{"criterion":"Locate Feel Good Foodie recipe pages for halva, chia pudding, and pecan bars, open each in its own tab, and capture the correct recipe page URL for each.","description":"Locate Feel Good Foodie recipe pages for halva, chia pudding, and pecan bars, open each in its own tab, and capture the correct recipe page URL for each.\n\nHow a grader verifies this: Grader can confirm three Feel Good Foodie recipe tabs are open for halva, chia pudding, and pecan bars, and the provided URLs correspond to those visible pages.","max_points":240},{"criterion":"Produce a concise meal prep summary that explains the lentil base, the turkey chili pairing, and selects the two most practical make-ahead dessert options from the Feel Good Foodie results with their URLs.","description":"Produce a concise meal prep summary that explains the lentil base, the turkey chili pairing, and selects the two most practical make-ahead dessert options from the Feel Good Foodie results with their URLs.\n\nHow a grader verifies this: Grader can compare the final summary against the gathered recipe details and confirm that exactly two dessert choices were selected from the three Feel Good Foodie pages and that their URLs are included.","max_points":240}]}} +{"task_id":"9cf9ea2af003a1efe18b079f18f4824cc581ccb0","confirmed_task":"I’m putting together a super short CapCut quick-start note for a friend who edits on a Mac and is totally new to CapCut Desktop, so could you help me pull the pieces together in a way that follows a real beginner workflow? Start on Google and find a beginner-friendly source that shows how to make a plain black screen clip in CapCut Desktop, then open the actual result page and keep it open so I can see the instructions myself. 
After that, still using Google, find a clear source that explains how to reorder or move clips around in the CapCut timeline, because I want the note to explain how to drop that black screen into the right spot in a project; open that source in its own tab too so I can compare the two instructions side by side. Then go to CapCut’s own website and find their instructions for adding curved text, since I want one slightly more advanced text trick in the same note, and leave the CapCut page open as proof of the official steps. Finally, go to Apple Support and find Apple’s official instructions for calibrating a Mac display with Display Calibrator Assistant, because if the screen is off then the black screen and text styling can look wrong; open the Apple page and pull out the key steps for actually running the assistant. When you’re done, give me one compact how-to note that ties these together as a simple workflow: make the black screen, move it into place on the timeline, add curved text if needed, and then calibrate the Mac display if colors or contrast look off.","website":"https://www.google.com","level":"easy","reference_length":5,"categories":["Computers Electronics and Technology > Graphics Multimedia and Web Design","Science and Education > Education"],"precomputed_rubric":{"items":[{"criterion":"A beginner-friendly source for creating a blank black screen clip in CapCut Desktop is found, opened, and summarized accurately.","description":"A beginner-friendly source for creating a blank black screen clip in CapCut Desktop is found, opened, and summarized accurately.\n\nHow a grader verifies this: Grader can confirm an open non-Google instruction page showing CapCut black screen creation guidance and a matching summary in the final note.","max_points":240},{"criterion":"A clear source for reordering or moving clips in the CapCut timeline is found, opened in its own tab, and summarized so the black screen can be positioned correctly.","description":"A clear 
source for reordering or moving clips in the CapCut timeline is found, opened in its own tab, and summarized so the black screen can be positioned correctly.\n\nHow a grader verifies this: Grader can confirm a separate open instruction page about moving or reordering CapCut timeline clips and a summary that explains repositioning the black screen clip.","max_points":220},{"criterion":"CapCut’s own site is used to find and summarize the official steps for adding curved text to a video project.","description":"CapCut’s own site is used to find and summarize the official steps for adding curved text to a video project.\n\nHow a grader verifies this: Grader can confirm an open CapCut domain page with curved text guidance and a final summary that clearly attributes the curved text process to CapCut’s official site.","max_points":220},{"criterion":"Apple’s official instructions for running Display Calibrator Assistant on a Mac are found and the key steps are summarized accurately.","description":"Apple’s official instructions for running Display Calibrator Assistant on a Mac are found and the key steps are summarized accurately.\n\nHow a grader verifies this: Grader can confirm an open Apple Support page about display calibration and a summary including the main steps to launch and use Display Calibrator Assistant.","max_points":180},{"criterion":"The final response is one compact quick-start note that combines the black screen, clip reordering, curved text, and Mac display calibration guidance into a single simple editing workflow.","description":"The final response is one compact quick-start note that combines the black screen, clip reordering, curved text, and Mac display calibration guidance into a single simple editing workflow.\n\nHow a grader verifies this: Grader can confirm the final output is a single concise note that integrates all four topics in a logical sequence rather than listing them as unrelated facts.","max_points":140}]}} 
+{"task_id":"65c9e7d383e015d946572f04512d5aa166a8f015","confirmed_task":"I’m putting together a quick, accessible study workflow for myself on latent growth models, and I want it to feel grounded in what I’d actually see in a browser rather than just a generic summary. Please start on Bing and search for “Latent Growth Models,” then look only at the main organic web results on the first page and pull out the top three with their titles, source sites, and a one-line note on what each seems to cover, because I want to know which sources are most visible right away. Open those three results in separate tabs and leave the search results page open too so I can compare them afterward. Once you’ve got that context, use Google to find one solid ChatGPT prompt template for turning source material into an outline, ideally from a page that actually shows the full prompt wording, because I want to reuse that structure for my own study note; open the page with the prompt and copy the exact template text. After that, go to 10015.io and use its bionic reading converter on this exact note text: “Latent growth models are statistical methods used to estimate change over time across individuals. They help researchers understand trajectories, differences in growth, and predictors of change.” I want the converted version in bionic reading style so it’s easier for me to scan, and please leave the converter result visible on the page as proof. 
Then finish by using Bing to find a free plagiarism checker that would work for a short note like this, open the actual tool page so I can verify it’s live, and tell me briefly how I’d paste in that same note and run an originality check before sharing it with classmates.","website":"https://www.google.com","level":"easy","reference_length":4,"categories":["Science and Education > Science and Education - Other","Science and Education > Education"],"precomputed_rubric":{"items":[{"criterion":"Identify the top 3 organic Bing results for the query “Latent Growth Models,” including each result’s title, source, and a brief summary of what it covers.","description":"Identify the top 3 organic Bing results for the query “Latent Growth Models,” including each result’s title, source, and a brief summary of what it covers.\n\nHow a grader verifies this: Grader confirms the Bing results page shows the query, the response lists three organic results matching visible titles/source labels from the page, and the relevant result tabs were opened.","max_points":350},{"criterion":"Find one strong ChatGPT prompt template for turning source material into an outline and provide the exact prompt wording from the selected page.","description":"Find one strong ChatGPT prompt template for turning source material into an outline and provide the exact prompt wording from the selected page.\n\nHow a grader verifies this: Grader confirms a Google search was performed, a source page containing a visible prompt template was opened, and the returned wording matches the prompt shown on that page.","max_points":250},{"criterion":"Convert the provided latent growth models note into bionic reading format and return the converted text.","description":"Convert the provided latent growth models note into bionic reading format and return the converted text.\n\nHow a grader verifies this: Grader confirms the 10015.io converter page shows the input note and a visible converted output in bionic reading 
style, and the response includes the transformed text.","max_points":200},{"criterion":"Find one free plagiarism checker and explain how to use it to test the same study note for originality.","description":"Find one free plagiarism checker and explain how to use it to test the same study note for originality.\n\nHow a grader verifies this: Grader confirms a Bing search for plagiarism checkers was performed, a live tool page was opened, and the response names the tool and gives concise usage instructions tied to the provided note.","max_points":200}]}} +{"task_id":"62689a0f29cfc6bd0b4510c240dec4de17a361e5","confirmed_task":"I want to send a friend a quick little roundup that mixes pop-culture, memes, and actual news, so could you start on Reddit and find a discussion thread about Coraline, open the actual thread page, and grab the exact thread title for me because I want the wording to be precise. While you’re still on Reddit, go over to r/starterpacks and pick two recent meme-style posts that feel funny enough to sit next to the Coraline item, and open each of those posts in its own tab so I can compare them later and make sure they’re really the kind of starterpack jokes I’d send to someone. Then switch to CNN and find the latest breaking news story about Iran, open the article itself, and note the exact headline plus the publication time so I can include one current-events item that’s clearly sourced. 
In the end, give me a short, friend-ready update that includes the Coraline thread title, the two r/starterpacks post titles, the CNN Iran headline with its publication time, and a natural sounding summary of these.","website":"https://www.google.com","level":"easy","reference_length":4,"categories":["Computers Electronics and Technology > Social Media Networks","Arts & Entertainment > Arts and Entertainment - Other","News & Media Publishers"],"precomputed_rubric":{"items":[{"criterion":"The response includes the exact title of an opened Reddit discussion thread about Coraline.","description":"The response includes the exact title of an opened Reddit discussion thread about Coraline.\n\nHow a grader verifies this: Grader can confirm the title matches the text visible on the Reddit thread page.","max_points":250},{"criterion":"The response includes the titles of two recent r/starterpacks posts that are clearly meme-style/funny posts, and both selected posts are opened in separate tabs.","description":"The response includes the titles of two recent r/starterpacks posts that are clearly meme-style/funny posts, and both selected posts are opened in separate tabs.\n\nHow a grader verifies this: Grader can confirm the two visible Reddit post pages or tabs correspond to r/starterpacks posts and that their titles match the reported titles.","max_points":310},{"criterion":"The response includes CNN’s latest breaking news story about Iran with both the exact headline and the publication time.","description":"The response includes CNN’s latest breaking news story about Iran with both the exact headline and the publication time.\n\nHow a grader verifies this: Grader can verify the opened CNN article page shows the reported headline and publication time and that it is the latest applicable Iran breaking news item found on CNN.","max_points":310},{"criterion":"All gathered items are combined into a short, friend-ready mixed-interest update.","description":"All gathered items are 
combined into a short, friend-ready mixed-interest update.\n\nHow a grader verifies this: Grader can confirm the final write-up includes all four content elements in a concise roundup format suitable to send to a friend.","max_points":130}]}} +{"task_id":"39255449e341c41a589b8a4e17f073be3a4809c9","confirmed_task":"I’m trying to buy a pair of Kobe Bryant sneakers for my son’s birthday, so I want a quick but trustworthy read on what’s coming out soon and what I could actually buy right now. Please start on Sole Collector and look for upcoming Kobe Bryant signature-line releases, pulling the shoe names and release dates from the actual release coverage pages if they’re listed, and leave the most relevant Sole Collector page open so I can glance at it myself. Then check Sneaker News for upcoming Kobe release dates and any useful details like colorways or launch context, and open the main article you used in its own tab so I can compare it side by side with Sole Collector. After that, go to Foot Locker and see whether they have any Kobe entries on their release calendar or product pages, and note whether anything looks upcoming versus currently available; if you find a relevant release or product page, keep that open too so I can visually verify it. Finally, go to Nike and search for Kobe shoes that are actually in stock right now, open any live purchasable product pages you find in separate tabs, and if there’s an available pair, select a common men’s size like 10 if the page allows it just so I can see that it’s really buyable. 
At the end, give me a concise summary that combines the upcoming release names and dates from Sole Collector, Sneaker News, and Foot Locker with source attribution, plus a note on any in-stock Kobe pairs you found on Nike and which product tabs you left open for me.","website":"https://www.google.com","level":"easy","reference_length":5,"categories":["Ecommerce & Shopping > Ecommerce and Shopping - Other","Sports > Sports - Other","Lifestyle > Fashion and Apparel"],"precomputed_rubric":{"items":[{"criterion":"Identify upcoming Kobe Bryant signature sneaker releases from Sole Collector with release names and dates when listed.","description":"Identify upcoming Kobe Bryant signature sneaker releases from Sole Collector with release names and dates when listed.\n\nHow a grader verifies this: Grader can confirm the open Sole Collector page shows Kobe release coverage and that the reported summary includes matching shoe names and dates from that page.","max_points":220},{"criterion":"Identify upcoming Kobe release dates and relevant details from Sneaker News and keep the source page open in its own tab.","description":"Identify upcoming Kobe release dates and relevant details from Sneaker News and keep the source page open in its own tab.\n\nHow a grader verifies this: Grader can verify a Sneaker News tab is open with Kobe release information and that the summary includes dates and details visible on that page.","max_points":220},{"criterion":"Check Foot Locker for Kobe release calendar entries or related product pages and determine whether items are upcoming, available, or unavailable.","description":"Check Foot Locker for Kobe release calendar entries or related product pages and determine whether items are upcoming, available, or unavailable.\n\nHow a grader verifies this: Grader can confirm an open Foot Locker page relevant to Kobe releases or products and that the reported availability status matches what is visible on the 
page.","max_points":180},{"criterion":"Find any currently in-stock Kobe shoes on Nike, open the live product pages in separate tabs, and attempt to select a common men's size like 10 when possible.","description":"Find any currently in-stock Kobe shoes on Nike, open the live product pages in separate tabs, and attempt to select a common men's size like 10 when possible.\n\nHow a grader verifies this: Grader can verify one or more Nike product tabs are open for Kobe shoes and that the page shows in-stock/purchasable state, including visible size selection if available.","max_points":230},{"criterion":"Provide a concise consolidated summary of upcoming Kobe release names and dates from Sole Collector, Sneaker News, and Foot Locker with source attribution, plus note any in-stock Nike pairs found and which tabs were left open.","description":"Provide a concise consolidated summary of upcoming Kobe release names and dates from Sole Collector, Sneaker News, and Foot Locker with source attribution, plus note any in-stock Nike pairs found and which tabs were left open.\n\nHow a grader verifies this: Grader can compare the final written summary against the open source pages and confirm source attribution, release-date consolidation, and mention of the Nike product pages left open.","max_points":150}]}} +{"task_id":"0ce94d4e773eff1042a6920232f929a1da98c44d","confirmed_task":"I’m trying to put together an all-black going-out outfit and want you to build it around a dress first so I can see whether the whole look feels cohesive. On Princess Polly, find a black short party dress in size Medium that’s actually available for delivery, open the product page so you can verify the size and delivery status, and leave that tab open because I want the dress to be the anchor piece. 
Then go to DemoniaCult and find a black Mary Jane shoe that would work with that dress, making sure size 9 or 9.5 is available on the actual product page, and open the best option in its own tab so I can compare the vibe side by side with the dress; please note exactly which of those sizes you found in stock. After that, go to Edikted and pick a dark brown leather oversized jacket without a hood to use as the outer layer, and open the actual product page so I can visually confirm it fits the look. At the end, send me a short outfit summary with the product names and links for all three items, include the dress and shoe prices, and mention the shoe size availability you found on DemoniaCult.","website":"https://www.google.com","level":"easy","reference_length":6,"categories":["Ecommerce & Shopping > Ecommerce and Shopping - Other","Lifestyle > Fashion and Apparel"],"precomputed_rubric":{"items":[{"criterion":"Identify one black short party dress on Princess Polly in size Medium that is available for delivery, and capture its product name, price, and link.","description":"Identify one black short party dress on Princess Polly in size Medium that is available for delivery, and capture its product name, price, and link.\n\nHow a grader verifies this: Grader can confirm a Princess Polly product page is open showing a black short party dress with Medium selected or available and delivery availability visible, along with the recorded name, price, and URL.","max_points":240},{"criterion":"Use the Princess Polly dress as the anchor item for the outfit and keep its product page open for reference.","description":"Use the Princess Polly dress as the anchor item for the outfit and keep its product page open for reference.\n\nHow a grader verifies this: Grader can confirm the dress page remains open in a tab and that later selections are described as pairing with that dress.","max_points":110},{"criterion":"Identify at least one black Mary Jane shoe on DemoniaCult that is 
available in size 9 or 9.5, and capture its product name, price, link, and which size is available.","description":"Identify at least one black Mary Jane shoe on DemoniaCult that is available in size 9 or 9.5, and capture its product name, price, link, and which size is available.\n\nHow a grader verifies this: Grader can confirm a DemoniaCult product page is open showing a black Mary Jane shoe with size 9 or 9.5 available, plus the recorded name, price, URL, and size availability.","max_points":240},{"criterion":"Keep the chosen DemoniaCult shoe open in its own tab as a pairing for the dress.","description":"Keep the chosen DemoniaCult shoe open in its own tab as a pairing for the dress.\n\nHow a grader verifies this: Grader can confirm the shoe product page remains open in a separate tab and is referenced as the selected pairing with the dress.","max_points":90},{"criterion":"Identify one dark brown leather oversized jacket without a hood on Edikted, and capture its product name and link from the actual product page.","description":"Identify one dark brown leather oversized jacket without a hood on Edikted, and capture its product name and link from the actual product page.\n\nHow a grader verifies this: Grader can confirm an Edikted product page is open showing a dark brown leather oversized jacket with no hood indicated visually or in the product details, along with the recorded name and URL.","max_points":170},{"criterion":"Provide a final outfit summary that includes all three selected items with product names and links, includes the dress and shoe prices, and explicitly states the shoe size availability found.","description":"Provide a final outfit summary that includes all three selected items with product names and links, includes the dress and shoe prices, and explicitly states the shoe size availability found.\n\nHow a grader verifies this: Grader can confirm the final response lists the Princess Polly dress, DemoniaCult shoe, and Edikted jacket with 
names and links, includes the dress and shoe prices, and clearly notes whether size 9 or 9.5 was available.","max_points":150}]}} +{"task_id":"3e106825aab3db868d3b94eb1bd594f9fd4a89be","confirmed_task":"I’m thinking about switching my work setup to a Chromebook, and before I do that I want to sanity-check whether my main tools will actually work. First, please go to Hubstaff’s official site and find their guidance about using the desktop app on a Chromebook or ChromeOS, because I need to know whether this would be a normal install or whether I’d have to use some Chrome extension or browser-based workaround instead. Open the actual Hubstaff help or support page that answers this and leave it open so I can look at the wording myself. If it turns out Chromebook use depends more on browser tools, then go to Brave’s official site and find the real Chromebook install page or instructions page for Brave so I have the official setup link ready; open that in its own tab too so I can compare both pages side by side. After that, head to Chrome’s official developer documentation and look up DevTools AI assistance, then summarize how I would get started with it and what data it uses, since that may matter more if I’m working mostly in the browser on a Chromebook. Please keep the official Chrome docs page open as proof. Finally, use Google to find a clear troubleshooting page for the annoying issue where Chrome keeps opening in Guest mode instead of my normal profile, and click through to the actual help page or forum post that gives step-by-step fixes so I have a recovery reference if this Chromebook browser setup gets weird. 
Leave that troubleshooting page open too, and then give me a clean summary of the Hubstaff Chromebook compatibility conclusion, the supported ChromeOS option Hubstaff mentions, the official Brave Chromebook install link, the DevTools AI getting-started and data-use summary, and the Guest mode fix steps you found.","website":"https://www.google.com","level":"easy","reference_length":7,"categories":["Computers Electronics and Technology > Computers Electronics and Technology - Other","Computers Electronics and Technology > Programming and Developer Software"],"precomputed_rubric":{"items":[{"criterion":"Correctly state Hubstaff’s official Chromebook compatibility conclusion for the desktop app.","description":"Correctly state Hubstaff’s official Chromebook compatibility conclusion for the desktop app.\n\nHow a grader verifies this: Grader can confirm the final answer matches the wording or meaning shown on an open Hubstaff help/support page about Chromebook or ChromeOS support.","max_points":240},{"criterion":"Identify the supported Chrome OS option Hubstaff describes for Chromebook users.","description":"Identify the supported Chrome OS option Hubstaff describes for Chromebook users.\n\nHow a grader verifies this: Grader verifies the supported option is visible on the Hubstaff page left open, such as extension-based or browser-based tracking guidance for ChromeOS.","max_points":160},{"criterion":"Provide the official Brave Chromebook install or instructions page URL from brave.com.","description":"Provide the official Brave Chromebook install or instructions page URL from brave.com.\n\nHow a grader verifies this: Grader confirms a Brave-owned page is open in its own tab and the returned URL points to the official Brave Chromebook installation/download instructions.","max_points":140},{"criterion":"Summarize how to get started with Chrome DevTools AI assistance using official Chrome documentation.","description":"Summarize how to get started with Chrome DevTools AI 
assistance using official Chrome documentation.\n\nHow a grader verifies this: Grader checks the open developer.chrome.com page and confirms the summary includes setup or enablement steps described there.","max_points":160},{"criterion":"Explain what data DevTools AI assistance uses according to official Chrome documentation.","description":"Explain what data DevTools AI assistance uses according to official Chrome documentation.\n\nHow a grader verifies this: Grader confirms the answer’s data-use description matches the official developer.chrome.com documentation left open.","max_points":120},{"criterion":"Provide a clear ordered troubleshooting sequence for fixing Chrome opening in Guest mode and restoring the normal profile, based on a source found through Google.","description":"Provide a clear ordered troubleshooting sequence for fixing Chrome opening in Guest mode and restoring the normal profile, based on a source found through Google.\n\nHow a grader verifies this: Grader confirms the final troubleshooting page is open from a Google result and that the returned steps reflect the source’s actionable sequence.","max_points":120},{"criterion":"Return all requested outputs together in one final response: Hubstaff conclusion, supported ChromeOS option, Brave link, DevTools AI summary, and Guest mode fix steps.","description":"Return all requested outputs together in one final response: Hubstaff conclusion, supported ChromeOS option, Brave link, DevTools AI summary, and Guest mode fix steps.\n\nHow a grader verifies this: Grader checks that the final response includes every requested component and that the referenced pages remain open as browser proof.","max_points":60}]}} +{"task_id":"4246dec196c9a3382b4224c7ec3a34a20be9f43f","confirmed_task":"I’m trying to put together a budget-friendly iPad Air M3 bundle without overpaying, so could you start on Target and look through the actual iPad Air M3 product pages to see which listed configuration is cheapest right 
now, including any sale price or visible discount text, and leave the cheapest product page open so I can look at the photos and storage/color details myself. Once you know which Target deal is the lowest, use that as the anchor for the bundle and go to Best Buy to find an Apple Magic Keyboard listing for the iPad Pro 13-inch that’s specifically open-box or refurbished and in Good condition, because I want a lower-cost keyboard option for a large Apple tablet setup; open that listing in its own tab so I can compare it side by side with the iPad. After that, check Amazon for a protective case for the iPad Air M3 that explicitly says it supports portrait-mode stand positioning, and open the actual product page so you can quote the wording that proves portrait support and I can verify the listing details on screen. To round things out, hop over to Slickdeals and see what the current featured top deal for iPads is, then tell me the deal title and price so I can decide whether it makes more sense to buy this bundle now or wait for a broader tech bargain. 
In the end, give me a short bundle summary anchored on the cheapest Target iPad Air M3 you found, with the keyboard, the portrait-capable case, and the Slickdeals iPad deal.","website":"https://www.google.com","level":"easy","reference_length":5,"categories":["Computers Electronics and Technology > Consumer Electronics","Ecommerce & Shopping > Ecommerce and Shopping - Other"],"precomputed_rubric":{"items":[{"criterion":"Identify the lowest current price shown among Target iPad Air M3 product pages, including any visible discount information and the exact configuration it applies to.","description":"Identify the lowest current price shown among Target iPad Air M3 product pages, including any visible discount information and the exact configuration it applies to.\n\nHow a grader verifies this: Grader confirms the reported price, discount text, and configuration match the cheapest visible Target iPad Air M3 product page left open.","max_points":320},{"criterion":"Provide one Best Buy listing for an Apple Magic Keyboard compatible with iPad Pro 13-inch that is open-box or refurbished and specifically in Good condition, along with its current price.","description":"Provide one Best Buy listing for an Apple Magic Keyboard compatible with iPad Pro 13-inch that is open-box or refurbished and specifically in Good condition, along with its current price.\n\nHow a grader verifies this: Grader confirms the Best Buy tab shows a qualifying listing with compatibility for iPad Pro 13-inch, condition marked Good, and the reported price.","max_points":240},{"criterion":"Provide one Amazon protective case for iPad Air (M3), include the product name, and quote listing text that explicitly confirms portrait-mode stand positioning support.","description":"Provide one Amazon protective case for iPad Air (M3), include the product name, and quote listing text that explicitly confirms portrait-mode stand positioning support.\n\nHow a grader verifies this: Grader confirms the Amazon product 
page is open and contains the quoted text explicitly indicating portrait-mode stand support.","max_points":240},{"criterion":"Report the current featured top deal on Slickdeals for iPads, including its title and displayed price.","description":"Report the current featured top deal on Slickdeals for iPads, including its title and displayed price.\n\nHow a grader verifies this: Grader confirms the reported deal title and price match the currently featured top deal for iPads visible on Slickdeals.","max_points":100},{"criterion":"Present the final answer as a budget-oriented bundle summary anchored on the cheapest Target iPad Air M3 deal, incorporating the keyboard option, portrait-capable case, and Slickdeals reference.","description":"Present the final answer as a budget-oriented bundle summary anchored on the cheapest Target iPad Air M3 deal, incorporating the keyboard option, portrait-capable case, and Slickdeals reference.\n\nHow a grader verifies this: Grader confirms the final summary clearly uses the cheapest Target iPad Air M3 as the bundle anchor and includes all required components in a budget comparison framing.","max_points":100}]}} +{"task_id":"b6b8ad71aa3112840790066d7d62b498babdfa5c","confirmed_task":"I’m trying to decide whether driving this week is a bad idea, so can you build me a quick weather risk snapshot that starts with what it feels like right now and then widens out to the bigger trouble spots? First, on Google, search for Baltimore, Maryland weather and grab the current temperature plus the plain-English condition like cloudy, sunny, rain, or whatever it says, just so I have a baseline for home conditions. Then go to Wunderground and look up Syracuse, New York, and check the 10-day/7-day style forecast to find the lowest temperature expected over the next 7 days, including which day it happens, because I want to compare that colder destination against Baltimore. 
After that, use the National Weather Service forecast page for the Rittman, Ohio area near Marshallville and tell me what the current forecast says and whether there are any active alerts posted there, since that would really affect an Ohio leg of the drive; please open the actual forecast page and leave it visible so I can see the alert area and forecast text myself. Finally, go to the NWS Mount Holly page, find the winter forecast graphic, and report the snowfall amount shown there so I can tell whether the Mid-Atlantic part looks like a nuisance event or something more serious; if the graphic opens separately, leave that tab open too so I can look at the map. In the end, send me a short location-by-location summary with the key weather risk for Baltimore, Syracuse, Rittman/Marshallville, and the Mount Holly region.","website":"https://www.google.com","level":"easy","reference_length":6,"categories":["Science and Education > Weather","Travel and Tourism > Travel and Tourism - Other"],"precomputed_rubric":{"items":[{"criterion":"Report Baltimore, Maryland’s current temperature and current weather conditions from Google.","description":"Report Baltimore, Maryland’s current temperature and current weather conditions from Google.\n\nHow a grader verifies this: Grader can confirm the answer against the Google weather module showing Baltimore weather with a numeric temperature and condition label.","max_points":180},{"criterion":"Report the lowest forecasted temperature in Syracuse, New York over the next 7 days from Wunderground, including the day it occurs.","description":"Report the lowest forecasted temperature in Syracuse, New York over the next 7 days from Wunderground, including the day it occurs.\n\nHow a grader verifies this: Grader can verify the selected low and day on the Syracuse forecast page in Wunderground’s multi-day forecast view.","max_points":220},{"criterion":"Report the current National Weather Service forecast for the Rittman, Ohio area near 
Marshallville.","description":"Report the current National Weather Service forecast for the Rittman, Ohio area near Marshallville.\n\nHow a grader verifies this: Grader can confirm the forecast wording on the forecast.weather.gov page for the specified area.","max_points":180},{"criterion":"State whether any active weather alerts are posted for the Rittman, Ohio area near Marshallville.","description":"State whether any active weather alerts are posted for the Rittman, Ohio area near Marshallville.\n\nHow a grader verifies this: Grader can verify the presence or absence of alert banners, watches, warnings, or advisories on the same NWS forecast page left open by the agent.","max_points":140},{"criterion":"Report the predicted snowfall amount shown in the winter forecast graphic on the NWS Mount Holly page.","description":"Report the predicted snowfall amount shown in the winter forecast graphic on the NWS Mount Holly page.\n\nHow a grader verifies this: Grader can confirm the snowfall amount directly from the winter forecast graphic or image tab left open from the Mount Holly page.","max_points":180},{"criterion":"Return the findings as a short summary organized by location or region, with a brief key weather risk for each one.","description":"Return the findings as a short summary organized by location or region, with a brief key weather risk for each one.\n\nHow a grader verifies this: Grader can check that the final response includes Baltimore, Syracuse, Rittman/Marshallville, and the Mount Holly region, each with the requested weather detail and a concise risk statement.","max_points":100}]}} +{"task_id":"b1bd700090c23df9e9f6b7b9557ac418df602b8d","confirmed_task":"I’m trying to put together a realistic poetry submission plan for this month, so could you help me look up a couple of places I might actually submit to and then pair that with some funding opportunities? 
Start on Google and find the American Poetry Journal submission guidelines, then open the actual guidelines page and tell me how they want submissions sent, whether there’s a fee, and if they mention a reading period, because I want to know if it’s something I can act on right away. After that, still using Google, find the online submissions page for Pidgeonholes and open the direct submissions page itself in a separate tab so I can compare the two options side by side; please give me the exact submission URL and leave that tab open. Once you’ve got those two submission outlets, go to Poets & Writers and find at least three writing contests, awards, or grants that could help support my submission plan, and for each one note the opportunity name and the application deadline, or clearly say if no deadline is listed. In the end, send me a short summary with the American Poetry Journal submission method, fee, and any reading period you found, the direct Pidgeonholes submissions link, and the three Poets & Writers opportunities with their deadlines.","website":"https://www.google.com","level":"easy","reference_length":4,"categories":["Arts & Entertainment > Books and Literature","Finance > Finance - Other"],"precomputed_rubric":{"items":[{"criterion":"Identify the American Poetry Journal submission guidelines and report the required submission method, any stated fee, and any reading period information if present.","description":"Identify the American Poetry Journal submission guidelines and report the required submission method, any stated fee, and any reading period information if present.\n\nHow a grader verifies this: Grader can confirm the agent opened the actual American Poetry Journal guidelines page and the final response includes the submission method plus fee and reading period details or a clear note if not stated.","max_points":350},{"criterion":"Provide the direct online submissions URL for Pidgeonholes and keep the submissions page open in its own 
tab.","description":"Provide the direct online submissions URL for Pidgeonholes and keep the submissions page open in its own tab.\n\nHow a grader verifies this: Grader can confirm a live Pidgeonholes submissions page is open in a separate browser tab and the exact URL is included in the final response.","max_points":200},{"criterion":"List at least three writing contests, awards, or grants from Poets & Writers, each with the opportunity name and application deadline, or explicitly note if no deadline is listed.","description":"List at least three writing contests, awards, or grants from Poets & Writers, each with the opportunity name and application deadline, or explicitly note if no deadline is listed.\n\nHow a grader verifies this: Grader can confirm the opportunities on Poets & Writers pages and check that the final response includes three names with corresponding deadlines or clear no-deadline notes.","max_points":300},{"criterion":"Deliver a concise final poetry submission plan summary combining the American Poetry Journal details, the direct Pidgeonholes submissions link, and the three Poets & Writers opportunities with deadlines.","description":"Deliver a concise final poetry submission plan summary combining the American Poetry Journal details, the direct Pidgeonholes submissions link, and the three Poets & Writers opportunities with deadlines.\n\nHow a grader verifies this: Grader can verify the final answer consolidates all required findings into one concise summary without omitting any requested fields.","max_points":150}]}} +{"task_id":"fb3f6eb23fad9b18c6c612d213d32ea40d891092","confirmed_task":"I’m thinking about signing up for a couple of online research-study platforms, but before I hand over my info I want a practical sense of what the participant experience is actually like. 
Please start on Google and look up Respondent.io, then open the actual Respondent site and any clearly relevant public help or participant pages so you can tell me, in plain English, how it works for participants — especially how someone signs up, builds a profile, finds or qualifies for studies, and how payment is handled. Keep the most useful Respondent page open so I can glance at it later. Then go to Terac’s site and figure out how that platform works for participants too, with extra attention to how someone gets set up to take part in studies, because I want to compare whether Terac’s onboarding feels simpler or more involved than Respondent’s. If you find a page that explains joining or participation, leave that open in its own tab as proof. After that, switch over to SurveyMonkey and open the screener page and verify it loads with visible survey questions so I can see what an actual participant flow feels like end to end, and leave that final page visible. When you’re done, give me a short comparison of Respondent versus Terac, say clearly whether Terac seems easier or more involved to get started with, and confirm that the SurveyMonkey screener page loaded with visible survey questions.","website":"https://www.google.com","level":"easy","reference_length":4,"categories":["Business and Consumer Services > Business Services","Jobs and Career > Jobs and Employment"],"precomputed_rubric":{"items":[{"criterion":"Summarize how Respondent.io works for participants, including signup/profile creation, how users find or qualify for studies, and how payment is handled, based on publicly available information from official or clearly relevant pages.","description":"Summarize how Respondent.io works for participants, including signup/profile creation, how users find or qualify for studies, and how payment is handled, based on publicly available information from official or clearly relevant pages.\n\nHow a grader verifies this: Grader confirms the final response includes 
all four elements and that a relevant Respondent page is open or was visited from Google search results.","max_points":300},{"criterion":"Summarize how Terac works and specifically explain how participants get set up to take part in studies using information from Terac’s public site.","description":"Summarize how Terac works and specifically explain how participants get set up to take part in studies using information from Terac’s public site.\n\nHow a grader verifies this: Grader confirms the final response describes Terac’s platform purpose and participant setup flow, and that a relevant Terac page explaining participation or joining is open or was visited.","max_points":250},{"criterion":"Directly compare Terac’s onboarding/setup with Respondent’s baseline and state clearly whether Terac appears easier or more involved for participants.","description":"Directly compare Terac’s onboarding/setup with Respondent’s baseline and state clearly whether Terac appears easier or more involved for participants.\n\nHow a grader verifies this: Grader confirms the final response contains an explicit comparison and a clear easier/more involved judgment grounded in the two platform summaries.","max_points":200},{"criterion":"Open the SurveyMonkey screener page and verify it loads with visible survey questions, leaving the page visible as proof.","description":"Open the SurveyMonkey screener page and verify it loads with visible survey questions, leaving the page visible as proof.\n\nHow a grader verifies this: Grader can confirm the SurveyMonkey screener page was opened and that visible survey questions or screener content are displayed on the page.","max_points":250}]}} +{"task_id":"69782bfcfdb3311496bc9048bf66915b33e692cd","confirmed_task":"I’m trying to pick a Pilates place in the Fresno/Clovis area and want something concrete I can actually compare on screen, not just a vague list. 
Please start on Google and find at least two Pilates studios in Clovis, California that clearly offer classes, then open each studio’s actual schedule or booking page in its own tab and leave those tabs open so I can look at the class calendars myself later. Once you’ve got those Clovis options, broaden it into a short Fresno-area comparison by finding at least three Pilates class options around Fresno or Clovis, with each studio’s real schedule or booking link, because I want to see what nearby choices exist if the Clovis spots don’t fit my schedule. After that, go to Title 29 Fitness’s website and figure out what it offers in Fresno, especially anything relevant to Pilates or group fitness, and capture the class schedule details shown there; if there’s a schedule page or booking flow, open that too so I have visual proof it’s current. In the end, give me one concise comparison that includes the two Clovis studios, the broader Fresno-area list, and where Title 29 seems to fit among them based on what you actually saw in the browser.","website":"https://www.google.com","level":"easy","reference_length":4,"categories":["Health > Nutrition Diets and Fitness","Hobbies and Leisure > Hobbies and Leisure - Other"],"precomputed_rubric":{"items":[{"criterion":"Identify at least two Clovis, California Pilates studios that offer classes and provide each studio’s name plus a direct class schedule or booking page link.","description":"Identify at least two Clovis, California Pilates studios that offer classes and provide each studio’s name plus a direct class schedule or booking page link.\n\nHow a grader verifies this: Grader can confirm two distinct Clovis studios are listed and that their schedule or booking pages are open in separate tabs or directly referenced with valid links.","max_points":300},{"criterion":"Provide a broader Fresno-area list of at least three Pilates class options, each with the studio name and a direct schedule or booking page 
link.","description":"Provide a broader Fresno-area list of at least three Pilates class options, each with the studio name and a direct schedule or booking page link.\n\nHow a grader verifies this: Grader can verify at least three Fresno-area options are named and each includes a schedule or booking link visible from the opened pages or final response.","max_points":300},{"criterion":"Look up Title 29 Fitness in Fresno and capture what it offers along with the class schedule details shown on its website.","description":"Look up Title 29 Fitness in Fresno and capture what it offers along with the class schedule details shown on its website.\n\nHow a grader verifies this: Grader can confirm the response includes offerings described from title29fitness.com and schedule details taken from the visible site pages or booking flow.","max_points":200},{"criterion":"End with one concise comparison that includes the Clovis options, the wider Fresno-area list, and an explanation of where Title 29 Fitness fits among them.","description":"End with one concise comparison that includes the Clovis options, the wider Fresno-area list, and an explanation of where Title 29 Fitness fits among them.\n\nHow a grader verifies this: Grader can verify the final response contains a brief synthesis comparing all gathered options and explicitly situating Title 29 relative to the Clovis and Fresno-area choices.","max_points":200}]}} +{"task_id":"67ad95421a303ad78cfcd8c3f5a7f6668d2c6a75","confirmed_task":"I’m helping a family member shop for an affordable used Toyota and want a realistic comparison across a couple of car sites before we decide what to pursue. 
On Edmunds, please search around Augusta, Georgia and find the cheapest used Toyota RAV4 listing that has both AWD and heated seats, then open the actual vehicle listing so you can confirm those features on the page and note the price, year, mileage, dealer or seller, and anything else basic that stands out; leave that listing open in its own tab so I can look at the photos and details later. Then, using that RAV4 as the benchmark for what the market looks like, go to Cars.com and search for any used Toyota within 50 miles of Augusta, GA priced at $10,000 or less, and open one matching listing that seems like a good budget reference so I can see what a lower-cost Toyota option looks like on another marketplace; keep that listing open too so I can compare the two tabs side by side. After that, on CarGurus, pull up the comparison details for the Toyota Camry XLE AWD and the Mazda3 Turbo Hatchback and capture the key specs for each — engine, horsepower, drivetrain, fuel economy, and MSRP — because I want to know whether sticking with Toyota’s AWD choices makes more sense than considering a non-Toyota AWD alternative. 
In the end, send me a concise summary with the Edmunds RAV4 listing details, the Cars.com budget Toyota listing details, and the side-by-side spec comparison.","website":"https://www.google.com","level":"easy","reference_length":5,"categories":["Vehicles > Makes and Models","Ecommerce & Shopping > Ecommerce and Shopping - Other"],"precomputed_rubric":{"items":[{"criterion":"Find the lowest-priced used Toyota RAV4 on Edmunds near Augusta, GA that includes both AWD and heated seats, and open the actual listing page.","description":"Find the lowest-priced used Toyota RAV4 on Edmunds near Augusta, GA that includes both AWD and heated seats, and open the actual listing page.\n\nHow a grader verifies this: Grader can confirm an Edmunds vehicle detail page is open for a used Toyota RAV4 near Augusta with AWD and heated seats visible in the listing details or features, and that it is the lowest-priced qualifying result found.","max_points":350},{"criterion":"Capture the Edmunds RAV4 listing’s basic details including price, year, mileage, and dealer or seller information.","description":"Capture the Edmunds RAV4 listing’s basic details including price, year, mileage, and dealer or seller information.\n\nHow a grader verifies this: Grader can verify the reported Edmunds details against the open listing page fields for price, model year, mileage, and dealer or seller name.","max_points":200},{"criterion":"On Cars.com, find one used Toyota within 50 miles of Augusta, GA priced at $10,000 or less and open the actual listing page.","description":"On Cars.com, find one used Toyota within 50 miles of Augusta, GA priced at $10,000 or less and open the actual listing page.\n\nHow a grader verifies this: Grader can confirm a Cars.com vehicle listing page is open and that the listing meets the Toyota, distance, and price constraints shown in the search or listing context.","max_points":200},{"criterion":"On CarGurus, provide a side-by-side comparison of the Toyota Camry XLE AWD and 
Mazda3 Turbo Hatchback covering engine, horsepower, drivetrain, fuel economy, and MSRP.","description":"On CarGurus, provide a side-by-side comparison of the Toyota Camry XLE AWD and Mazda3 Turbo Hatchback covering engine, horsepower, drivetrain, fuel economy, and MSRP.\n\nHow a grader verifies this: Grader can verify the extracted specs against the CarGurus comparison or model pages for both vehicles.","max_points":150},{"criterion":"Return a concise final summary that includes the Edmunds RAV4 listing, the Cars.com budget Toyota listing, and the Camry XLE AWD versus Mazda3 Turbo Hatchback spec comparison.","description":"Return a concise final summary that includes the Edmunds RAV4 listing, the Cars.com budget Toyota listing, and the Camry XLE AWD versus Mazda3 Turbo Hatchback spec comparison.\n\nHow a grader verifies this: Grader can confirm the final response includes all three required components with the relevant details from the prior steps.","max_points":100}]}} +{"task_id":"5632294e494e6e86eb94739235bfa2373b868868","confirmed_task":"I’m getting ready to do a full DIY front and rear brake job on my 2020 Chevrolet Traverse with the 3.6L V6, so could you use RockAuto to look up OEM-style front and rear brake pad and rotor kits for that exact vehicle and jot down the key details like brand, part line, what each kit includes, and anything that helps me tell the options apart? If there are separate front and rear kits, open the actual product pages or info popups so I can visually compare them, and leave the most relevant RockAuto results open in their own tabs. Once you’ve got the parts figured out, go to the Haynes US site and find the repair manual that would actually help with brake pad and rotor replacement for this Traverse, and grab the manual title, link, and coverage years so I know it matches my SUV. 
Then check Amazon for a hose clamp tightening tool that looks suitable to keep nearby for the job, give me one solid option with its current price, and open the product page so I can see the photos and reviews for myself. In the end, send me a short summary with the RockAuto brake kit details, the Haynes manual name and link, and the Amazon tool name and price.","website":"https://www.google.com","level":"easy","reference_length":5,"categories":["Vehicles > Makes and Models","Ecommerce & Shopping > Ecommerce and Shopping - Other"],"precomputed_rubric":{"items":[{"criterion":"RockAuto is used to identify OEM-style front and rear brake pad and rotor kit options for a 2020 Chevrolet Traverse 3.6L V6, with key product details captured for the available options.","description":"RockAuto is used to identify OEM-style front and rear brake pad and rotor kit options for a 2020 Chevrolet Traverse 3.6L V6, with key product details captured for the available options.\n\nHow a grader verifies this: Grader can confirm the RockAuto vehicle selection and brake kit listings, plus visible product/info pages or tabs showing front and rear kit details such as brand, line, and included parts.","max_points":400},{"criterion":"Relevant RockAuto front and rear kit pages or info views are opened and left available for visual comparison.","description":"Relevant RockAuto front and rear kit pages or info views are opened and left available for visual comparison.\n\nHow a grader verifies this: Grader can confirm multiple RockAuto tabs, product pages, or info popups remain open showing the chosen front and rear kit options.","max_points":100},{"criterion":"A Haynes manual suitable for a 2020 Chevrolet Traverse brake pad and rotor replacement is found, with the manual title, link, and coverage years recorded.","description":"A Haynes manual suitable for a 2020 Chevrolet Traverse brake pad and rotor replacement is found, with the manual title, link, and coverage years recorded.\n\nHow a grader 
verifies this: Grader can confirm the Haynes manual page is open and visibly shows the manual title and coverage information matching the Traverse.","max_points":200},{"criterion":"One suitable hose clamp tightening tool is found on Amazon, with its product name and current price recorded.","description":"One suitable hose clamp tightening tool is found on Amazon, with its product name and current price recorded.\n\nHow a grader verifies this: Grader can confirm the Amazon product page is open and shows the selected tool name and visible price.","max_points":150},{"criterion":"The final response provides a short combined summary of the RockAuto brake kit details, the Haynes manual name and link with coverage years, and the Amazon tool name and price.","description":"The final response provides a short combined summary of the RockAuto brake kit details, the Haynes manual name and link with coverage years, and the Amazon tool name and price.\n\nHow a grader verifies this: Grader can compare the final written summary against the information visible on the RockAuto, Haynes, and Amazon pages.","max_points":150}]}} +{"task_id":"eb8a01554a84cf5d16a84a766d0f6cfb55d33c81","confirmed_task":"I’m planning a holiday party in Minneapolis and want a quick shortlist of bar caterers that actually look local and usable. Please start on The Knot and look specifically for Minneapolis-area wine or liquor bar service caterers, then open the actual vendor listings in separate tabs so I can compare them visually, and pull together at least five options with each business name and city/state. If Surdyk’s shows up in that shortlist, go to Surdyk’s Catering and look through their site to see whether they feel more full-service than the others, then summarize at least three catering services or service types they offer, including any food and beverage options you can verify on the page, and leave the Surdyk’s page open so I can glance at it later. 
After that, because I may want a nonalcoholic menu item to pair with whichever caterer seems best, use Google to find a pressure-cooker or Instant Pot mulligatawny soup recipe, open the actual recipe page, and give me the ingredient list plus the basic cooking steps from that one recipe. Please return everything as one concise planning summary so I can compare the caterers, any Surdyk’s details if relevant, and the soup idea all in one place.","website":"https://www.google.com","level":"easy","reference_length":4,"categories":["Food and Drink > Beverages","Community and Society > Holidays and Seasonal Events"],"precomputed_rubric":{"items":[{"criterion":"Provide at least five Minneapolis-area wine or liquor bar service caterers sourced from The Knot, each with business name and location in city/state format.","description":"Provide at least five Minneapolis-area wine or liquor bar service caterers sourced from The Knot, each with business name and location in city/state format.\n\nHow a grader verifies this: Grader confirms at least five vendor names and matching city/state details from The Knot listings, with evidence that vendor pages were opened or reviewed in browser tabs.","max_points":400},{"criterion":"If Surdyk’s is included in the The Knot shortlist, summarize at least three Surdyk’s Catering full-service offerings or service types from surdykscatering.com, including food and/or beverage offerings.","description":"If Surdyk’s is included in the The Knot shortlist, summarize at least three Surdyk’s Catering full-service offerings or service types from surdykscatering.com, including food and/or beverage offerings.\n\nHow a grader verifies this: Grader confirms Surdyk’s appeared in the shortlist and that the summary includes three verified offerings visible on Surdyk’s site; Surdyk’s page remains open as browser proof.","max_points":250},{"criterion":"Provide one pressure-cooker or Instant Pot mulligatawny soup recipe found via Google, including the 
ingredient list and basic cooking steps from a single recipe page.","description":"Provide one pressure-cooker or Instant Pot mulligatawny soup recipe found via Google, including the ingredient list and basic cooking steps from a single recipe page.\n\nHow a grader verifies this: Grader confirms Google was used to reach a recipe page and that the final response includes ingredients and basic steps consistent with one visible recipe source.","max_points":200},{"criterion":"Return all requested information as one concise planning summary combining the caterer shortlist, any applicable Surdyk’s comparison details, and the soup recipe information.","description":"Return all requested information as one concise planning summary combining the caterer shortlist, any applicable Surdyk’s comparison details, and the soup recipe information.\n\nHow a grader verifies this: Grader checks that the response is a single concise summary covering all required outputs without omitting any applicable section.","max_points":150}]}} +{"task_id":"ada70bfe81da1cd33bec47d79a9d279d7734a686","confirmed_task":"I’m trying to plan a Yorkshire family day out around Christmas and want to compare two festive options properly before I recommend one. Could you start on the official Stockeld Park site and open their Winter or Christmas ticket page, then the main activities page if needed, and pull together exactly what’s included with a standard festive ticket so I can see whether it feels like a full day out for kids; if there are photos or activity sections on the page, open the main ticket page and the activities/details page in separate tabs and leave them open so I can glance at them later. 
Then use Google to find the current Harrogate Christmas Funland page or official event listing, open the actual event page, and summarize what’s included there too, along with the location and the event dates, because I want to compare whether it sounds more substantial than Stockeld Park; if you find more than one relevant result, open the most official-looking listing in its own tab and verify it’s live before using it. After that, go to the official York Maze site and pull together the practical visitor details for using it as a backup daytime activity nearby, especially where it is, the opening times or seasonal opening info, and any important ticket or visit-planning notes like booking ahead, age guidance, or whether it’s seasonal, and leave the key visitor info page open as well. In the end, give me a concise side-by-side comparison of Stockeld Park versus Harrogate Christmas Funland, then a short recommendation on which festive option seems better for a family day out and whether York Maze sounds like a realistic backup plan.","website":"https://www.google.com","level":"easy","reference_length":6,"categories":["Travel and Tourism > Tourist Attractions","Community and Society > Holidays and Seasonal Events"],"precomputed_rubric":{"items":[{"criterion":"Accurately summarize the activities included with Stockeld Park Winter/Christmas tickets using the official Stockeld Park pages.","description":"Accurately summarize the activities included with Stockeld Park Winter/Christmas tickets using the official Stockeld Park pages.\n\nHow a grader verifies this: Grader can confirm the summary matches visible included activities on the open Stockeld Park ticket/details tabs.","max_points":240},{"criterion":"Find and use a live, relevant Harrogate Christmas Funland page or official listing and summarize what is included in the experience.","description":"Find and use a live, relevant Harrogate Christmas Funland page or official listing and summarize what is included in 
the experience.\n\nHow a grader verifies this: Grader can see an open Harrogate Christmas Funland page reached via Google and verify the included features against the visible listing content.","max_points":200},{"criterion":"Include Harrogate Christmas Funland’s event location and event dates.","description":"Include Harrogate Christmas Funland’s event location and event dates.\n\nHow a grader verifies this: Grader can verify the location and dates directly on the open event/listing page.","max_points":140},{"criterion":"Summarize York Maze visitor details including where it is, opening times or seasonal opening information, and important ticket or visit-planning notes from the official York Maze site.","description":"Summarize York Maze visitor details including where it is, opening times or seasonal opening information, and important ticket or visit-planning notes from the official York Maze site.\n\nHow a grader verifies this: Grader can confirm these details on the open York Maze visitor information page.","max_points":200},{"criterion":"Present Stockeld Park and Harrogate Christmas Funland as a concise side-by-side comparison focused on what is included and overall suitability for a family festive outing.","description":"Present Stockeld Park and Harrogate Christmas Funland as a concise side-by-side comparison focused on what is included and overall suitability for a family festive outing.\n\nHow a grader verifies this: Final response clearly compares both attractions using findings from Steps 1 and 2 rather than listing them separately.","max_points":120},{"criterion":"Provide a short recommendation on which festive attraction seems better and whether York Maze works as a backup daytime activity.","description":"Provide a short recommendation on which festive attraction seems better and whether York Maze works as a backup daytime activity.\n\nHow a grader verifies this: Final response includes a reasoned recommendation grounded in the gathered details from 
all three sites.","max_points":100}]}} +{"task_id":"140960bb7293bdeeb6bcc60931681cb9b815351b","confirmed_task":"I'm trying to plan a really simple errand loop around Chapel Hill and Carrboro, so could you start on Google and find at least three public little free pantries or community food boxes in or very close to Chapel Hill/Carrboro, then open the actual map or listing pages for each one in separate tabs so I can visually confirm they're real places and still look active. Once you have those, go to Publix and check the current weekly BOGO deals, and from that ad pick the deals that would work for a carnivore-style dinner, meaning meat, seafood, cheese, eggs, or other animal-based items only, because I want to turn the pantry run into a quick grocery stop too. From those BOGO items, choose one specific dinner pairing made only from the qualifying deals and leave the weekly ad or product pages open so I can look at the prices and packaging myself. After that, go to Bisou Bisou's site and find one cocktail on the menu that is actually green, then open the menu page and tell me the drink name and ingredients so I have an optional treat stop after the errands. 
Please give me the pantry locations with addresses or clear location descriptions, the exact Publix BOGO items you used and the dinner pairing, plus the green cocktail name and ingredients.","website":"https://www.google.com","level":"easy","reference_length":4,"categories":["Community and Society > Philanthropy","Reference Materials > Maps"],"precomputed_rubric":{"items":[{"criterion":"Find at least three public food box or mini pantry locations in or near Chapel Hill/Carrboro and provide an address or clear location description for each.","description":"Find at least three public food box or mini pantry locations in or near Chapel Hill/Carrboro and provide an address or clear location description for each.\n\nHow a grader verifies this: Grader can confirm three distinct pantry or food box locations from Google results or map/listing pages, with separate tabs or visible listing details showing each location.","max_points":350},{"criterion":"Identify current Publix BOGO items that fit a carnivore diet.","description":"Identify current Publix BOGO items that fit a carnivore diet.\n\nHow a grader verifies this: Grader can confirm the listed items appear in the current Publix weekly BOGO ad or product pages and that the items are animal-based foods such as meat, seafood, cheese, eggs, or similar.","max_points":250},{"criterion":"Propose one dinner pairing made only from the carnivore-diet-friendly Publix BOGO items identified.","description":"Propose one dinner pairing made only from the carnivore-diet-friendly Publix BOGO items identified.\n\nHow a grader verifies this: Grader can verify that every component of the proposed dinner pairing comes directly from the qualifying BOGO items found in Step 2.","max_points":200},{"criterion":"Identify one green cocktail from Bisou Bisou's cocktail menu and provide its name and ingredients.","description":"Identify one green cocktail from Bisou Bisou's cocktail menu and provide its name and ingredients.\n\nHow a grader verifies 
this: Grader can confirm the cocktail appears on the Bisou Bisou menu page and that the response includes the exact drink name and ingredient list from the site.","max_points":200}]}} +{"task_id":"10548585c3214aa1a15f7ceef8aa4fde0c2fcdf7","confirmed_task":"I’m putting together a quick set of Chromebook help notes for someone who keeps asking me whether they can use Firefox instead of Chrome and whether they can still get to their saved Apple passwords when they’re in Chrome, so could you check a few things in a real browser for me? Start on Mozilla’s official Firefox site and open the actual Chromebook or ChromeOS instructions so you can confirm whether Firefox can really be installed there and what Mozilla says the install method is; leave that page open because I want the official wording as a reference. Then go to the Chrome Web Store and find the official iCloud Passwords extension from Apple, open the actual listing page, and verify from the page itself that it’s for accessing iCloud passwords in Chrome; keep that tab open too so I can see the publisher and listing URL. Since these notes should also cover a Google Docs issue they run into all the time, use Google Search to look up reliable troubleshooting for images in Google Docs that show an exclamation mark or refuse to load, then open a useful result and pull out the recommended fixes. After that, open the UserTesting contributor sign-in page so I can confirm a normal login screen is reachable for a site where saved credentials might matter, then in another tab open Patreon’s homepage just to verify ordinary browsing works there too and leave both tabs open so I can compare them. Finally, go to YouTube, open the video titled \"Youtube Rewind 2011,\" start playback, and tell me what you see in the first moments so I know media playback works in the browser. 
At the end, give me a concise summary of what you confirmed on each site, including the Firefox Chromebook answer, the iCloud Passwords extension name and listing URL, the Google Docs image fixes, and whether UserTesting, Patreon, and YouTube all behaved normally.","website":"https://www.google.com","level":"easy","reference_length":7,"categories":["Computers Electronics and Technology > Computers Electronics and Technology - Other"],"precomputed_rubric":{"items":[{"criterion":"The Mozilla Firefox Chromebook/ChromeOS instructions page is opened and the agent correctly confirms whether Firefox can be installed on Chromebook, including Mozilla’s stated install/download method.","description":"The Mozilla Firefox Chromebook/ChromeOS instructions page is opened and the agent correctly confirms whether Firefox can be installed on Chromebook, including Mozilla’s stated install/download method.\n\nHow a grader verifies this: Grader can see an official Mozilla Firefox/Support page about Chromebook or ChromeOS open, and the final answer states the installability outcome plus the method described on that page.","max_points":200},{"criterion":"The official Apple iCloud Passwords extension listing is opened in the Chrome Web Store, and the agent records the exact extension name and listing URL while confirming its purpose is to access iCloud passwords in Chrome.","description":"The official Apple iCloud Passwords extension listing is opened in the Chrome Web Store, and the agent records the exact extension name and listing URL while confirming its purpose is to access iCloud passwords in Chrome.\n\nHow a grader verifies this: Grader can see the Chrome Web Store listing page with Apple as publisher or official branding, and the final answer includes the extension name, URL, and purpose.","max_points":200},{"criterion":"The agent finds troubleshooting guidance for Google Docs images showing an exclamation mark or not loading and summarizes the recommended fixes from a relevant 
opened result.","description":"The agent finds troubleshooting guidance for Google Docs images showing an exclamation mark or not loading and summarizes the recommended fixes from a relevant opened result.\n\nHow a grader verifies this: Grader can see Google Search results and/or an opened troubleshooting page, and the final answer includes concrete recommended fixes rather than a vague statement.","max_points":180},{"criterion":"The UserTesting contributor sign-in page is opened and the sign-in screen is confirmed as reachable.","description":"The UserTesting contributor sign-in page is opened and the sign-in screen is confirmed as reachable.\n\nHow a grader verifies this: Grader can see a UserTesting sign-in page with login fields or contributor sign-in UI visible, and the final answer explicitly confirms access.","max_points":100},{"criterion":"Patreon’s homepage is opened in its own tab and normal homepage access is confirmed from visible branding or title.","description":"Patreon’s homepage is opened in its own tab and normal homepage access is confirmed from visible branding or title.\n\nHow a grader verifies this: Grader can see Patreon homepage branding or title in the open tab, and the final answer confirms homepage access worked.","max_points":100},{"criterion":"The YouTube video titled \"Youtube Rewind 2011\" is opened and playback is started successfully.","description":"The YouTube video titled \"Youtube Rewind 2011\" is opened and playback is started successfully.\n\nHow a grader verifies this: Grader can see the YouTube watch page with the specified title and a playing state or progressed timestamp, and the final answer describes the first visible moments.","max_points":100},{"criterion":"A concise final summary covers all six sites and includes the Firefox Chromebook conclusion, iCloud Passwords extension details, Google Docs image troubleshooting, UserTesting sign-in confirmation, Patreon homepage confirmation, and YouTube playback 
confirmation.","description":"A concise final summary covers all six sites and includes the Firefox Chromebook conclusion, iCloud Passwords extension details, Google Docs image troubleshooting, UserTesting sign-in confirmation, Patreon homepage confirmation, and YouTube playback confirmation.\n\nHow a grader verifies this: Grader checks the final response for all required site-specific findings with no major omissions.","max_points":120}]}} +{"task_id":"fbcfa176b2e1aa42200d4f3adb66dcf0a6ca62ee","confirmed_task":"I’m trying to put together a very small monthly subscription budget and want to compare a couple of creator memberships against ChatGPT so I can see what actually fits. Please start on Patreon with Matt and Shane’s Secret Podcast and note the membership tier names shown on their page, then also open the Patreon pages for Matt and Shane’s Secret Podcast and Chris Sain in separate tabs so I can visually compare the available tier names and prices side by side. After that, go to ChatGPT’s pricing page on chatgpt.com and capture the current plan names and prices, because I want to know whether adding ChatGPT would still be realistic alongside just one creator membership. To round things out, use Google to get to the actual HellHades membership plans page and check whether any plan specifically mentions interface improvements or automation features, since that kind of perk would make the comparison more meaningful than price alone. 
Please leave the Patreon tabs and the ChatGPT pricing page open so I can glance at them afterward, and then give me a concise summary that groups the Patreon tiers, ChatGPT pricing, and the HellHades feature note into a simple budget-minded recommendation.","website":"https://www.google.com","level":"easy","reference_length":5,"categories":["Arts & Entertainment > Streaming & Online TV","Computers Electronics and Technology > Computers Electronics and Technology - Other","Finance > Finance - Other"],"precomputed_rubric":{"items":[{"criterion":"Identify the subscription tier names shown on the Matt and Shane’s Secret Podcast Patreon membership page.","description":"Identify the subscription tier names shown on the Matt and Shane’s Secret Podcast Patreon membership page.\n\nHow a grader verifies this: Grader can confirm the agent visited the Patreon page for Matt and Shane’s Secret Podcast and extracted the visible tier names from that page.","max_points":200},{"criterion":"Record the available membership tiers for both Matt and Shane’s Secret Podcast and Chris Sain on Patreon, including each tier’s name and price as shown on their membership/join pages.","description":"Record the available membership tiers for both Matt and Shane’s Secret Podcast and Chris Sain on Patreon, including each tier’s name and price as shown on their membership/join pages.\n\nHow a grader verifies this: Grader can confirm two Patreon creator pages were opened in separate tabs and that the reported tier names and prices match what is visible on each page.","max_points":300},{"criterion":"Capture the ChatGPT plan names and prices from the ChatGPT pricing page.","description":"Capture the ChatGPT plan names and prices from the ChatGPT pricing page.\n\nHow a grader verifies this: Grader can confirm the chatgpt.com pricing page was opened and that the reported plan names and prices match the visible pricing cards or table.","max_points":200},{"criterion":"Review the HellHades membership plans 
page and state whether any plan specifically mentions interface improvements or automation features.","description":"Review the HellHades membership plans page and state whether any plan specifically mentions interface improvements or automation features.\n\nHow a grader verifies this: Grader can confirm the agent reached the actual HellHades membership page from Google results and checked the visible plan descriptions for those feature mentions.","max_points":150},{"criterion":"Provide a concise budget-oriented recommendation that combines the Patreon tier comparison, ChatGPT pricing, and the HellHades feature note into a judgment about whether ChatGPT fits alongside one creator membership.","description":"Provide a concise budget-oriented recommendation that combines the Patreon tier comparison, ChatGPT pricing, and the HellHades feature note into a judgment about whether ChatGPT fits alongside one creator membership.\n\nHow a grader verifies this: Grader can confirm the final response synthesizes findings from Patreon, ChatGPT, and HellHades into a short recommendation rather than listing raw data only.","max_points":150}]}} +{"task_id":"c8d7c6136ca692eac6d7532e275d5f8d11ec971b","confirmed_task":"I’m trying to put together a really simple morning routine that starts with a quick math refresher and then shifts into beginner yoga I can actually keep using. Please start on Google and search for a video lesson that clearly teaches both explicit and recursive formulas for arithmetic sequences, then open the actual video page and note the title, creator or channel, and URL so I have a study piece to come back to; if it looks solid, leave that tab open for me. After that, go to YouTube and find a morning yoga video that’s right around 20 minutes long, open the actual video page, and tell me the title and duration so I can see whether it feels short enough for a real weekday routine. 
If that general option seems reasonable, stay on YouTube and specifically check whether Yoga With Adriene has a morning yoga video under 20 minutes that would fit the same need, and confirm from the video page or description whether it’s actually a vinyasa flow style session; please open that in its own tab too so I can compare the two yoga options side by side. To round this out, use Google to find at least three approachable or funny yoga instructors who post free classes on YouTube, and for each one give me the instructor’s name, a channel or website link, and a short note on why they seem especially beginner-friendly compared with the yoga videos you found earlier. In the end, I just want a concise resource list for this morning routine, with the key links, and please keep the math video plus the two yoga video tabs open so I can look at them myself.","website":"https://www.google.com","level":"easy","reference_length":4,"categories":["Science and Education > Math","Health > Nutrition Diets and Fitness"],"precomputed_rubric":{"items":[{"criterion":"A video lesson found via Google is opened that teaches both explicit and recursive formulas for arithmetic sequences, with the title, creator/channel name, and URL recorded.","description":"A video lesson found via Google is opened that teaches both explicit and recursive formulas for arithmetic sequences, with the title, creator/channel name, and URL recorded.\n\nHow a grader verifies this: Grader can confirm a Google search was used and the final open tab is the actual video page showing a relevant title/channel and a valid URL.","max_points":280},{"criterion":"A YouTube morning yoga video of approximately 20 minutes is opened, with its title and duration provided.","description":"A YouTube morning yoga video of approximately 20 minutes is opened, with its title and duration provided.\n\nHow a grader verifies this: Grader can verify the YouTube video page is open and the visible title and runtime are around 20 
minutes.","max_points":220},{"criterion":"A Yoga With Adriene morning yoga video under 20 minutes is opened in its own tab, and the response correctly confirms yes or no whether it is a vinyasa flow style session.","description":"A Yoga With Adriene morning yoga video under 20 minutes is opened in its own tab, and the response correctly confirms yes or no whether it is a vinyasa flow style session.\n\nHow a grader verifies this: Grader can confirm the separate YouTube tab is a Yoga With Adriene video under 20 minutes and that the vinyasa determination is supported by visible page text such as the title or description.","max_points":240},{"criterion":"At least three approachable or funny yoga instructors who post free YouTube classes are listed, each with instructor name, channel or website link, and a brief beginner-friendly rationale tied to the earlier yoga options.","description":"At least three approachable or funny yoga instructors who post free YouTube classes are listed, each with instructor name, channel or website link, and a brief beginner-friendly rationale tied to the earlier yoga options.\n\nHow a grader verifies this: Grader can confirm three distinct instructors were found via Google and that each entry includes instructor name, a valid channel or website link, and a comparative note about why the instructor seems approachable for beginners.","max_points":260}]}} +{"task_id":"e15345ed27f1933065af403601876e5f6597a943","confirmed_task":"I’m putting together a very simple reading-support mini lesson for a student who does better with easier-to-scan text, and I want you to help me pull the pieces together in the browser. Start on Google and find one online English grammar practice quiz that feels appropriate for about 5th grade, then open the actual quiz page so you can verify it’s really a student-facing practice activity and leave that tab open for me as a reference; I need the quiz title and direct URL. 
Then use Google again to find a printable worksheet or practice page for an 8th-grade student about basic marketing strategies like product, price, place, and promotion or closely related introductory marketing concepts, because I want it to work as an extension activity after the grammar warm-up; open the real worksheet or resource page in its own tab so I can see that it looks classroom-appropriate and printable, and note the title and URL. After that, go to Slidesgo and pick a fun, classroom-appropriate presentation template that could reasonably hold both the grammar warm-up and the marketing extension in one student lesson deck, and open the template’s actual page so I can see the preview images; please include the template name, URL, and whether it’s available for Google Slides or PowerPoint. Finally, go to 10015.io’s bionic reading converter and convert this exact lesson intro into bionic reading format so I can paste it into the first slide of the template you chose: “Today we will warm up with a short grammar quiz, then practice how people use basic marketing strategies like product, price, place, and promotion. 
Read each direction carefully and do your best.” Please keep the useful tabs open and send me the quiz, worksheet, template, and the converted text in one clean summary.","website":"https://www.google.com","level":"easy","reference_length":4,"categories":["Science and Education > Education"],"precomputed_rubric":{"items":[{"criterion":"A real online English grammar practice quiz appropriate for about 5th grade is found via Google, and the final response includes the quiz title and direct quiz page URL.","description":"A real online English grammar practice quiz appropriate for about 5th grade is found via Google, and the final response includes the quiz title and direct quiz page URL.\n\nHow a grader verifies this: Grader can confirm there is an open tab showing the actual quiz page, not just search results, and the reported title/URL match the visible page.","max_points":280},{"criterion":"A printable worksheet or practice page about basic marketing strategies or a closely related introductory marketing topic for about 8th grade is found via Google, and the final response includes the resource title and access URL.","description":"A printable worksheet or practice page about basic marketing strategies or a closely related introductory marketing topic for about 8th grade is found via Google, and the final response includes the resource title and access URL.\n\nHow a grader verifies this: Grader can confirm there is an open tab showing the actual worksheet/resource page with printable or classroom-use cues, and the reported title/URL match the visible page.","max_points":280},{"criterion":"One Slidesgo template that is fun and classroom-appropriate for combining both activities into a single lesson deck is selected, and the final response includes the template name, Slidesgo URL, and use/download option.","description":"One Slidesgo template that is fun and classroom-appropriate for combining both activities into a single lesson deck is selected, and the final 
response includes the template name, Slidesgo URL, and use/download option.\n\nHow a grader verifies this: Grader can confirm the Slidesgo template detail page is open with visible preview images and that the named template, URL, and Google Slides/PowerPoint option match the page.","max_points":220},{"criterion":"The provided lesson intro is converted on 10015.io into bionic reading format and the full converted text is returned.","description":"The provided lesson intro is converted on 10015.io into bionic reading format and the full converted text is returned.\n\nHow a grader verifies this: Grader can confirm the converter page shows transformed output corresponding to the provided passage and that the returned text matches the visible converted result.","max_points":220}]}} +{"task_id":"38039bd8d8469c245faab531cb508c3c975c4869","confirmed_task":"I’m putting together a simple hummingbird-themed flyer draft and want you to help me gather a few references in a practical order so I can actually see what might work together on screen. First, on Google Images, search for a photo of a hummingbird flying among flowers in a sunlit garden and open the actual image result that feels strongest as the main visual reference, then keep that image page open in its own tab and save the image result URL for me. Once you have that nature image as a reference point, go to Canva’s Templates page, search for flyer templates, and pick two template names that would suit a bright, colorful garden-style hummingbird flyer; please open each template in a separate tab too so I can compare the layouts visually later. After that, use Google to find a simple CSS example for styling an HTML unordered list with ul and li selectors, including an example that changes the bullet style with list-style-type, because I may turn the flyer details into a small webpage and want a clean bulleted section style. 
Finally, go to Purdue OWL and find the APA guidance for citing a PowerPoint presentation so I know how to credit it properly if I turn this flyer concept into slides. At the end, send me the hummingbird image result URL, the two Canva template names, the CSS example you found, and a concise summary of the Purdue OWL APA citation guidance, and leave the image tab plus the two Canva template tabs open so I can look at them.","website":"https://www.google.com","level":"easy","reference_length":4,"categories":["Hobbies and Leisure > Photography","Arts & Entertainment > Visual Arts and Design"],"precomputed_rubric":{"items":[{"criterion":"Provide one Google Images result URL for a photo that matches a hummingbird flying among flowers in a sunlit garden, with the selected image page left open.","description":"Provide one Google Images result URL for a photo that matches a hummingbird flying among flowers in a sunlit garden, with the selected image page left open.\n\nHow a grader verifies this: Grader confirms the final response includes a Google Images result URL and the browser shows the chosen hummingbird image page open in a tab with imagery matching the described scene.","max_points":300},{"criterion":"Provide the names of two Canva flyer templates that plausibly suit a bright hummingbird garden flyer, with each template opened in its own tab.","description":"Provide the names of two Canva flyer templates that plausibly suit a bright hummingbird garden flyer, with each template opened in its own tab.\n\nHow a grader verifies this: Grader confirms two template names are listed in the response and corresponding Canva template pages are open in separate tabs showing flyer layouts.","max_points":250},{"criterion":"Provide a basic CSS example for styling an unordered list using ul and li selectors, including a demonstration of list-style-type.","description":"Provide a basic CSS example for styling an unordered list using ul and li selectors, including a demonstration of 
list-style-type.\n\nHow a grader verifies this: Grader confirms the response includes CSS code with ul/li styling and an explicit use of list-style-type to change bullet appearance.","max_points":200},{"criterion":"Provide Purdue OWL APA guidance for citing a PowerPoint presentation, including the key citation rules in concise form.","description":"Provide Purdue OWL APA guidance for citing a PowerPoint presentation, including the key citation rules in concise form.\n\nHow a grader verifies this: Grader confirms the response summarizes citation guidance drawn from Purdue OWL and includes the essential APA formatting elements for a PowerPoint presentation citation.","max_points":250}]}} +{"task_id":"4d6c838dec27532db3f999755cebc1732f4cbe8b","confirmed_task":"I’m putting together a quick note for a friend who does front-end web work on a Mac and is also dealing with a few annoying Apple-device issues, so could you help me verify everything in a browser? First, go to JetBrains and figure out which IDE they position specifically for editing and organizing web code, because that’s the one I want to recommend, and open the actual product page so I can see that you’re on the right tool. Then use Google to look into the problem where HomePods randomly start playing Apple Music and suddenly jump in volume, and base that on public discussions or support threads so we have a likely explanation plus at least one practical fix. After that, still using Google, find one reliable way to share or mirror an iPhone screen to another device like a TV or Mac, since that could help my friend demonstrate the issue, and open the source page in its own tab. Finally, use Google again to find how to switch an iPad from the floating mini keyboard back to the full-size keyboard, ideally from an Apple support page or another clearly trustworthy source, and leave that page open too so I can glance at the exact instructions. 
Once you’ve checked all of that, send me one compact note that names the JetBrains IDE and includes the HomePod explanation and fix, the iPhone mirroring method with steps, and the iPad keyboard fix.","website":"https://www.google.com","level":"easy","reference_length":6,"categories":["Computers Electronics and Technology > Programming and Developer Software","Computers Electronics and Technology > Consumer Electronics"],"precomputed_rubric":{"items":[{"criterion":"Correctly identify the JetBrains IDE intended for editing and organizing web code and name it as the recommendation.","description":"Correctly identify the JetBrains IDE intended for editing and organizing web code and name it as the recommendation.\n\nHow a grader verifies this: Grader can confirm the browser is on the relevant JetBrains product page and the final note names the correct IDE.","max_points":200},{"criterion":"Provide a plausible explanation for HomePods randomly playing Apple Music and suddenly increasing volume based on public discussions or support-style sources.","description":"Provide a plausible explanation for HomePods randomly playing Apple Music and suddenly increasing volume based on public discussions or support-style sources.\n\nHow a grader verifies this: Grader can confirm a Google results path to a discussion/support source and see the explanation reflected in the final note.","max_points":180},{"criterion":"Include at least one suggested fix for the HomePod random playback/volume issue.","description":"Include at least one suggested fix for the HomePod random playback/volume issue.\n\nHow a grader verifies this: Grader can verify a fix was extracted from the researched source and included in the final note.","max_points":170},{"criterion":"Summarize one reliable method for sharing or mirroring an iPhone screen to another device, including the necessary steps.","description":"Summarize one reliable method for sharing or mirroring an iPhone screen to another device, including 
the necessary steps.\n\nHow a grader verifies this: Grader can confirm a source page for iPhone mirroring is open in its own tab and the final note includes a usable step summary.","max_points":170},{"criterion":"Summarize how to switch an iPad from the floating mini keyboard back to the full-size keyboard.","description":"Summarize how to switch an iPad from the floating mini keyboard back to the full-size keyboard.\n\nHow a grader verifies this: Grader can confirm a trustworthy instruction page is open and the final note includes the correct gesture or keyboard-button method.","max_points":130},{"criterion":"Return one compact note that combines the JetBrains IDE recommendation with troubleshooting tips for all three Apple-related issues.","description":"Return one compact note that combines the JetBrains IDE recommendation with troubleshooting tips for all three Apple-related issues.\n\nHow a grader verifies this: Grader can review the final response and confirm it includes the IDE name, HomePod explanation and fix, iPhone mirroring steps, and iPad keyboard steps in one concise note.","max_points":150}]}} +{"task_id":"0106b570440ffe4427d5e916f39ec986ab3de917","confirmed_task":"I want to make myself a quick bargain roundup and keep it grounded in deals that are actually live on the sites right now. Please start on Slickdeals and open whatever is currently being shown as the featured best deal, then grab the exact title and current price so I have a benchmark for what counts as a standout offer today; leave that deal page open in its own tab so I can look at it afterward. Then go to CheapCharts and browse the current iTunes deals to find one on-sale movie, one on-sale TV season, and one on-sale audiobook that feel like easy low-cost digital add-ons compared with the Slickdeals benchmark, and open each of those actual CheapCharts deal pages in separate tabs so I can visually compare them. 
After that, head to the Epic Games Store homepage, then open the current seasonal or featured sale page, and also pull up the product page for Split Fiction in another tab so I can include one game-store option alongside the media deals and have proof you actually viewed both Epic pages. When you’re done, give me a concise roundup with the Slickdeals featured deal, the three CheapCharts picks labeled by category with prices, and a short confirmation that you viewed the Epic Games Store homepage, the current seasonal or featured sale page, and the Split Fiction product page.","website":"https://www.google.com","level":"easy","reference_length":6,"categories":["Ecommerce & Shopping > Coupons and Rebates","Ecommerce & Shopping > Ecommerce and Shopping - Other"],"precomputed_rubric":{"items":[{"criterion":"The current featured best deal on Slickdeals is identified from the site and its exact title and displayed price are recorded.","description":"The current featured best deal on Slickdeals is identified from the site and its exact title and displayed price are recorded.\n\nHow a grader verifies this: Grader can confirm the open Slickdeals deal tab matches the reported title and price visible on the deal page.","max_points":240},{"criterion":"One current on-sale CheapCharts movie is selected and its title, category, and displayed price are recorded from the actual deal page.","description":"One current on-sale CheapCharts movie is selected and its title, category, and displayed price are recorded from the actual deal page.\n\nHow a grader verifies this: Grader can confirm an open CheapCharts movie tab shows the same title and price and that it is a movie listing.","max_points":140},{"criterion":"One current on-sale CheapCharts TV season is selected and its title, category, and displayed price are recorded from the actual deal page.","description":"One current on-sale CheapCharts TV season is selected and its title, category, and displayed price are recorded from the 
actual deal page.\n\nHow a grader verifies this: Grader can confirm an open CheapCharts TV season tab shows the same title and price and that it is a TV season listing.","max_points":140},{"criterion":"One current on-sale CheapCharts audiobook is selected and its title, category, and displayed price are recorded from the actual deal page.","description":"One current on-sale CheapCharts audiobook is selected and its title, category, and displayed price are recorded from the actual deal page.\n\nHow a grader verifies this: Grader can confirm an open CheapCharts audiobook tab shows the same title and price and that it is an audiobook listing.","max_points":140},{"criterion":"The Epic Games Store homepage, current seasonal or featured sale page, and Split Fiction product page are all viewed, with the sale page and product page opened for visible browser proof.","description":"The Epic Games Store homepage, current seasonal or featured sale page, and Split Fiction product page are all viewed, with the sale page and product page opened for visible browser proof.\n\nHow a grader verifies this: Grader can confirm browser history or open tabs show the Epic homepage was visited and that a seasonal or featured sale page and Split Fiction product page were opened.","max_points":180},{"criterion":"The final roundup is concise and includes the Slickdeals featured deal, all three CheapCharts items with category labels and prices, and explicit confirmation of the Epic page views.","description":"The final roundup is concise and includes the Slickdeals featured deal, all three CheapCharts items with category labels and prices, and explicit confirmation of the Epic page views.\n\nHow a grader verifies this: Grader can compare the final response against the captured site data and confirm all required items and confirmations are present.","max_points":160}]}} +{"task_id":"c6b29e8564a7ae86dc50a1f074bdc2b5abb3754a","confirmed_task":"I want to grab some Pokémon trading cards pretty 
quickly, but only if they’re actually available to buy right now, so could you check a few retailers for me and keep the product pages open in separate tabs so I can visually compare them afterward? Start on BestBuy.com and search for Pokémon trading card products, then find at least two items that show they’re in stock right now and note each product’s full name, current price, and exactly what the availability message says on the page. After that, go to BarnesandNoble.com and find one Pokémon card item that’s clearly in stock, open the actual product page so I can see the listing itself, and grab the title and listed price. Then head to Walmart.com, search for a Pokémon card product, and make sure the one you pick is sold and shipped by Walmart rather than a marketplace seller, then record the product name and price and leave that product page open too. Once you’ve got those pages, compare the in-stock options across all three stores and tell me which available item is the cheapest overall and where I should buy it. 
Then, make the tab for this option active.","website":"https://www.google.com","level":"easy","reference_length":5,"categories":["Ecommerce & Shopping > Ecommerce and Shopping - Other","Games > Games - Other"],"precomputed_rubric":{"items":[{"criterion":"Identify at least two Pokémon trading card products on Best Buy that are currently in stock and capture each item’s full product name, current price, and visible availability status.","description":"Identify at least two Pokémon trading card products on Best Buy that are currently in stock and capture each item’s full product name, current price, and visible availability status.\n\nHow a grader verifies this: Grader can confirm two separate Best Buy product pages or listings are open and each shows a Pokémon card item with a price and an in-stock style availability message visible on screen.","max_points":350},{"criterion":"Identify one Pokémon trading card item on Barnes & Noble that is in stock and record its product title and listed price from the product page.","description":"Identify one Pokémon trading card item on Barnes & Noble that is in stock and record its product title and listed price from the product page.\n\nHow a grader verifies this: Grader can confirm a Barnes & Noble product page is open for a Pokémon card item and that the page visibly shows the title, price, and an in-stock/available status.","max_points":200},{"criterion":"Identify one Pokémon card product on Walmart that is sold and shipped by Walmart and record its product name and price.","description":"Identify one Pokémon card product on Walmart that is sold and shipped by Walmart and record its product name and price.\n\nHow a grader verifies this: Grader can confirm a Walmart product page is open and visibly indicates Walmart as the seller/shipper, along with the product title and price.","max_points":200},{"criterion":"Keep the selected product pages open in separate tabs for visual comparison.","description":"Keep the selected product 
pages open in separate tabs for visual comparison.\n\nHow a grader verifies this: Grader can confirm multiple retailer product tabs remain open, including at least the chosen Barnes & Noble and Walmart pages and the selected Best Buy product pages.","max_points":50},{"criterion":"Compare the verified in-stock options across Best Buy, Barnes & Noble, and Walmart and state which available product is the cheapest overall and where to buy it. The cheapest option is the active tab on the page.","description":"Compare the verified in-stock options across Best Buy, Barnes & Noble, and Walmart and state which available product is the cheapest overall and where to buy it. The cheapest option is the active tab on the page.\n\nHow a grader verifies this: Grader can compare the recorded prices from the open pages and confirm the final answer names the lowest-priced in-stock item and the correct retailer, and that this option is the currently active tab on the page.","max_points":200}]}} +{"task_id":"5e09c06aa92c1e252db5106a72e8d61e59356c7e","confirmed_task":"I’m trying to ease into a simple yoga routine here in Fresno and want something that combines one local studio option with one easy at-home session. Could you start on coilyoga.com and look through what Coil Yoga in Fresno offers so I can get a feel for the studio’s vibe, the kinds of classes they teach, and what a beginner might actually be walking into; please make sure you open the actual classes page and pull at least one specific detail from there, and leave that page open so I can look at it later. Then, with that local context in mind, go to toweryogafresno.com and find Tower Yoga Fresno’s schedule, and tell me the next three upcoming classes with their start times so I can see what would realistically fit into my week; if possible, open the schedule in its own tab and keep it visible as proof of the class times. 
After that, head to YouTube and search for “50 minute yin yoga,” then compare the visible results and pick the one with the highest view count so I have a home practice to pair with the studio option; open the actual video page, tell me the title, channel, and view count, and leave the video tab open so I can reference it. Once you’ve seen all three sites, give me a short beginner-friendly recommendation on whether Coil Yoga or Tower Yoga seems like the better starting point for me based on what you found, and pair that choice with the YouTube session as a simple Fresno yoga starter plan.","website":"https://www.google.com","level":"easy","reference_length":5,"categories":["Health > Nutrition Diets and Fitness","Hobbies and Leisure > Hobbies and Leisure - Other"],"precomputed_rubric":{"items":[{"criterion":"Summarize Coil Yoga in Fresno using its website, including the studio’s overall offerings, class types, and at least one specific detail taken from the classes page.","description":"Summarize Coil Yoga in Fresno using its website, including the studio’s overall offerings, class types, and at least one specific detail taken from the classes page.\n\nHow a grader verifies this: Grader confirms the response includes a Coil Yoga summary with a concrete classes-page detail and that the classes page is opened or referenced as the source.","max_points":240},{"criterion":"Report the next three upcoming classes from Tower Yoga Fresno’s schedule, each with its start time.","description":"Report the next three upcoming classes from Tower Yoga Fresno’s schedule, each with its start time.\n\nHow a grader verifies this: Grader confirms three upcoming Tower Yoga classes and their start times match the visible schedule page left open in the browser.","max_points":280},{"criterion":"Identify the YouTube search result for “50 minute yin yoga” with the highest visible view count and report its title, channel, and view count.","description":"Identify the YouTube search result for 
“50 minute yin yoga” with the highest visible view count and report its title, channel, and view count.\n\nHow a grader verifies this: Grader confirms the YouTube search results were compared by visible view counts and the selected video page shows the reported title, channel, and views.","max_points":220},{"criterion":"Recommend whether Coil Yoga or Tower Yoga is the better beginner starting point based on the findings from the two local studio websites.","description":"Recommend whether Coil Yoga or Tower Yoga is the better beginner starting point based on the findings from the two local studio websites.\n\nHow a grader verifies this: Grader confirms the recommendation explicitly chooses one studio and cites evidence from the studio offerings and/or schedule findings.","max_points":140},{"criterion":"Provide a final Fresno yoga starter plan that pairs one local studio recommendation with the selected at-home YouTube session.","description":"Provide a final Fresno yoga starter plan that pairs one local studio recommendation with the selected at-home YouTube session.\n\nHow a grader verifies this: Grader confirms the final answer combines the chosen studio option with the identified YouTube video details into one coherent starter plan.","max_points":120}]}} +{"task_id":"170314aa3a93c7ca6e959be7757cad178efc06dc","confirmed_task":"I’m sketching out a compact but very high-end PC build and want you to do the first pass of browser research so I have a clean shortlist to look at later. Start on Google and find the official ASUS ROG page for the ROG Swift OLED PG27UCDM monitor, because I want to anchor the build around the exact display model rather than a reseller listing; open the real ASUS product page and leave that tab open so I can glance at the specs and photos myself. 
Once that’s pinned down, use Amazon or Google search results to track down at least three reputable 2025 “best mini-ITX motherboard” recommendation articles from established tech sites, since I’m trying to match a small-form-factor motherboard to a premium monitor-and-workstation setup; open each recommendation page in its own tab so I can compare them side by side, and make sure you capture the page title, site name, and URL for each one. After that, go to B&H Photo and find the actual product page for the NVIDIA RTX Pro 6000 Blackwell graphics card as the GPU candidate for this same build, and leave that B&H page open too so I can verify it’s the real listing. In the end, send me the ASUS monitor page URL, the three motherboard recommendation entries with titles, sites, and links, and the B&H GPU page URL.","website":"https://www.google.com","level":"easy","reference_length":5,"categories":["Computers Electronics and Technology > Computer Hardware","Computers Electronics and Technology > Consumer Electronics"],"precomputed_rubric":{"items":[{"criterion":"Provide the official ASUS ROG product page URL for the ROG Swift OLED PG27UCDM monitor.","description":"Provide the official ASUS ROG product page URL for the ROG Swift OLED PG27UCDM monitor.\n\nHow a grader verifies this: Grader confirms the returned URL is an official ASUS/ROG product page for the exact PG27UCDM model and that the browser shows the ASUS product page open.","max_points":250},{"criterion":"Identify at least three reputable 2025 best mini-ITX motherboard recommendation pages from established tech sites.","description":"Identify at least three reputable 2025 best mini-ITX motherboard recommendation pages from established tech sites.\n\nHow a grader verifies this: Grader confirms there are at least three distinct recommendation pages focused on 2025 mini-ITX motherboard picks and that each page is open in its own tab.","max_points":300},{"criterion":"For each motherboard recommendation source, 
provide the page title, site name, and URL accurately.","description":"For each motherboard recommendation source, provide the page title, site name, and URL accurately.\n\nHow a grader verifies this: Grader compares the returned titles, site names, and URLs against the visible article tabs and page headers.","max_points":200},{"criterion":"Provide the B&H Photo product page URL for the NVIDIA RTX Pro 6000 Blackwell graphics card.","description":"Provide the B&H Photo product page URL for the NVIDIA RTX Pro 6000 Blackwell graphics card.\n\nHow a grader verifies this: Grader confirms the returned URL is a B&H product listing for the NVIDIA RTX Pro 6000 Blackwell and that the B&H product page is visibly open.","max_points":200},{"criterion":"Return a complete final summary containing the ASUS monitor URL, all three motherboard recommendation entries, and the B&H GPU URL.","description":"Return a complete final summary containing the ASUS monitor URL, all three motherboard recommendation entries, and the B&H GPU URL.\n\nHow a grader verifies this: Grader checks that all requested items are present together in the final response with no missing fields.","max_points":50}]}} +{"task_id":"9220309abe6e209dfedc978078c93f79fbd45ef1","confirmed_task":"I’m trying to put together a simple men’s outfit shortlist and want a quick mix of accessories, basics, and one resale piece, so could you help me browse a few sites like you would if you were sitting at my laptop with me? Start on ASOS and search for black men’s watches, then tell me whether the results page shows a total item count and what that number is, because I want to know if watches are actually easy to browse there; leave that results page open so I can glance at it later. 
After that, on SKIMS, pick a safer basics item by finding one men’s soft cotton boxer-brief product that clearly comes in multiple colors and multiple sizes, and tell me the product name plus a few color and size options; keep the product page open so I can see the swatches and size choices myself. Finally, since I may mix in something secondhand, go to the Poshmark page for men's accessories and identify one listing that’s currently available, making sure you open the actual listing page so you can verify it’s still live. At the end, give me a concise shopping summary with all three findings so I have a usable shortlist.","website":"https://www.google.com","level":"easy","reference_length":4,"categories":["Ecommerce & Shopping > Ecommerce and Shopping - Other","Lifestyle > Fashion and Apparel"],"precomputed_rubric":{"items":[{"criterion":"Report the ASOS search-results total for black men’s watches, or explicitly state that ASOS does not display a total count.","description":"Report the ASOS search-results total for black men’s watches, or explicitly state that ASOS does not display a total count.\n\nHow a grader verifies this: Grader confirms the ASOS results page is open and shows either a visible results count for black men’s watches or evidence that no count is displayed.","max_points":270},{"criterion":"Select one SKIMS men’s soft cotton boxer-brief product that comes in multiple colors and multiple sizes, and provide the product name along with a few available color options and size options.","description":"Select one SKIMS men’s soft cotton boxer-brief product that comes in multiple colors and multiple sizes, and provide the product name along with a few available color options and size options.\n\nHow a grader verifies this: Grader confirms the SKIMS product page is open and visibly shows the chosen men’s soft cotton boxer-brief with multiple color swatches and multiple size choices.","max_points":330},{"criterion":"Identify one currently available listing 
from the Poshmark men's accessories page.","description":"Identify one currently available listing from the Poshmark men's accessories page.\n\nHow a grader verifies this: Grader confirms the opened listing page shows a listing that is available/live in the men's accessories category.","max_points":200},{"criterion":"Return the findings as a concise shopping summary that includes all three sources and the requested shortlist-oriented details.","description":"Return the findings as a concise shopping summary that includes all three sources and the requested shortlist-oriented details.\n\nHow a grader verifies this: Grader checks the final response includes ASOS count status, SKIMS product with colors and sizes, and one available Poshmark listing.","max_points":200}]}} +{"task_id":"328ce861b58b2cf2e6da520040193710f95cfe56","confirmed_task":"I want to put together a really simple at-home yoga plan using only free YouTube videos because I’m trying to ease into a routine without paying for an app. Could you start on Google and find at least three yoga instructors who seem especially approachable, beginner-friendly, or a little funny and who clearly post free classes on YouTube, so I have a shortlist of personalities that feel welcoming rather than intimidating. For each one, grab the instructor name, the YouTube channel name, and one example video title. Then head over to YouTube and, using that shortlist, pick one morning yoga video that’s around 20 minutes long from one of those instructors so I have an easy option for weekdays; please note the exact title, duration, and link. After that, find a separate Vinyasa flow video on YouTube that’s about 30 minutes long so I have a slightly longer practice option too, and give me its title and link as well. Please open the two chosen videos in separate tabs so I can compare them, and start playing the 30-minute Vinyasa one long enough to confirm it’s the right video before leaving that tab open. 
In the end, send me the full yoga plan with the instructor shortlist plus both selected videos and their details.","website":"https://www.google.com","level":"easy","reference_length":5,"categories":["Health > Nutrition Diets and Fitness"],"precomputed_rubric":{"items":[{"criterion":"Provide a shortlist of at least 3 yoga instructors who appear approachable, beginner-friendly, or funny and who post free yoga classes on YouTube, including each instructor’s name, YouTube channel name, and one example video title.","description":"Provide a shortlist of at least 3 yoga instructors who appear approachable, beginner-friendly, or funny and who post free yoga classes on YouTube, including each instructor’s name, YouTube channel name, and one example video title.\n\nHow a grader verifies this: Grader confirms the final response includes 3 or more instructors with all three required fields and that the information could reasonably be sourced from Google results leading to YouTube channels or videos.","max_points":350},{"criterion":"Select one morning yoga video on YouTube that is approximately 20 minutes long and is from one of the instructors in the shortlist.","description":"Select one morning yoga video on YouTube that is approximately 20 minutes long and is from one of the instructors in the shortlist.\n\nHow a grader verifies this: Grader confirms the chosen morning video is identified with title, duration, and link, and that the instructor matches one of the shortlist entries.","max_points":250},{"criterion":"Select one separate YouTube Vinyasa flow video that is approximately 30 minutes long and provide its title and link.","description":"Select one separate YouTube Vinyasa flow video that is approximately 30 minutes long and provide its title and link.\n\nHow a grader verifies this: Grader confirms the final response includes a distinct Vinyasa flow video with title and URL and that the runtime is about 30 minutes based on the visible YouTube listing or 
player.","max_points":200},{"criterion":"Open the chosen morning yoga video and the chosen Vinyasa flow video in separate browser tabs, and start playing the longer Vinyasa video briefly before leaving it open.","description":"Open the chosen morning yoga video and the chosen Vinyasa flow video in separate browser tabs, and start playing the longer Vinyasa video briefly before leaving it open.\n\nHow a grader verifies this: Grader confirms visible browser state shows both YouTube video tabs open and evidence that the 30-minute Vinyasa video player was started.","max_points":100},{"criterion":"Present the final output as one combined yoga plan containing the instructor shortlist, the chosen morning video with title, duration, and link, and the separate Vinyasa flow video with title and link.","description":"Present the final output as one combined yoga plan containing the instructor shortlist, the chosen morning video with title, duration, and link, and the separate Vinyasa flow video with title and link.\n\nHow a grader verifies this: Grader confirms the response combines all requested pieces into a single coherent plan rather than scattered notes.","max_points":100}]}} +{"task_id":"795687ed918e45a6ad255215aa2a517b3e014aa5","confirmed_task":"I’m curious whether any new seasonal drinks are actually landing well right now, so could you open Reddit and check the newest posts in r/starbucks for anything about recent or seasonal drink releases, then read into the comments enough to tell me whether people seem excited, disappointed, or mixed on them. After that, stay on Reddit and do the same thing in r/DunkinDonuts so I can compare whether Dunkin’s newly discussed drinks are getting a warmer or colder reaction than Starbucks at the moment. If one drink post clearly looks the most loved, most upvoted, or just the most viral between the two subreddits, open that specific post in a separate tab and leave it there so I can look at it myself. 
Then, as a totally separate palate cleanser, go to Bored Panda and open their collection of cute and funny angry cat photos shared by owners, and tell me the exact page title while leaving that page open too so I can glance through the pictures later.","website":"https://www.google.com","level":"easy","reference_length":5,"categories":["Food and Drink > Beverages","Computers Electronics and Technology > Social Media Networks"],"precomputed_rubric":{"items":[{"criterion":"Review recent posts on r/starbucks and identify posts about new or seasonal drink releases with a summary of commenter reactions.","description":"Review recent posts on r/starbucks and identify posts about new or seasonal drink releases with a summary of commenter reactions.\n\nHow a grader verifies this: Grader can confirm the agent visited r/starbucks recent/new content and the final response includes at least one relevant new-drink discussion plus sentiment such as positive, negative, or mixed.","max_points":250},{"criterion":"Review recent posts on r/DunkinDonuts about newly discussed drinks and summarize community sentiment.","description":"Review recent posts on r/DunkinDonuts about newly discussed drinks and summarize community sentiment.\n\nHow a grader verifies this: Grader can confirm the agent visited r/DunkinDonuts and the final response includes at least one recent drink discussion with sentiment characterization.","max_points":200},{"criterion":"Explicitly compare Dunkin drink reception against Starbucks drink reception.","description":"Explicitly compare Dunkin drink reception against Starbucks drink reception.\n\nHow a grader verifies this: Final response states whether Dunkin's newly discussed drinks are being received more positively, more negatively, or about the same relative to Starbucks, based on the subreddit findings.","max_points":200},{"criterion":"Identify the single standout drink item or post with the highest reviews, strongest positivity, or most virality, and open it 
in a separate tab.","description":"Identify the single standout drink item or post with the highest reviews, strongest positivity, or most virality, and open it in a separate tab.\n\nHow a grader verifies this: Grader can confirm a separate Reddit tab is open to the chosen standout post and the final response names that standout item/post.","max_points":200},{"criterion":"Open the Bored Panda collection of cute and funny angry cats shared by owners and provide the exact page title.","description":"Open the Bored Panda collection of cute and funny angry cats shared by owners and provide the exact page title.\n\nHow a grader verifies this: Grader can confirm the Bored Panda page is open and the reported title matches the visible page title.","max_points":150}]}} +{"task_id":"8c30f2f9ceeac75b05c725c5397022bb4f9d32a0","confirmed_task":"I’m putting together a super short beginner-friendly AI explainer for someone who doesn’t know much about the topic yet, so I want it to move from simple definitions to a recognizable product and then end with a real hardware example. Please start on Google and find IBM’s page that explains the main types of artificial intelligence and machine learning, then pull out at least three categories and rewrite them in plain English with one sentence each so they sound easy to follow. Once you’ve got that foundation, go to Copilot.com and figure out what the site is, then give me exactly one sentence on its purpose as an AI product a beginner would probably recognize; if there’s a landing page or main homepage, leave that open so I can glance at it myself. After that, use Google again to find the Tom’s Hardware article about Intel Granite Rapids WS competing with AMD Threadripper, open the actual Tom’s Hardware article in a new tab so I can see the headline and page for myself, and summarize the key points including the main competitive claims and any specs the article cites like core count or boost clock. 
Then tie that hardware example back to why strong chips matter for AI or advanced computing, and give me one concise final write-up that combines the IBM basics, the one-sentence Copilot description, and the Tom’s Hardware takeaway in a way a beginner could understand.","website":"https://www.google.com","level":"easy","reference_length":6,"categories":["Computers Electronics and Technology > Computers Electronics and Technology - Other","Science and Education > Science and Education - Other"],"precomputed_rubric":{"items":[{"criterion":"Provide at least three IBM-described categories related to artificial intelligence and/or machine learning, each rewritten in plain English with a one-sentence description.","description":"Provide at least three IBM-described categories related to artificial intelligence and/or machine learning, each rewritten in plain English with a one-sentence description.\n\nHow a grader verifies this: Final response includes three or more distinct IBM-based categories with simple one-sentence explanations traceable to the IBM page opened from Google results.","max_points":300},{"criterion":"Identify what Copilot is and summarize its purpose in exactly one sentence as an AI-related product example a beginner would recognize.","description":"Identify what Copilot is and summarize its purpose in exactly one sentence as an AI-related product example a beginner would recognize.\n\nHow a grader verifies this: Final response contains a single sentence describing Copilot’s purpose, and the Copilot homepage or landing page is visibly open in the browser.","max_points":150},{"criterion":"Find and summarize the Tom’s Hardware article about Intel Granite Rapids WS competing with AMD Threadripper, including the main competitive claims and any cited specs such as core count and boost clock.","description":"Find and summarize the Tom’s Hardware article about Intel Granite Rapids WS competing with AMD Threadripper, including the main competitive claims and 
any cited specs such as core count and boost clock.\n\nHow a grader verifies this: A Tom’s Hardware article page is open in a separate tab, and the final response includes the article’s key claims plus cited specs mentioned in the article.","max_points":250},{"criterion":"Connect the Tom’s Hardware hardware summary back to why powerful hardware matters for AI or advanced computing.","description":"Connect the Tom’s Hardware hardware summary back to why powerful hardware matters for AI or advanced computing.\n\nHow a grader verifies this: Final response explicitly links the hardware comparison to AI workloads, advanced computing, or the need for strong compute performance.","max_points":100},{"criterion":"Return a concise final write-up that combines the IBM categories, the one-sentence Copilot description, and the Tom’s Hardware summary into a beginner-friendly explainer.","description":"Return a concise final write-up that combines the IBM categories, the one-sentence Copilot description, and the Tom’s Hardware summary into a beginner-friendly explainer.\n\nHow a grader verifies this: Final answer is a unified, concise explainer rather than disconnected notes, and it includes all three required parts in a beginner-friendly flow.","max_points":150},{"criterion":"Open the Tom’s Hardware article in a new browser tab during the browsing process.","description":"Open the Tom’s Hardware article in a new browser tab during the browsing process.\n\nHow a grader verifies this: Browser state shows the Tom’s Hardware article open in its own tab, separate from the Google search results tab.","max_points":50}]}} +{"task_id":"2504a7886c3dcb33f1aac7c5d2831985887e789e","confirmed_task":"I’m trying to decide whether brunch in San Francisco makes sense today, so could you start on weather.com and pull up the San Francisco 10-day forecast, then tell me the highs and lows for the next three days so I have a quick weather reality check before I head out. 
If the forecast looks decent enough for going out, switch over to Beach Chalet Restaurant & Brewery’s site and find the actual brunch page so I can see the posted schedule myself; let me know which days brunch is offered and the listed start and end times, and leave that brunch page open in a tab for me. Then use that timing as a reference and go to Fat Choy World’s website, open its current menu page, and figure out whether it appears to be open right now based on the hours or live status shown there, because I’m trying to decide whether to stick with a brunch plan or pivot to another meal instead. Please keep the Beach Chalet brunch page and the Fat Choy World menu page open in separate tabs so I can compare them visually afterward.","website":"https://www.google.com","level":"easy","reference_length":3,"categories":["Science and Education > Weather","Food and Drink > Restaurants and Delivery","Travel and Tourism > Travel and Tourism - Other"],"precomputed_rubric":{"items":[{"criterion":"Report the forecasted high and low temperatures for each of the next 3 days from weather.com’s San Francisco 10-day forecast.","description":"Report the forecasted high and low temperatures for each of the next 3 days from weather.com’s San Francisco 10-day forecast.\n\nHow a grader verifies this: Grader can confirm the weather.com 10-day forecast page for San Francisco is open and that the reported highs/lows match the first three forecast days shown on the page.","max_points":350},{"criterion":"Find Beach Chalet Restaurant & Brewery’s brunch schedule and report the days brunch is offered along with the posted start and end times.","description":"Find Beach Chalet Restaurant & Brewery’s brunch schedule and report the days brunch is offered along with the posted start and end times.\n\nHow a grader verifies this: Grader can confirm the Beach Chalet brunch page is open in a tab and that the response matches the visible brunch days and hours shown on that 
page.","max_points":350},{"criterion":"Use Fat Choy World’s current menu page to determine whether the restaurant is open right now based on the posted hours or status.","description":"Use Fat Choy World’s current menu page to determine whether the restaurant is open right now based on the posted hours or status.\n\nHow a grader verifies this: Grader can confirm the Fat Choy World menu page is open in a separate tab and that the open-now determination is supported by the visible hours or status on the page at the time of browsing.","max_points":300}]}} +{"task_id":"71f8e3e9b5a24f37f492fbf97b7d31e08e9a8d61","confirmed_task":"I’m in the UK and trying to work out whether starting a side hustle on top of my £55,500 salary is going to create extra tax admin, so could you check a few things in the browser for me? First, go to the official HMRC side-hustle guidance on taxhelpforhustlers.campaign.gov.uk and pull out the kinds of side-hustle income they say need to be reported, especially so I can tell whether things like reselling items online or doing delivery or other gig work would fall into those categories; please open the actual HMRC guidance page and leave it open so I can look at the wording myself. Then go to the Reed UK tax calculator on reed.co.uk, enter an annual salary of £55,500, and tell me the annual income tax figure it shows as my baseline; if the results page is separate, leave that open too so I can compare it with the HMRC guidance. After that, use Google to find a UK paycheck or salary calculator that shows a single pay-period net pay amount for a £55,500 salary, open the calculator result you use in its own tab, and report one estimated take-home amount for a single pay period. 
In the end, give me a short summary with the HMRC reportable income categories, the Reed annual income tax amount, the single-period net pay estimate, and a quick conclusion saying whether a side hustle in one of those HMRC categories would likely need reporting in addition to my salary.","website":"https://www.google.com","level":"easy","reference_length":4,"categories":["Finance > Finance - Other","Law and Government > Government"],"precomputed_rubric":{"items":[{"criterion":"Identify the reportable side-hustle income categories from the official HMRC side-hustle guidance and connect them to common examples such as reselling and delivery or gig/service work.","description":"Identify the reportable side-hustle income categories from the official HMRC side-hustle guidance and connect them to common examples such as reselling and delivery or gig/service work.\n\nHow a grader verifies this: Grader can confirm the HMRC guidance page is open on taxhelpforhustlers.campaign.gov.uk and that the response reflects categories visible on that page, with examples mapped to those categories.","max_points":400},{"criterion":"Use the Reed UK tax calculator to report the annual income tax due for an annual salary of £55,500.","description":"Use the Reed UK tax calculator to report the annual income tax due for an annual salary of £55,500.\n\nHow a grader verifies this: Grader can confirm the Reed calculator results page shows salary input or results corresponding to £55,500 and that the reported annual income tax matches the visible result.","max_points":300},{"criterion":"Use a UK paycheck or salary calculator found via Google to report one estimated net take-home amount for a single pay period for a £55,500 salary.","description":"Use a UK paycheck or salary calculator found via Google to report one estimated net take-home amount for a single pay period for a £55,500 salary.\n\nHow a grader verifies this: Grader can confirm Google was used to reach a calculator, the calculator 
page is open in its own tab, and the reported net pay amount corresponds to a visible single pay-period result for £55,500.","max_points":200},{"criterion":"Provide a short conclusion that ties the HMRC categories to the salary baseline and states whether a side hustle in one of those categories would likely need reporting in addition to the £55,500 salary.","description":"Provide a short conclusion that ties the HMRC categories to the salary baseline and states whether a side hustle in one of those categories would likely need reporting in addition to the £55,500 salary.\n\nHow a grader verifies this: Grader can confirm the conclusion explicitly references at least one HMRC category from step 1 and correctly frames reporting as additional to the existing salary context.","max_points":100}]}} +{"task_id":"55bd922b66b05bcf4cbf5383333df59acce2ab32","confirmed_task":"I’m putting together a small-form-factor AM5 build and I want to base it on the ASUS ROG Strix B650E-I Gaming WiFi, so please start on the official ASUS site and open the exact product page for that motherboard, then leave it open in its own tab so I can glance at the specs and photos later. After that, use Google to find a community-made AM5 motherboard tier list from a forum, spreadsheet, Reddit post, or similar enthusiast source, open the actual tier list page, and check where the ASUS ROG Strix B650E-I Gaming WiFi shows up so I can sanity-check whether this is still considered a solid pick. If the board looks reasonable there, go to Amazon and find the product page for the Thermalright Peerless Assassin CPU cooler as a possible pairing for the build, and leave that open too so I can compare it visually with the motherboard tab. 
In the end, send me the three direct links and a short note saying what tier or listing the ASUS board got and whether that makes it seem like a sensible choice for this build.","website":"https://www.google.com","level":"easy","reference_length":4,"categories":["Computers Electronics and Technology > Computer Hardware","Games > Video Games Consoles and Accessories"],"precomputed_rubric":{"items":[{"criterion":"Provide the direct official ASUS product page URL for the exact ASUS ROG Strix B650E-I Gaming WiFi motherboard.","description":"Provide the direct official ASUS product page URL for the exact ASUS ROG Strix B650E-I Gaming WiFi motherboard.\n\nHow a grader verifies this: Grader can confirm the URL is on rog.asus.com and the visible page title/model name exactly matches ASUS ROG Strix B650E-I Gaming WiFi.","max_points":300},{"criterion":"Provide a direct URL to a community-created AM5 motherboard tier list page found via Google.","description":"Provide a direct URL to a community-created AM5 motherboard tier list page found via Google.\n\nHow a grader verifies this: Grader can confirm the link is not a generic Google results page but the actual community tier list source page, visible in its own tab.","max_points":200},{"criterion":"Correctly report where the ASUS ROG Strix B650E-I Gaming WiFi appears on the community AM5 motherboard tier list, or clearly state if it does not appear, with a brief note on whether that supports it as a reasonable pick.","description":"Correctly report where the ASUS ROG Strix B650E-I Gaming WiFi appears on the community AM5 motherboard tier list, or clearly state if it does not appear, with a brief note on whether that supports it as a reasonable pick.\n\nHow a grader verifies this: Grader can compare the final note against the opened tier list page and confirm the board’s placement/category or absence is accurately described.","max_points":300},{"criterion":"Provide the direct Amazon product page URL for the Thermalright 
Peerless Assassin CPU cooler.","description":"Provide the direct Amazon product page URL for the Thermalright Peerless Assassin CPU cooler.\n\nHow a grader verifies this: Grader can confirm the URL is an Amazon product listing and the visible product title identifies a Thermalright Peerless Assassin CPU cooler.","max_points":200}]}} +{"task_id":"1d3952479bc687cb5b04e14930533493f959dbe5","confirmed_task":"I’m trying to put together a Christmas gift package that I can actually mail without overthinking it, so could you start on Etsy and look through the Christmas ornaments area for one personalized family Christmas ornament listing that feels like a genuinely giftable idea, ideally something customized with family names or a year, and open the actual listing so I can see the photos and the price rather than just a search result. Once you’ve got that ornament style in mind, switch to Amazon and find one marble cheese board that would pair nicely with it in the same holiday package, because I want a second physical gift that feels festive and easy to wrap; please open the product page in its own tab and note the product name plus the star rating and review count if Amazon shows them. Then go to the USPS online store and find one Forever stamp product that’s currently for sale so I know what I could use for mailing a holiday card with the package, and leave that product page open too so I can verify it myself. 
At the end, give me a short summary with the Etsy ornament name and price, the Amazon cheese board name with rating and/or review count, and the USPS Forever stamp product name.","website":"https://www.google.com","level":"easy","reference_length":4,"categories":["Ecommerce & Shopping > Ecommerce and Shopping - Other","Community and Society > Holidays and Seasonal Events","Lifestyle > Gifts and Flowers"],"precomputed_rubric":{"items":[{"criterion":"Identify one Etsy personalized family Christmas ornament from an actual listing page and report its product name and price.","description":"Identify one Etsy personalized family Christmas ornament from an actual listing page and report its product name and price.\n\nHow a grader verifies this: Grader can confirm an Etsy listing page for a family Christmas ornament is open or was visited, with visible product title and price matching the reported ornament.","max_points":350},{"criterion":"Select one Amazon marble cheese board suitable as a complementary Christmas gift and report the product name plus star rating and review count if shown.","description":"Select one Amazon marble cheese board suitable as a complementary Christmas gift and report the product name plus star rating and review count if shown.\n\nHow a grader verifies this: Grader can confirm an Amazon product page for a marble cheese board is open in its own tab or was visited, with visible product title and rating/review information matching the report.","max_points":300},{"criterion":"Provide the name of one Forever stamp product currently for sale on the USPS online store from a product page.","description":"Provide the name of one Forever stamp product currently for sale on the USPS online store from a product page.\n\nHow a grader verifies this: Grader can confirm a USPS store product page for Forever stamps is open or was visited, with a visible product name matching the reported stamp.","max_points":200},{"criterion":"Give a concise final summary that 
includes the Etsy ornament name and price, the Amazon cheese board name with rating and/or review count, and the USPS Forever stamp product name.","description":"Give a concise final summary that includes the Etsy ornament name and price, the Amazon cheese board name with rating and/or review count, and the USPS Forever stamp product name.\n\nHow a grader verifies this: Grader can compare the final response against the details gathered from the Etsy, Amazon, and USPS pages and confirm all three items are included concisely.","max_points":150}]}} +{"task_id":"890a6880049a42684ac91a2e1809442846f9394c","confirmed_task":"I’m thinking about doing a simple public-transit outing from downtown Chicago to Aurora, Illinois soon, and I want a practical snapshot I can actually look at in the browser. On Google, please search for the recommended public-transit route from Chicago to Aurora, Illinois and open the actual transit directions so you can tell me the main mode I’d be taking and the estimated total travel time; leave that directions page open so I can review the route myself. Since I may need a little context for rail connections and fare structure, then go to Metra’s BNSF line page and summarize what the BNSF line is, where it runs, and where on that page or site fare information is listed; if there’s a separate fares link, open that in another tab so I have a visual reference. After that, head to Weather.com and pull up the local 10-day forecast for Aurora, Illinois, then give me the high and low temperatures for the next 3 days so I can judge whether the trip will feel comfortable. 
Please send everything back as one concise trip-planning summary, and keep the Google transit directions tab and the relevant Metra page open for comparison.","website":"https://www.google.com","level":"easy","reference_length":5,"categories":["Travel and Tourism > Ground Transportation","Travel and Tourism > Tourist Attractions"],"precomputed_rubric":{"items":[{"criterion":"Find a recommended public-transit route from Chicago, Illinois to Aurora, Illinois on Google and report the main mode of transit and estimated total travel time.","description":"Find a recommended public-transit route from Chicago, Illinois to Aurora, Illinois on Google and report the main mode of transit and estimated total travel time.\n\nHow a grader verifies this: Grader can confirm a Google transit directions/results page is open and the response includes a specific main mode and travel-time estimate taken from that page.","max_points":300},{"criterion":"Summarize Metra’s BNSF line by stating what it is and where it runs.","description":"Summarize Metra’s BNSF line by stating what it is and where it runs.\n\nHow a grader verifies this: Grader can confirm the Metra BNSF line page is open and the response accurately describes the line and its route coverage based on visible page text.","max_points":200},{"criterion":"Identify where fare information for riding the BNSF line is listed, including any relevant fares link or section.","description":"Identify where fare information for riding the BNSF line is listed, including any relevant fares link or section.\n\nHow a grader verifies this: Grader can confirm the response points to a visible fare-information location on the BNSF line page or an opened fares tab/page on metra.com.","max_points":150},{"criterion":"Report the forecasted high and low temperatures for the next 3 days from Weather.com’s 10-day forecast for Aurora, Illinois.","description":"Report the forecasted high and low temperatures for the next 3 days from Weather.com’s 10-day 
forecast for Aurora, Illinois.\n\nHow a grader verifies this: Grader can confirm a Weather.com 10-day forecast page for Aurora is open and the response includes three day-by-day high/low pairs matching the visible forecast.","max_points":200},{"criterion":"Return all findings as one concise trip-planning summary that combines the Google transit recommendation, BNSF line context, fare-info location, and 3-day weather outlook.","description":"Return all findings as one concise trip-planning summary that combines the Google transit recommendation, BNSF line context, fare-info location, and 3-day weather outlook.\n\nHow a grader verifies this: Grader can confirm the final response is a single concise integrated summary containing all required elements from steps 1 through 3.","max_points":150}]}} +{"task_id":"f979f723a3f6f65ea8d75903425f22c67505daf1","confirmed_task":"I’m trying to put together a Christmas gift built around Pokémon cards, but I want a realistic backup plan in case the main item is sold out. Please start on Collector Store and look up the Pokémon Phantasmal Flames Booster Elite Trainer Box, then open the actual product page and check whether it says it’s in stock or sold out, and note the listed price so I know if the original idea is still viable. After that, go to Best Buy and search for Pokémon trading cards or Pokémon card gift items and pick one gift option that looks like a reasonable substitute, making sure to open the product page so I can see the listing myself and leaving that tab open as a reference. Then use that same general idea on Walgreens by searching for Pokémon trading cards and finding two available options with prices, opening each Walgreens product in its own tab so I can compare them side by side and verify they’re actually live listings. 
In the end, give me a short backup-plan summary with the Collector Store stock status and price, the Best Buy product name and price, and the two Walgreens options with their prices.","website":"https://www.google.com","level":"easy","reference_length":4,"categories":["Ecommerce & Shopping > Ecommerce and Shopping - Other","Games > Games - Other","Community and Society > Holidays and Seasonal Events"],"precomputed_rubric":{"items":[{"criterion":"Find the Collector Store product page for the Pokémon Phantasmal Flames Booster Elite Trainer Box and capture its stock status and listed price.","description":"Find the Collector Store product page for the Pokémon Phantasmal Flames Booster Elite Trainer Box and capture its stock status and listed price.\n\nHow a grader verifies this: Grader can confirm the browser is on the Collector Store product page showing the product title plus a visible in-stock or sold-out indicator and a price.","max_points":350},{"criterion":"Identify one Pokémon card gift option on Best Buy and record its product name and price from the product page.","description":"Identify one Pokémon card gift option on Best Buy and record its product name and price from the product page.\n\nHow a grader verifies this: Grader can confirm a Best Buy product page is open with a Pokémon card-related item, and the visible page shows the product name and price.","max_points":200},{"criterion":"Find two available Pokémon trading card options on Walgreens and record the product name and price for each from live product pages.","description":"Find two available Pokémon trading card options on Walgreens and record the product name and price for each from live product pages.\n\nHow a grader verifies this: Grader can confirm two Walgreens product tabs are open, each showing a Pokémon trading card listing with visible product names and prices.","max_points":300},{"criterion":"Provide a short summary that combines the Collector Store stock status and price, the Best Buy 
backup option with price, and the two Walgreens options with prices as a practical gift backup plan.","description":"Provide a short summary that combines the Collector Store stock status and price, the Best Buy backup option with price, and the two Walgreens options with prices as a practical gift backup plan.\n\nHow a grader verifies this: Grader can compare the final written summary against the visible product pages and confirm all requested items and prices are included accurately.","max_points":150}]}} +{"task_id":"7cc602f239775882273921d82e181020f1769b53","confirmed_task":"I’m trying to put together a simple evening-event outfit that I can actually order in the U.S., so could you start on Theory and find me one green dress that looks dressy enough for an evening plan and is available for U.S. delivery, then open the actual product page so I can see the photos and note the product name and price? Once you’ve got that dress, use the color and overall vibe as your reference point and go to a shoe site to find a matching pair of women’s shoes in size 9.5 that would work for the same outfit, and open that product in its own tab too so I can compare the two pages side by side; please note the shoe name and price. After that, check ooShirts because I may also need a simple custom group shirt order for the event, and confirm whether they offer no-minimum print-on-demand orders shipped within the United States, including whatever turnaround or shipping timing they state on the site. 
At the end, give me one concise summary with the Theory dress details, the matching shoes details, and the ooShirts no-minimum plus shipping/timing answer, and leave the dress and shoe product pages open for me.","website":"https://www.google.com","level":"easy","reference_length":4,"categories":["Ecommerce & Shopping > Ecommerce and Shopping - Other","Lifestyle > Fashion and Apparel"],"precomputed_rubric":{"items":[{"criterion":"Identify one green dress on Theory that is available for U.S. delivery and provide its product name and price.","description":"Identify one green dress on Theory that is available for U.S. delivery and provide its product name and price.\n\nHow a grader verifies this: Grader can confirm a Theory product page is open showing a green dress with visible product name and price, along with page indicators that support U.S. shopping or delivery availability.","max_points":350},{"criterion":"Select one matching pair of women’s shoes in size 9.5 that fits the dress’s color and dressy evening vibe, and provide the product name and price.","description":"Select one matching pair of women’s shoes in size 9.5 that fits the dress’s color and dressy evening vibe, and provide the product name and price.\n\nHow a grader verifies this: Grader can confirm a shoe product page is open in a separate tab showing women’s shoes, visible size 9.5 availability or selectable size, and visible product name and price.","max_points":300},{"criterion":"Determine whether ooShirts offers no-minimum print-on-demand orders shipped within the United States, including the stated turnaround or shipping timing.","description":"Determine whether ooShirts offers no-minimum print-on-demand orders shipped within the United States, including the stated turnaround or shipping timing.\n\nHow a grader verifies this: Grader can confirm ooShirts pages show the minimum-order policy and visible text about turnaround, production, or shipping timing for U.S. 
orders.","max_points":200},{"criterion":"Return the dress details, matching shoes details, and the ooShirts no-minimum and timing answer together in one concise summary, while leaving the dress and shoe product pages open.","description":"Return the dress details, matching shoes details, and the ooShirts no-minimum and timing answer together in one concise summary, while leaving the dress and shoe product pages open.\n\nHow a grader verifies this: Grader can confirm the final response includes all requested details in a concise combined summary and that the dress and shoe tabs remain open.","max_points":150}]}} +{"task_id":"8e639395e157ea4df2747a4a873b5f610d70d180","confirmed_task":"I want to put together a really easy beginner sketch reference pack for a casual drawing session, and I’d like you to grab the pieces in the browser so I can look at them afterward. First, on Pinterest, find one pin with a genuinely useful person drawing reference photo for figure practice—something clear enough that a beginner could sketch from—and open the actual pin page, then keep that tab open and save the pin link for me. Once you’ve got that figure reference, go to Google Images and search for a basic one-room hut from the 1500s in an English style so I can use it as a simple background setting; open the image result or source page that looks most believable and leave that open too so I can see the picture itself. After that, go to Bored Panda and open their collection of angry cat photos as a fun mood reference for the character expression, and tell me the exact page title while keeping that page open as well. 
In the end, send me the Pinterest pin link, the hut image or source page link, and the Bored Panda page title, plus a short note on how the figure, the hut, and the angry-cat mood could all work together in one simple sketch idea.","website":"https://www.google.com","level":"easy","reference_length":4,"categories":["Hobbies and Leisure > Hobbies and Leisure - Other","Arts & Entertainment > Visual Arts and Design"],"precomputed_rubric":{"items":[{"criterion":"Provide one Pinterest pin URL that leads to a useful person drawing reference photo suitable for figure practice.","description":"Provide one Pinterest pin URL that leads to a useful person drawing reference photo suitable for figure practice.\n\nHow a grader verifies this: Grader can confirm the browser is on an actual Pinterest pin page showing a person reference image and that the returned link matches that open pin.","max_points":350},{"criterion":"Provide one Google Images result or source page URL showing a basic one-room hut from the 1500s in an English style.","description":"Provide one Google Images result or source page URL showing a basic one-room hut from the 1500s in an English style.\n\nHow a grader verifies this: Grader can confirm an image result or source page is open from Google Images and visually depicts a simple hut consistent with the requested historical English-style setting.","max_points":300},{"criterion":"Provide the exact title of the Bored Panda collection featuring angry cat photos shared by their owners.","description":"Provide the exact title of the Bored Panda collection featuring angry cat photos shared by their owners.\n\nHow a grader verifies this: Grader can confirm the Bored Panda page is open and the reported title matches the visible page title/header.","max_points":200},{"criterion":"Include a brief note explaining how the person reference, hut reference, and angry cat mood inspiration could fit together into one beginner-friendly sketch concept.","description":"Include 
a brief note explaining how the person reference, hut reference, and angry cat mood inspiration could fit together into one beginner-friendly sketch concept.\n\nHow a grader verifies this: Grader can confirm the final note meaningfully references all three selected sources and combines them into a coherent sketch idea.","max_points":150}]}} +{"task_id":"82af9358ab6a0421057340a0c038498348f2b3ec","confirmed_task":"I’m trying to get a handle on my UK telecom budget and want a real-world baseline I can actually look at in the browser. Please start on MoneySavingExpert’s Cheap Mobile Finder and set it to SIM-only deals with unlimited minutes, unlimited texts, and at least 10GB of data, then sort out the three cheapest options you can see in ascending monthly price order and note the provider, monthly price, contract length, and data allowance for each so I can compare what the low end of the market looks like. Once you’ve got that shortlist, open Vodafone UK in another tab and figure out what the “Xtra 40” part of one of their broadband plan names actually means, because I want to know whether that’s describing the broadband speed tier or some extra bundle feature; please leave the relevant Vodafone page open so I can see the wording myself. After that, go to PrintPigeon and find the service where I could upload a PDF and have it printed and mailed as a letter, and tell me the service name, the starting price, and the exact order page where the process begins so I could send myself a one-page note with the SIM comparison and the Vodafone explanation. 
If possible, keep the MoneySavingExpert results tab and the PrintPigeon order page open in separate tabs so I can visually compare them afterward.","website":"https://www.google.com","level":"easy","reference_length":6,"categories":["Computers Electronics and Technology > Telecommunications","Finance > Finance - Other"],"precomputed_rubric":{"items":[{"criterion":"The MoneySavingExpert Cheap Mobile Finder is used with filters that clearly match SIM-only, unlimited minutes, unlimited texts, and at least 10GB data.","description":"The MoneySavingExpert Cheap Mobile Finder is used with filters that clearly match SIM-only, unlimited minutes, unlimited texts, and at least 10GB data.\n\nHow a grader verifies this: Grader can confirm the visible filtered results page on MoneySavingExpert shows qualifying SIM-only deals consistent with those constraints.","max_points":220},{"criterion":"The three cheapest qualifying SIM-only deals are listed in ascending monthly price order.","description":"The three cheapest qualifying SIM-only deals are listed in ascending monthly price order.\n\nHow a grader verifies this: Grader compares the reported three deals against the visible ordering on the MoneySavingExpert results page and confirms they are the cheapest qualifying options shown.","max_points":240},{"criterion":"For each of the three MoneySavingExpert deals, the provider, monthly price, contract length, and data allowance are included accurately.","description":"For each of the three MoneySavingExpert deals, the provider, monthly price, contract length, and data allowance are included accurately.\n\nHow a grader verifies this: Grader checks each reported field against the corresponding visible deal cards or listing details on the MoneySavingExpert results page.","max_points":180},{"criterion":"The explanation of what Vodafone broadband label “Xtra 40” refers to is correct and based on the Vodafone page.","description":"The explanation of what Vodafone broadband label “Xtra 40” 
refers to is correct and based on the Vodafone page.\n\nHow a grader verifies this: Grader confirms the open Vodafone page contains wording showing whether “Xtra 40” denotes a speed tier or another plan attribute, and the response matches that meaning in one sentence.","max_points":160},{"criterion":"On PrintPigeon, a service is identified that allows a user to upload or attach a PDF and have it printed and mailed as a letter.","description":"On PrintPigeon, a service is identified that allows a user to upload or attach a PDF and have it printed and mailed as a letter.\n\nHow a grader verifies this: Grader verifies on the open PrintPigeon page that the named service is for sending a printed letter from an uploaded document or PDF.","max_points":100},{"criterion":"The PrintPigeon result includes the service name, starting price, and the specific order/start page where the mailing process begins.","description":"The PrintPigeon result includes the service name, starting price, and the specific order/start page where the mailing process begins.\n\nHow a grader verifies this: Grader checks that the reported service name and starting price match the visible PrintPigeon page and that the provided page is the actual order or start page for initiating the mailing.","max_points":100}]}} +{"task_id":"63d68bb25e279fc22e6e3592d8ca59add33b6eb1","confirmed_task":"I’m trying to buy a family car in the Minneapolis–St. Paul area and want a solid shortlist I can actually look through later, so please use Cars.com to search near Minneapolis, Minnesota for family-friendly vehicles priced at $50,000 or less. I’m shopping for a family of four, so focus on practical options like midsize SUVs, crossovers, minivans, or other vehicles that clearly make sense for everyday family use. 
As you go through the results, open each promising listing in its own tab and make sure the actual vehicle page shows the price, mileage, model year, and dealership location, because I want every option to be something I can visually inspect afterward. Please gather about 15 listings that are still live and under budget, and if a listing doesn’t make the seating or family suitability obvious, cross-check that exact vehicle on Edmunds so we can confirm the body style or seating before keeping it. Once you’ve got the set, create a CryptPad Sheet called Minneapolis Family Cars with columns for make and model, model year, price, mileage, dealership location, and link to listing, and fill it in so each row matches one of the listing tabs you still have open. After that, give me a short summary of which models show up most often, which ones seem like the best value based on year, mileage, and price, and whether the final mix is mostly SUVs or minivans. Please leave the vehicle tabs and the finished spreadsheet open so I can compare everything on screen.","website":"https://www.google.com","level":"medium","reference_length":8,"categories":["Vehicles > Makes and Models","Ecommerce & Shopping > Ecommerce and Shopping - Other"],"precomputed_rubric":{"items":[{"criterion":"Cars.com is used to search near Minneapolis, Minnesota with the price constrained to $50,000 or less and the results focused on family-suitable vehicle types.","description":"Cars.com is used to search near Minneapolis, Minnesota with the price constrained to $50,000 or less and the results focused on family-suitable vehicle types.\n\nHow a grader verifies this: Grader can see a Cars.com results page showing Minneapolis-area search context and a max-price filter at or below $50,000 with relevant family vehicle results visible.","max_points":120},{"criterion":"About 15 promising family-appropriate vehicle listings are opened in separate browser tabs from the Cars.com results.","description":"About 15 
promising family-appropriate vehicle listings are opened in separate browser tabs from the Cars.com results.\n\nHow a grader verifies this: Grader can see multiple open vehicle listing tabs, approximately 15 in total, each corresponding to a distinct candidate vehicle page.","max_points":120},{"criterion":"Each selected listing is individually verified for make/model, year, price, mileage, dealership location, and that it is still a live listing under budget and suitable for a family of four.","description":"Each selected listing is individually verified for make/model, year, price, mileage, dealership location, and that it is still a live listing under budget and suitable for a family of four.\n\nHow a grader verifies this: Grader can inspect the open listing pages and confirm the required details are visible and match the final recorded set, with no included vehicle over $50,000 or obviously unsuitable.","max_points":200},{"criterion":"Any unclear seating or body-style cases are cross-checked on Edmunds before inclusion in the final shortlist.","description":"Any unclear seating or body-style cases are cross-checked on Edmunds before inclusion in the final shortlist.\n\nHow a grader verifies this: Grader can see Edmunds pages or evidence of cross-checking for ambiguous vehicles, confirming family suitability or body style for those cases.","max_points":80},{"criterion":"A CryptPad Sheets titled Minneapolis Family Cars is created.","description":"A CryptPad Sheets titled Minneapolis Family Cars is created.\n\nHow a grader verifies this: Grader can see the CryptPad Sheets title displayed as Minneapolis Family Cars.","max_points":80},{"criterion":"The spreadsheet contains the required columns: make and model, model year, price, mileage, dealership location, and link to listing.","description":"The spreadsheet contains the required columns: make and model, model year, price, mileage, dealership location, and link to listing.\n\nHow a grader verifies this: Grader can 
see the header row in CryptPad Sheets with all required column names present.","max_points":80},{"criterion":"The spreadsheet is populated with about 15 verified vehicle rows, each matching a currently open listing tab and including a working listing URL.","description":"The spreadsheet is populated with about 15 verified vehicle rows, each matching a currently open listing tab and including a working listing URL.\n\nHow a grader verifies this: Grader can compare the rows in the sheet against the open tabs and confirm that approximately 15 entries are present with corresponding links and extracted details.","max_points":220},{"criterion":"A short summary is produced covering the most frequent models, the best-value options, and whether SUVs or minivans dominate the final set, while leaving the sheet and listing tabs open.","description":"A short summary is produced covering the most frequent models, the best-value options, and whether SUVs or minivans dominate the final set, while leaving the sheet and listing tabs open.\n\nHow a grader verifies this: Grader can review the final written summary and confirm the spreadsheet and vehicle tabs remain open for visual inspection.","max_points":100}]}} +{"task_id":"53419597c0c8897d49f1af65f5255bf265edcfbf","confirmed_task":"I’m a Canadian citizen living in Pittsburgh, PA, and my passport expires in about 3 months, so I’m trying to get everything sorted before a 2-week tourist trip to Japan. Could you start on the official Government of Canada site and find the passport renewal process for a Canadian living in the U.S., including the exact renewal form I’d need, the supporting documents, photo rules, whether I need a guarantor or references, the fee in CAD, how I’m supposed to submit it from the U.S., and the current processing time, because I need to know if this is realistic before I book anything. 
Once you have that, use the official Canadian embassy/consulate pages to figure out which Canadian mission is closest to Pittsburgh, Pennsylvania 15222 that handles passport services, and open the actual office page so I can see the address, passport service hours, and whether I need an appointment or have to use some booking request process; please leave that page open. After that, check Japan’s official Ministry of Foreign Affairs site to confirm whether a Canadian passport holder going to Japan for tourism for 2 weeks needs a visa, and note any conditions or exceptions that matter. Then go to the Government of Canada travel advisory page for Japan and tell me the current advisory level plus any highlighted health, safety, or entry notes, and keep that advisory page open in another tab so I can look at it myself. Finally, compare travel insurance options on PolicyAdvisor.com and Kanetix.ca for this situation: a Canadian citizen currently living in the U.S. who wants coverage connected to travel to Japan, and I mainly want to see whether either site shows plans that would actually work for someone based in the U.S. rather than Canada, so please capture provider names, medical emergency coverage, trip cancellation/interruption if shown, and any residency or eligibility restrictions. If either site has useful quote or results pages, open the most relevant options in separate tabs so I can compare them visually. 
At the end, give me a concise summary that ties all of this together and clearly points out any uncertainty, especially around insurance eligibility for a Canadian living in the U.S.","website":"https://www.google.com","level":"medium","reference_length":8,"categories":["Law and Government > Government","Law and Government > Immigration and Visas","Travel and Tourism > Travel and Tourism - Other"],"precomputed_rubric":{"items":[{"criterion":"Correctly identifies the official Government of Canada passport renewal process for a Canadian living abroad in the U.S., including the correct form and main supporting requirements.","description":"Correctly identifies the official Government of Canada passport renewal process for a Canadian living abroad in the U.S., including the correct form and main supporting requirements.\n\nHow a grader verifies this: Grader confirms the final answer references the official Canada passport renewal abroad page and includes the renewal form plus required documents and procedural requirements visible on that page.","max_points":180},{"criterion":"Accurately reports passport renewal fees, submission method from the U.S., and current processing times from the official Canadian source.","description":"Accurately reports passport renewal fees, submission method from the U.S., and current processing times from the official Canadian source.\n\nHow a grader verifies this: Grader checks that the reported fee, submission path, and processing time match the official Canada.ca content viewed during the task.","max_points":120},{"criterion":"Finds the nearest relevant Canadian mission to Pittsburgh and captures its passport service hours and appointment or booking instructions, with the office page opened for visual proof.","description":"Finds the nearest relevant Canadian mission to Pittsburgh and captures its passport service hours and appointment or booking instructions, with the office page opened for visual proof.\n\nHow a grader verifies this: 
Grader confirms the selected mission is plausibly nearest to Pittsburgh, and the open mission page visibly shows the office identity plus service hours and booking or appointment details.","max_points":140},{"criterion":"Correctly determines whether a Canadian passport holder needs a tourist visa for a 2-week trip to Japan using the official MOFA site.","description":"Correctly determines whether a Canadian passport holder needs a tourist visa for a 2-week trip to Japan using the official MOFA site.\n\nHow a grader verifies this: Grader checks that the answer matches the visa status and any relevant conditions shown on the MOFA page for Canadian travelers.","max_points":120},{"criterion":"Accurately reports the current Canadian government travel advisory level for Japan and at least one notable advisory detail, with the advisory page left open.","description":"Accurately reports the current Canadian government travel advisory level for Japan and at least one notable advisory detail, with the advisory page left open.\n\nHow a grader verifies this: Grader confirms the advisory level and detail match the visible travel.gc.ca Japan advisory page left open in a tab.","max_points":100},{"criterion":"Collects meaningful travel insurance comparison information from PolicyAdvisor relevant to travel involving Japan and specifically notes any residency or eligibility constraints for someone living in the U.S.","description":"Collects meaningful travel insurance comparison information from PolicyAdvisor relevant to travel involving Japan and specifically notes any residency or eligibility constraints for someone living in the U.S.\n\nHow a grader verifies this: Grader checks that at least one relevant PolicyAdvisor result or quote page was reached and that the summary includes provider/plan details plus eligibility or residency limitations visible on the site.","max_points":120},{"criterion":"Collects meaningful travel insurance comparison information from Kanetix relevant to 
travel involving Japan and specifically notes any residency or eligibility constraints for someone living in the U.S.","description":"Collects meaningful travel insurance comparison information from Kanetix relevant to travel involving Japan and specifically notes any residency or eligibility constraints for someone living in the U.S.\n\nHow a grader verifies this: Grader checks that at least one relevant Kanetix result or quote page was reached and that the summary includes provider/plan details plus visible eligibility or residency limitations.","max_points":120},{"criterion":"Produces a coherent final synthesis that integrates all official findings and compares the two insurance sources while clearly flagging uncertainty or limitations.","description":"Produces a coherent final synthesis that integrates all official findings and compares the two insurance sources while clearly flagging uncertainty or limitations.\n\nHow a grader verifies this: Grader confirms the final response includes all required sections and a side-by-side insurance comparison with explicit notes about uncertainty, especially for a Canadian resident in the U.S.","max_points":100}]}} +{"task_id":"8fcdeed84a0deb05342b07c26116792a5b6a6a3f","confirmed_task":"I’m relocating to Austin in about two months for a new job near the Domain, so I want help narrowing down apartments that would actually work for day-to-day life without blowing my budget. Please start on Zillow and search the Domain/North Austin area for 1-bedroom apartments under $1,800 a month, and filter for places that have both in-unit washer/dryer and a pool because those are my non-negotiables. Open the three best-looking Zillow listings in separate tabs so I can compare the photos, map placement, and amenity details, and pull out the apartment name, full address, rent, and a couple of listing highlights from each. 
Then do the same search on Apartments.com with the same budget and amenity filters, again opening the three strongest options in their own tabs so I can visually compare them and note the same details. Once you’ve got both sets, compare the six options, remove duplicates if the same property shows up on both sites, and tell me which apartments seem like the best overall fit based on price, amenities, and location near the Domain. After that, use CapMetro’s site and map tools to check whether each shortlisted apartment is near a MetroRail stop or has a practical bus connection into the Domain area, because I want to know whether I could commute without driving every day; if the map view helps, pull it up and keep the most useful transit page open. Then look up the neighborhoods for those apartments on Niche so I can get a feel for what living there would be like, especially safety ratings, walkability info if it’s shown, and whether there are grocery stores nearby for basic errands. Finally, go to the Texas Attorney General website and find the renters’ rights guidance that matters most before signing a lease in Texas, especially anything about deposits, repairs, fees, disclosures, and ending a lease, and leave that page open too so I can read it myself later. 
In the end, give me one clean apartment-hunting brief that combines the listing comparison, transit practicality to the Domain, neighborhood pros and cons, and a short lease-review checklist I can use when I start contacting properties.","website":"https://www.google.com","level":"medium","reference_length":7,"categories":["Business and Consumer Services > Real Estate"],"precomputed_rubric":{"items":[{"criterion":"Identify 3 Zillow apartment listings in the Domain/North Austin area that satisfy all stated filters: 1 bedroom, under $1,800/month, in-unit washer/dryer, and pool.","description":"Identify 3 Zillow apartment listings in the Domain/North Austin area that satisfy all stated filters: 1 bedroom, under $1,800/month, in-unit washer/dryer, and pool.\n\nHow a grader verifies this: Grader can confirm Zillow search results and/or open listing tabs show the applied filters and that 3 qualifying listings were opened with visible listing pages.","max_points":160},{"criterion":"Identify 3 Apartments.com apartment listings in the Domain/North Austin area that satisfy all stated filters: 1 bedroom, under $1,800/month, in-unit washer/dryer, and pool.","description":"Identify 3 Apartments.com apartment listings in the Domain/North Austin area that satisfy all stated filters: 1 bedroom, under $1,800/month, in-unit washer/dryer, and pool.\n\nHow a grader verifies this: Grader can confirm Apartments.com search results and/or open listing tabs show the applied filters and that 3 qualifying listings were opened with visible listing pages.","max_points":160},{"criterion":"Create a deduplicated comparison of the Zillow and Apartments.com options using listing-level details such as price, amenities, address/location, and overall fit.","description":"Create a deduplicated comparison of the Zillow and Apartments.com options using listing-level details such as price, amenities, address/location, and overall fit.\n\nHow a grader verifies this: Grader can verify the comparison 
references the listings gathered from both sites, removes overlaps where the same property appears twice, and ranks or summarizes the best overall options.","max_points":180},{"criterion":"Assess transit access for each shortlisted apartment using CapMetro, specifically whether it is near a MetroRail station or has a practical bus route connection to the Domain area.","description":"Assess transit access for each shortlisted apartment using CapMetro, specifically whether it is near a MetroRail station or has a practical bus route connection to the Domain area.\n\nHow a grader verifies this: Grader can confirm CapMetro pages or map views were used and that each shortlisted apartment has associated station or route information tied to Domain access.","max_points":160},{"criterion":"Provide neighborhood research from Niche for the shortlisted apartment areas, including safety ratings, walkability information if available, and nearby grocery store options.","description":"Provide neighborhood research from Niche for the shortlisted apartment areas, including safety ratings, walkability information if available, and nearby grocery store options.\n\nHow a grader verifies this: Grader can verify Niche neighborhood pages were consulted and that each shortlisted area includes the requested neighborhood details.","max_points":140},{"criterion":"Summarize key Texas renters’ rights guidance from the Texas Attorney General website relevant to lease review, including deposits, repairs, fees, disclosures, and termination-related issues.","description":"Summarize key Texas renters’ rights guidance from the Texas Attorney General website relevant to lease review, including deposits, repairs, fees, disclosures, and termination-related issues.\n\nHow a grader verifies this: Grader can confirm the Texas Attorney General page was opened and the summary reflects topics visibly covered on the official guidance page.","max_points":100},{"criterion":"Deliver a final integrated 
apartment-hunting brief that combines apartment comparison, transit suitability, neighborhood findings, and renters’ rights guidance into a usable decision aid.","description":"Deliver a final integrated apartment-hunting brief that combines apartment comparison, transit suitability, neighborhood findings, and renters’ rights guidance into a usable decision aid.\n\nHow a grader verifies this: Grader can verify the final output includes all major sections, references the shortlisted apartments consistently, and synthesizes findings into a coherent recommendation or planning brief.","max_points":100}]}} +{"task_id":"6421b906fe97b3799960af31c77f20ff25f756b1","confirmed_task":"I’m putting together a quick graduate outreach brief for a student in San Diego and want a few very specific examples from different kinds of schools and programs. Please start on Lewis & Clark’s admissions site and find the admissions representative who covers San Diego, California, then open that actual regional contact page and capture the rep’s name, email, phone, and anything else listed, and leave the page open so I can visually confirm how they assign territories. Then go to Hunter College Silberman’s site and find the Fall 2026 application deadline for the MSW program, making sure you’re on the real admissions or application page where the date is shown. After that, use Indiana University Kokomo’s graduate programs page to list all of the graduate programs they show there, and keep that page open in its own tab so I can compare the breadth of options at a smaller campus. Since the student is especially interested in California opportunities, go to the UCSF clinical trials site and find at least two glioblastoma trials in Northern California that are currently recruiting, then open the actual trial pages in separate tabs so you can give me each trial’s recruiting status and study location and I can see that they’re still live. 
For a student-life example, use Google to get to the official Oral Roberts University page that identifies the student activities director, and pull the person’s name plus email and phone if available, making sure it comes from an ORU page rather than a directory aggregator. Finally, use Google to find the official EDF Energy graduate programme page that mentions whether new hires get salary reviews, and quote the exact wording from the page if it’s there. Please give me everything back as a compact sourced summary, but keep the key pages open in tabs so I can glance at the evidence.","website":"https://www.google.com","level":"medium","reference_length":7,"categories":["Science and Education > Universities and Colleges","Science and Education > Education"],"precomputed_rubric":{"items":[{"criterion":"Correctly identify the Lewis & Clark admissions representative covering San Diego, California, including the representative’s name and listed contact details.","description":"Correctly identify the Lewis & Clark admissions representative covering San Diego, California, including the representative’s name and listed contact details.\n\nHow a grader verifies this: Grader can confirm the open Lewis & Clark regional contact page shows San Diego, California assigned to the named representative and displays the extracted contact information.","max_points":170},{"criterion":"Report the Fall 2026 application deadline for Hunter College Silberman’s MSW program from the official page.","description":"Report the Fall 2026 application deadline for Hunter College Silberman’s MSW program from the official page.\n\nHow a grader verifies this: Grader can confirm the open Hunter Silberman admissions/application page visibly shows the Fall 2026 MSW deadline date reported in the summary.","max_points":140},{"criterion":"List the graduate programs shown on Indiana University Kokomo’s graduate programs page.","description":"List the graduate programs shown on Indiana University Kokomo’s 
graduate programs page.\n\nHow a grader verifies this: Grader can compare the returned program list against the visible program names on the open IU Kokomo graduate programs page.","max_points":140},{"criterion":"Identify at least two glioblastoma clinical trials in Northern California that are currently recruiting, including each trial’s recruiting status and study location.","description":"Identify at least two glioblastoma clinical trials in Northern California that are currently recruiting, including each trial’s recruiting status and study location.\n\nHow a grader verifies this: Grader can inspect the separate open UCSF trial tabs and verify that each named trial is glioblastoma-related, marked currently recruiting, and has the reported Northern California location.","max_points":220},{"criterion":"Find the Oral Roberts University student activities director’s name and contact information from an official ORU page.","description":"Find the Oral Roberts University student activities director’s name and contact information from an official ORU page.\n\nHow a grader verifies this: Grader can confirm the open ORU page names the student activities director and shows the extracted email and/or phone details.","max_points":130},{"criterion":"Determine whether EDF Energy’s graduate programme includes salary reviews for new hires and provide the exact confirming wording from the official page.","description":"Determine whether EDF Energy’s graduate programme includes salary reviews for new hires and provide the exact confirming wording from the official page.\n\nHow a grader verifies this: Grader can confirm the open EDF Energy page contains the quoted wording and that the answer matches the page text.","max_points":120},{"criterion":"Return a compact summary with sources covering all requested items.","description":"Return a compact summary with sources covering all requested items.\n\nHow a grader verifies this: Grader can verify the final response includes all six 
requested findings, each paired with a source reference or page title/link, in a concise summary format.","max_points":80}]}} +{"task_id":"295f11f4eebda80a7551944fd9b6f4e01db92666","confirmed_task":"I’m trying to build a fun gift shortlist for someone who’s into Sonic, anime figures, trading cards, and Pokémon, and I want it to feel grounded in real options I could actually buy. Please start on Amazon and search for “Sonic toys,” then sort the results from price low to high so we can see the cheapest ideas first, and grab the first three items shown with their prices as my budget baseline. After that, stay on Amazon and search for “Sonic the Hedgehog character toys,” ideally things like Tails, Knuckles, Shadow, or Amy, and open three promising options in separate tabs so I can compare them visually before you note their names and prices. Then switch over to Amazon and check for a Cars 2 Lightning McQueen figure and note its product name and item code, then look up a Cars Chick Hicks figurine so we have a reference listed price to compare - please leave those product pages open so I can look at the photos. After that, head to Costco and check gift-style product results for three specific items that would fit this person’s interests, such as a Pokémon item, a collectible-style item, or even something like a themed bench or display-worthy gift, and capture the product names and prices from the actual listings. Finally, use Pokellector to look at the newest Pokémon TCG sets and tell me the most recent set names shown there, making sure to open the page where the set images are visible so I can use that as a reference for current packs. 
In the end, give me a concise shortlist with prices or item codes where available, and explicitly compare the anime collectible figure option against the cheapest Sonic toy options you found first so I can tell whether the premium figure is worth it.","website":"https://www.google.com","level":"medium","reference_length":7,"categories":["Ecommerce & Shopping > Ecommerce and Shopping - Other","Games > Games - Other"],"precomputed_rubric":{"items":[{"criterion":"Amazon results for \"Sonic toys\" are sorted low to high and the first three visible items with prices are captured.","description":"Amazon results for \"Sonic toys\" are sorted low to high and the first three visible items with prices are captured.\n\nHow a grader verifies this: Grader can confirm the Amazon sort state is Price: Low to High and that three top visible result cards are listed with matching prices.","max_points":180},{"criterion":"Three Amazon Sonic character toy options are identified from a separate character-focused search, with names and prices, and opened in separate tabs.","description":"Three Amazon Sonic character toy options are identified from a separate character-focused search, with names and prices, and opened in separate tabs.\n\nHow a grader verifies this: Grader can confirm a search for Sonic character toys, see three relevant product tabs open, and match the recorded names and prices to those product pages.","max_points":160},{"criterion":"A Cars 2 Lightning McQueen figure is found on Amazon with the exact product name and item code recorded.","description":"A Cars 2 Lightning McQueen figure is found on Amazon with the exact product name and item code recorded.\n\nHow a grader verifies this: Grader can confirm the Amazon product page shows a Cars 2 Lightning McQueen figure and that the recorded product name and item code match the page.","max_points":170},{"criterion":"A Cars Chick Hicks figurine is found on Amazon with the listed price recorded, and the product page is left 
open for visual reference.","description":"A Cars Chick Hicks figurine is found on Amazon with the listed price recorded, and the product page is left open for visual reference.\n\nHow a grader verifies this: Grader can confirm the Amazon product page shows a Cars Chick Hicks figurine and that the listed price matches the reported value.","max_points":120},{"criterion":"Three Costco gift products relevant to the recipient’s interests are captured with product names and prices from actual listings.","description":"Three Costco gift products relevant to the recipient’s interests are captured with product names and prices from actual listings.\n\nHow a grader verifies this: Grader can confirm three Costco product listings and verify the reported names and prices against the visible listing pages.","max_points":140},{"criterion":"The most recent Pokémon TCG sets shown on Pokellector are listed with their visible set illustration images referenced.","description":"The most recent Pokémon TCG sets shown on Pokellector are listed with their visible set illustration images referenced.\n\nHow a grader verifies this: Grader can confirm the newest sets page on Pokellector and match the reported set names to the visible set tiles/images.","max_points":110},{"criterion":"The final response is a concise themed gift shortlist that includes all collected items, prices or item codes where available, and an explicit comparison between the anime collectible figure option and the cheaper Sonic toy options.","description":"The final response is a concise themed gift shortlist that includes all collected items, prices or item codes where available, and an explicit comparison between the anime collectible figure option and the cheaper Sonic toy options.\n\nHow a grader verifies this: Grader can confirm the final summary includes outputs from all prior steps and contains a direct price/value comparison between the collectible figure and the low-cost Sonic toys.","max_points":120}]}} 
+{"task_id":"e8b73f739732f8aeb3c473d00f6219af5b8dcdb7","confirmed_task":"We’re expecting our first baby and I want a really practical shortlist of the best hospitals in Los Angeles for giving birth, not just a generic list, so could you use Google to research LA-area hospitals that clearly offer maternity, obstetrics, and labor-and-delivery services and then narrow that down to the 10 strongest options for childbirth? I’d like you to lean on a mix of official hospital maternity pages and something like U.S. News or similar quality indicators so I can tell which places are actually recognized, not just nearby. As you find good candidates, open each hospital’s actual maternity or labor-and-delivery page in its own tab and keep those tabs open so I can visually review the pages and photos afterward. For each of the final 10 hospitals, please verify on the official site that labor and delivery is explicitly offered, note the hospital name and Los Angeles-area location, write a short plain-English description of the maternity program, and capture whether it shows up in rankings, recognitions, or other quality signals. Then create a CryptPad Document file titled exactly “Best LA Maternity Hospitals” and put all 10 hospitals in it with the maternity-page links included, because I want one place where I can compare everything. Once that looks complete, add a short comparison section explaining the main differences between the hospitals, any patterns you noticed in the rankings or maternity offerings, and which few seem like the strongest recommendations overall. 
Please leave the CryptPad Document open at the end along with the 10 hospital maternity tabs so I can click through them myself.","website":"https://www.google.com","level":"medium","reference_length":7,"categories":["Health > Health - Other"],"precomputed_rubric":{"items":[{"criterion":"A credible Los Angeles-area candidate pool of hospitals with maternity-related care is identified using Google and reputable sources.","description":"A credible Los Angeles-area candidate pool of hospitals with maternity-related care is identified using Google and reputable sources.\n\nHow a grader verifies this: Search history and opened results show hospital candidates sourced from Google results leading to official hospital pages or reputable healthcare sources.","max_points":120},{"criterion":"Exactly 10 hospitals are selected as the strongest childbirth options based on service availability plus reputation, rankings, or recognitions.","description":"Exactly 10 hospitals are selected as the strongest childbirth options based on service availability plus reputation, rankings, or recognitions.\n\nHow a grader verifies this: The final CryptPad Document contains 10 distinct hospitals and the selection is supported by evidence gathered from search and ranking sources.","max_points":160},{"criterion":"Each of the 10 selected hospitals has its official maternity, obstetrics, or labor-and-delivery page opened in a separate browser tab and left open.","description":"Each of the 10 selected hospitals has its official maternity, obstetrics, or labor-and-delivery page opened in a separate browser tab and left open.\n\nHow a grader verifies this: Browser tab bar shows 10 hospital-domain tabs corresponding to the 10 hospitals listed in the CryptPad Document.","max_points":180},{"criterion":"For every listed hospital, labor and delivery services are explicitly verified on the official hospital page.","description":"For every listed hospital, labor and delivery services are explicitly verified 
on the official hospital page.\n\nHow a grader verifies this: The open hospital pages visibly mention labor and delivery, childbirth, or equivalent maternity inpatient delivery services for each listed hospital.","max_points":160},{"criterion":"Ranking, recognition, or quality-indicator evidence is gathered for each of the 10 hospitals from U.S. News or similarly reputable sources.","description":"Ranking, recognition, or quality-indicator evidence is gathered for each of the 10 hospitals from U.S. News or similarly reputable sources.\n\nHow a grader verifies this: The document entries include ranking or recognition notes for each hospital, and browsing shows U.S. News or equivalent reputable source pages used to support those notes.","max_points":140},{"criterion":"A CryptPad Document titled exactly 'Best LA Maternity Hospitals' is created and includes for each hospital its name, location, maternity program description, ranking/recognition status, and official maternity page link.","description":"A CryptPad Document titled exactly 'Best LA Maternity Hospitals' is created and includes for each hospital its name, location, maternity program description, ranking/recognition status, and official maternity page link.\n\nHow a grader verifies this: The CryptPad Document title matches exactly and the body contains complete entries for all 10 hospitals with the required fields and links.","max_points":160},{"criterion":"The CryptPad Document ends with a comparative summary highlighting key differences, patterns, and top recommendations, and the doc remains open alongside the hospital tabs.","description":"The CryptPad Document ends with a comparative summary highlighting key differences, patterns, and top recommendations, and the doc remains open alongside the hospital tabs.\n\nHow a grader verifies this: The final section of the CryptPad Document contains a written comparison and recommendation summary, and the browser still shows the doc plus the hospital tabs 
open.","max_points":80}]}} +{"task_id":"b21a86441ddca8186175bfffcaae0358ed66eec4","confirmed_task":"Can you help me plan a short LA trip from Pittsburgh and keep the key pages open so I can actually look at them afterward? Start on Google Flights, Kayak, or Expedia and search a round-trip from PIT to LAX for a simple 2-day trip, making sure the outbound gets into Los Angeles before 6:00 PM and the return lands back in Pittsburgh before midnight. I’d really prefer a nonstop if one exists, but if not, pick the option with the shortest total travel time that still feels like a good value, and open the top few flight options so I can compare before you choose the best one; then leave the selected flight page open in its own tab. After that, use Google Hotels or a hotel booking site to find me a hotel around Koreatown or West Hollywood with at least a 4.5 rating and a nightly rate under $350, and open the actual hotel listing so I can see the price, rating, photos, and map location, then keep that tab open too. Once the stay looks settled, go to Google Maps and pull up Griffith Observatory, The Getty Center, Santa Monica Pier, one well-rated Korean BBQ place in Koreatown, one coffee shop rated 4.5 or better, and somewhere with a great sunset view, because I want a realistic plan instead of a random list. Use the map routes to figure out a 2-day itinerary that avoids driving during LA rush hour as much as possible, especially around 7 to 10 AM and 4 to 7 PM, and try to keep travel between stops under about 40 minutes when that’s realistic, with a little buffer time between activities so the days don’t feel crammed. 
Open the important map views or route checks in separate tabs if needed so I can visually compare how far apart things are, and then give me a simple trip summary with the flight you chose, the hotel you recommend, and a 2-day schedule showing activity times plus the travel time between each stop.","website":"https://www.google.com","level":"medium","reference_length":6,"categories":["Travel and Tourism > Air Travel","Travel and Tourism > Accommodation and Hotels"],"precomputed_rubric":{"items":[{"criterion":"A round-trip PIT to LAX flight search is completed on a flight site, with top candidate options reviewed and one selected that arrives in Los Angeles before 6:00 PM outbound and returns to Pittsburgh before midnight.","description":"A round-trip PIT to LAX flight search is completed on a flight site, with top candidate options reviewed and one selected that arrives in Los Angeles before 6:00 PM outbound and returns to Pittsburgh before midnight.\n\nHow a grader verifies this: Grader can see an open flight search/results or details page showing PIT, LAX, round-trip results, visible candidate options, and the selected itinerary meeting the timing constraints.","max_points":220},{"criterion":"The chosen flight reasonably reflects the stated preference hierarchy: nonstop if available, otherwise the shortest total travel time, while still balancing price and schedule.","description":"The chosen flight reasonably reflects the stated preference hierarchy: nonstop if available, otherwise the shortest total travel time, while still balancing price and schedule.\n\nHow a grader verifies this: Visible comparison among top few flight options supports why the selected itinerary is a reasonable best choice based on stops, duration, and price.","max_points":130},{"criterion":"A hotel in Koreatown or West Hollywood is identified with rating at least 4.5 and nightly price under $350, and the actual listing page is opened.","description":"A hotel in Koreatown or West Hollywood 
is identified with rating at least 4.5 and nightly price under $350, and the actual listing page is opened.\n\nHow a grader verifies this: Open hotel listing visibly shows neighborhood or map placement, nightly price under $350, rating of 4.5 or higher, and listing details/photos.","max_points":200},{"criterion":"Google Maps is used to identify all required stop types: Griffith Observatory, The Getty Center, Santa Monica Pier, a Korean BBQ spot in Koreatown, a coffee shop rated at least 4.5, and a sunset-view location.","description":"Google Maps is used to identify all required stop types: Griffith Observatory, The Getty Center, Santa Monica Pier, a Korean BBQ spot in Koreatown, a coffee shop rated at least 4.5, and a sunset-view location.\n\nHow a grader verifies this: Open Google Maps place pages or map tabs show each required destination category and allow visual confirmation of their locations.","max_points":160},{"criterion":"The itinerary routing is realistic, uses map route checks, avoids LA rush hour driving where practical, keeps travel between stops under about 40 minutes when possible, and includes buffer time.","description":"The itinerary routing is realistic, uses map route checks, avoids LA rush hour driving where practical, keeps travel between stops under about 40 minutes when possible, and includes buffer time.\n\nHow a grader verifies this: Open route/map tabs and the written plan show travel-time checks, sensible sequencing, reduced rush-hour exposure, and spacing between activities.","max_points":170},{"criterion":"A final travel plan is produced with the selected flight, hotel recommendation, and a 2-day itinerary including activity timing and travel times, while keeping the key browser pages open.","description":"A final travel plan is produced with the selected flight, hotel recommendation, and a 2-day itinerary including activity timing and travel times, while keeping the key browser pages open.\n\nHow a grader verifies this: Final summary 
includes all required trip components, and the flight tab, hotel tab, and relevant map/route tabs remain open for visual review.","max_points":120}]}} +{"task_id":"72875601345415ba90a3c31bd93c25bb5ea54bb2","confirmed_task":"Can you help me plan a Christmas trip to San Francisco from Pittsburgh and do it in the browser so I can actually look at the options with you? Start on Google Flights and search round-trip flights from PIT to SFO leaving December 23 and coming back December 26. I’d really prefer a nonstop if one exists, but if not, pick the best option with a short layover and reasonable total travel time, and please favor something that gets me into San Francisco in the afternoon on December 23 so I still have that evening free. Open the best few flight options and leave the results or selected flight page open so I can review it. After that, use Google Hotels or the hotel results on Google to find a place in a central area, ideally North Beach, Nob Hill, or Union Square, with at least a 4.5-star rating and under $400 per night, because I want something nice but still realistic for Christmas week. When you find the best fit, open the actual hotel page so I can see the photos, nightly price, and map location, and keep that tab open too. Once those are set, switch to Google Maps and map out the trip around the city using the hotel as the base. I want the 3-day plan for December 23 through 25 to include the Golden Gate Bridge, Alcatraz Island, the Ferry Building, one Michelin-recommended restaurant, one well-known bakery or coffee shop, and at least one scenic viewpoint. Try to group things so I’m not zigzagging all over San Francisco, keep most travel legs under about 30 minutes if possible, leave some buffer time between activities, and include at least one segment by public transit instead of driving. 
Please open the relevant places and route views in Maps so I can visually see how they connect, and then put everything into a CryptPad Document with the flight you’d choose, the hotel you recommend, and a day-by-day itinerary with times and travel methods. Leave the flight page, hotel page, and key map tabs open for me as proof while you finish the report.","website":"https://www.google.com","level":"medium","reference_length":5,"categories":["Travel and Tourism > Air Travel","Travel and Tourism > Accommodation and Hotels","Community and Society > Holidays and Seasonal Events"],"precomputed_rubric":{"items":[{"criterion":"A Google Flights search for PIT to SFO on December 23 to December 26 is completed, multiple viable options are reviewed, and one recommended itinerary is selected with preference for nonstop or short-layover service and afternoon arrival on December 23.","description":"A Google Flights search for PIT to SFO on December 23 to December 26 is completed, multiple viable options are reviewed, and one recommended itinerary is selected with preference for nonstop or short-layover service and afternoon arrival on December 23.\n\nHow a grader verifies this: Grader can see Google Flights results or a selected itinerary page showing PIT, SFO, the correct dates, and visible flight options or chosen flight details left open in the browser.","max_points":240},{"criterion":"A hotel in San Francisco is identified and opened from Google hotel results, meeting the constraints of central location, ideally North Beach, Nob Hill, or Union Square, at least 4.5 stars, and under $400 per night.","description":"A hotel in San Francisco is identified and opened from Google hotel results, meeting the constraints of central location, ideally North Beach, Nob Hill, or Union Square, at least 4.5 stars, and under $400 per night.\n\nHow a grader verifies this: Grader can see an open hotel page with visible hotel name, star rating, nightly price, photos, and map/location 
information consistent with the requested neighborhoods or central area.","max_points":200},{"criterion":"Google Maps is used to identify all required San Francisco trip components: Golden Gate Bridge, Alcatraz Island, Ferry Building, one Michelin-recommended restaurant, one well-known bakery or coffee shop, and at least one scenic viewpoint, anchored to the selected hotel.","description":"Google Maps is used to identify all required San Francisco trip components: Golden Gate Bridge, Alcatraz Island, Ferry Building, one Michelin-recommended restaurant, one well-known bakery or coffee shop, and at least one scenic viewpoint, anchored to the selected hotel.\n\nHow a grader verifies this: Grader can see Google Maps place pages, pins, or tabs showing the hotel and all required destinations, with enough visible map context to confirm they were actually opened and examined.","max_points":180},{"criterion":"A coherent 3-day itinerary for December 23 through 25 is planned using map travel times, includes all required stops, avoids excessive cross-city backtracking, keeps most legs under about 30 minutes where feasible, includes some buffer time, and uses public transit for at least one segment.","description":"A coherent 3-day itinerary for December 23 through 25 is planned using map travel times, includes all required stops, avoids excessive cross-city backtracking, keeps most legs under about 30 minutes where feasible, includes some buffer time, and uses public transit for at least one segment.\n\nHow a grader verifies this: Grader can inspect open Google Maps routes or route tabs and the resulting plan to confirm travel methods, approximate times, and logical geographic grouping across the three days.","max_points":240},{"criterion":"A CryptPad Document is created that clearly summarizes the selected flight, recommended hotel, and full 3-day itinerary with activities, timing, and travel methods, while the key browser resources remain open for review.","description":"A 
CryptPad Document is created that clearly summarizes the selected flight, recommended hotel, and full 3-day itinerary with activities, timing, and travel methods, while the key browser resources remain open for review.\n\nHow a grader verifies this: Grader can see a CryptPad Document containing the trip summary and can also confirm that the flight page, hotel page, and at least one relevant map route or place tab remain open.","max_points":140}]}} +{"task_id":"256342f13c0a03e080f92ee073153fe33a6881c0","confirmed_task":"I’m trying to get a realistic shortlist of the best knee surgeons in New York City because I may need ACL reconstruction or meniscus repair, and I want something I can actually look through myself afterward. Please start in Google and research orthopedic surgeons in NYC who are specifically known for knee ligament reconstruction, ACL surgery, and meniscus repair, then create a spreadsheet called Top ACL Surgeons NYC to keep everything organized. As you find strong candidates, open each surgeon’s official hospital or practice profile page in its own tab so I can compare them side by side, and only keep surgeons whose actual profile page clearly says they perform ACL reconstruction, meniscus repair, knee ligament reconstruction, or very closely related sports knee procedures. For each surgeon you keep, put their full name, hospital or practice affiliation, specialty focus, a short note confirming where ACL reconstruction or meniscus repair is mentioned, and the direct profile link into the spreadsheet. Please keep going until there are exactly 10 verified NYC surgeons in the sheet, and make sure every person listed still has their real profile page open in a tab so I can inspect the pages and see the affiliations myself. 
Once the list is complete, look across the 10 entries and add a short summary of which hospitals, orthopedic groups, or medical centers show up most often, because I want to know which institutions seem to dominate this specialty in the city.","website":"https://www.google.com","level":"medium","reference_length":6,"categories":["Health > Health - Other","Health > Medicine"],"precomputed_rubric":{"items":[{"criterion":"A spreadsheet titled 'Top ACL Surgeons NYC' is created and used as the working document.","description":"A spreadsheet titled 'Top ACL Surgeons NYC' is created and used as the working document.\n\nHow a grader verifies this: Grader can see a spreadsheet with the exact title open and populated during the task.","max_points":120},{"criterion":"Official hospital or practice profile pages for candidate NYC surgeons are opened in separate browser tabs.","description":"Official hospital or practice profile pages for candidate NYC surgeons are opened in separate browser tabs.\n\nHow a grader verifies this: Grader can see multiple open tabs corresponding to official surgeon profile pages rather than generic search results or directory summaries.","max_points":160},{"criterion":"Each included surgeon is verified on the actual profile page as performing ACL reconstruction, meniscus repair, knee ligament reconstruction, or a clearly equivalent sports knee procedure.","description":"Each included surgeon is verified on the actual profile page as performing ACL reconstruction, meniscus repair, knee ligament reconstruction, or a clearly equivalent sports knee procedure.\n\nHow a grader verifies this: On each selected surgeon tab, the grader can locate visible text or procedure listings that substantiate the inclusion criteria.","max_points":240},{"criterion":"For every selected surgeon, the spreadsheet records full name, hospital or practice affiliation, specialty focus, confirmation note for ACL reconstruction or meniscus repair, and the direct profile 
link.","description":"For every selected surgeon, the spreadsheet records full name, hospital or practice affiliation, specialty focus, confirmation note for ACL reconstruction or meniscus repair, and the direct profile link.\n\nHow a grader verifies this: Grader can inspect the spreadsheet rows and confirm all required fields are present for each surgeon and correspond to the open profile tabs.","max_points":200},{"criterion":"Exactly 10 New York City surgeons are included, and each spreadsheet entry corresponds to an official profile page that remains open in a tab.","description":"Exactly 10 New York City surgeons are included, and each spreadsheet entry corresponds to an official profile page that remains open in a tab.\n\nHow a grader verifies this: Grader can count exactly 10 completed spreadsheet entries and match each one to an open official profile tab for that surgeon.","max_points":180},{"criterion":"The spreadsheet includes a short summary of which hospitals, orthopedic groups, or medical centers appear most frequently among the 10 surgeons.","description":"The spreadsheet includes a short summary of which hospitals, orthopedic groups, or medical centers appear most frequently among the 10 surgeons.\n\nHow a grader verifies this: Grader can see a written summary in the spreadsheet that synthesizes affiliation frequency across the final 10 entries.","max_points":100}]}} +{"task_id":"d3250da48cc778a40d11683a56fdfca962d6fe19","confirmed_task":"I’m putting together a coordinated holiday gift bundle for one family and want it to feel like everything belongs together instead of looking random. On Kohl’s, please find two gift ideas for siblings that stay under $25 each and are actually available for pickup in store today as a backup to shipping — one that would make sense for a 12-year-old girl and one for an 11-year-old boy — and open each product in its own tab so I can compare the vibe and price side by side. 
Once you’ve got those, go to Etsy and open a personalized family Christmas ornament listing that looks giftable, then tell me the shop name and exactly what customization choices the listing offers, because I’d like to add something with the family name and need to know what I can personalize. After that, use Target to find a shatterproof gold-and-white ornament set that visually matches the personalized ornament and would work as filler in the bundle, and open the actual product page so I can see the photos and any color or finish options shown. Then head to Walmart and browse for two boys outfit gift options for a younger boy, and pick the one that best matches the overall style and price level of the other gifts so the bundle feels consistent; please keep the better outfit page open for me. Finally, use Google to find one highly rated hot buttered rum recipe from a recognizable recipe site, open the actual recipe page, and give me the recipe name, source, ingredient list, and basic preparation steps so I can include a cozy holiday extra with the package idea. 
At the end, send me a concise summary with all the selected items, prices, pickup-today details for Kohl’s, the Etsy customization options, the Target ornament details, the two Walmart outfit options with your preferred pick, and the recipe source and steps.","website":"https://www.google.com","level":"medium","reference_length":6,"categories":["Ecommerce & Shopping > Ecommerce and Shopping - Other","Community and Society > Holidays and Seasonal Events","Lifestyle > Gifts and Flowers"],"precomputed_rubric":{"items":[{"criterion":"Identify two Kohl’s gift ideas for siblings, one for a 12-year-old girl and one for an 11-year-old boy, each under $25, and confirm pickup in store today on the product pages.","description":"Identify two Kohl’s gift ideas for siblings, one for a 12-year-old girl and one for an 11-year-old boy, each under $25, and confirm pickup in store today on the product pages.\n\nHow a grader verifies this: Grader can confirm two separate Kohl’s product pages are open or were visited, each showing a product title, price below $25, and visible pickup-today availability information.","max_points":240},{"criterion":"Review one Etsy personalized family Christmas ornament listing and report the shop name plus the visible customization or variation options offered.","description":"Review one Etsy personalized family Christmas ornament listing and report the shop name plus the visible customization or variation options offered.\n\nHow a grader verifies this: Grader can confirm an Etsy listing page was opened and that the response includes the shop name and the customization fields or variation choices shown on that listing.","max_points":200},{"criterion":"Find one Target shatterproof gold-and-white ornament set and report the product name along with any listed color or finish options visible on the product page.","description":"Find one Target shatterproof gold-and-white ornament set and report the product name along with any listed color or finish options 
visible on the product page.\n\nHow a grader verifies this: Grader can confirm a Target product page for a shatterproof gold-and-white ornament set was opened and that the response matches the product title and visible option details on the page.","max_points":180},{"criterion":"Provide two Walmart boys outfit gift options with product names and prices, and identify which one best matches the style and price level of the other selected gifts.","description":"Provide two Walmart boys outfit gift options with product names and prices, and identify which one best matches the style and price level of the other selected gifts.\n\nHow a grader verifies this: Grader can confirm two Walmart product pages or listings were reviewed and that the response includes two outfit names, their prices, and a clearly stated preferred choice.","max_points":180},{"criterion":"Find one highly rated hot buttered rum recipe via Google and include the recipe name, source, ingredient list, and basic preparation steps from the actual recipe page.","description":"Find one highly rated hot buttered rum recipe via Google and include the recipe name, source, ingredient list, and basic preparation steps from the actual recipe page.\n\nHow a grader verifies this: Grader can confirm a Google results page led to a recipe page and that the response includes the recipe title, source site, ingredients, and preparation summary.","max_points":120},{"criterion":"Return a concise final summary that includes all selected products, prices, Kohl’s pickup-today details, Etsy customization information, Target ornament details, the two Walmart outfit options with the preferred pick, and the recipe source and steps.","description":"Return a concise final summary that includes all selected products, prices, Kohl’s pickup-today details, Etsy customization information, Target ornament details, the two Walmart outfit options with the preferred pick, and the recipe source and steps.\n\nHow a grader verifies this: 
Grader can confirm the final response synthesizes outputs from all prior steps into one coherent holiday bundle summary with no major omissions.","max_points":80}]}} +{"task_id":"4b9eb54dde6c129b27ccb642ef24fb060e736913","confirmed_task":"I’m trying to get more comfortable cooking at home without buying a bunch of gear, so on Amazon please compare a few 3-quart electric multicookers and pick the best one for me if my main use is making rice and steaming vegetables in a small kitchen. I’d like you to open the most promising options in separate tabs so I can visually compare the listings, and then leave the chosen product page open with the title, price, capacity, and the features that make it best for simple beginner meals. Once you’ve picked that cooker, use Google to find one highly rated hot buttered rum recipe from a real recipe site and pull out the ingredients and basic steps, mainly as a simple example of the kind of recipe format I could actually follow. Since I’m still learning the basics, also go to Reddit and find a beginner-friendly discussion about how long to boil chicken breast, then give me the time range people recommend. After that, use the USDA DRI Calculator for a sample adult profile — age 30, 5 feet 6 inches, 150 pounds, sedentary activity level — and record the estimated daily carb, protein, and fat targets so I have a realistic nutrition reference point. Then, using Google, find the City of Milwaukee food license requirements and Wisconsin DATCP guidance for starting a small charcuterie or food business, and summarize the key licensing steps I’d need to look into if I ever wanted to turn basic home cooking into a small side business. 
Please keep the USDA results page and the Milwaukee licensing source page open so I can look at them myself, and finish with a short summary tying together the multicooker choice, the macro targets, and whether this setup seems like a practical beginner routine.","website":"https://www.google.com","level":"medium","reference_length":6,"categories":["Ecommerce & Shopping > Ecommerce and Shopping - Other","Home and Garden > Home and Garden - Other","Food and Drink > Cooking and Recipes"],"precomputed_rubric":{"items":[{"criterion":"Compares multiple Amazon 3-quart electric multicookers and selects one best option for rice and steaming vegetables, including product title, visible price, capacity, and key features from the listing.","description":"Compares multiple Amazon 3-quart electric multicookers and selects one best option for rice and steaming vegetables, including product title, visible price, capacity, and key features from the listing.\n\nHow a grader verifies this: Grader can confirm multiple Amazon product tabs were opened and that the final chosen product page remains open showing the listing title, price, and 3-quart capacity.","max_points":240},{"criterion":"Provides one highly rated hot buttered rum recipe found via Google, including a clear ingredient list and basic preparation steps.","description":"Provides one highly rated hot buttered rum recipe found via Google, including a clear ingredient list and basic preparation steps.\n\nHow a grader verifies this: Grader can confirm a Google results path to a recipe site and that the returned summary includes ingredients and basic steps matching the recipe page.","max_points":160},{"criterion":"Reports a Reddit-sourced beginner recommendation for boiling chicken breast and includes the time range given in the discussion.","description":"Reports a Reddit-sourced beginner recommendation for boiling chicken breast and includes the time range given in the discussion.\n\nHow a grader verifies this: Grader can 
confirm a Reddit thread was opened and that the reported boiling time range is visible in the discussion content.","max_points":140},{"criterion":"Uses the USDA DRI Calculator with the specified sample adult profile and records estimated daily carbohydrate, protein, and fat targets.","description":"Uses the USDA DRI Calculator with the specified sample adult profile and records estimated daily carbohydrate, protein, and fat targets.\n\nHow a grader verifies this: Grader can confirm the USDA DRI Calculator results page is open and shows macro targets for the entered profile inputs.","max_points":200},{"criterion":"Summarizes the key licensing steps for starting a charcuterie or food business in Milwaukee using City of Milwaukee food license requirements and Wisconsin DATCP guidance.","description":"Summarizes the key licensing steps for starting a charcuterie or food business in Milwaukee using City of Milwaukee food license requirements and Wisconsin DATCP guidance.\n\nHow a grader verifies this: Grader can confirm Google was used to reach the City of Milwaukee licensing source and Wisconsin DATCP guidance, and that the summary mentions both sources' key requirements.","max_points":160},{"criterion":"Briefly explains how the chosen multicooker and USDA macro targets could fit into a balanced beginner cooking routine.","description":"Briefly explains how the chosen multicooker and USDA macro targets could fit into a balanced beginner cooking routine.\n\nHow a grader verifies this: Grader can confirm the final response explicitly connects the selected cooker's use cases with the reported carb, protein, and fat targets in a practical beginner-oriented summary.","max_points":100}]}} +{"task_id":"78ddd1aab59eebace5f6f523d90012aa6c871c54","confirmed_task":"I’m trying to decide whether renting at The Ophelia in Pittsburgh makes more sense than buying nearby, so could you help me look at both sides in the browser? 
Start on apartments.com and open The Ophelia’s actual floor plan or availability page, then note at least two floor plans that are currently shown as available, including each plan’s name and the bedroom/bathroom setup, and leave that page open so I can look at the layouts myself. Since Pittsburgh winters are rough and I’m also thinking about car-related moving costs, go to WeatherTech and use their vehicle selector for a 2020 Toyota Highlander to find the floor mat and cargo liner options that fit, then open the cargo liner product page and keep that tab open as a reference. After that, use Google to find one LED emblem option for a 2023 Honda Civic, and click through to the actual product page so you can capture the product name and price rather than just a search snippet. Once you have those cost references, go to Zillow and search around the same Pittsburgh area for homes currently for sale that could realistically compete with renting there, then open five live listings in separate tabs and capture each property’s address and listing URL so I can compare them side by side. 
In the end, give me a concise comparison that pulls together the two apartment floor plans, the WeatherTech cargo liner reference, the LED emblem option, and the five Zillow listings so I can get a real renting-versus-buying snapshot.","website":"https://www.google.com","level":"medium","reference_length":5,"categories":["Business and Consumer Services > Real Estate"],"precomputed_rubric":{"items":[{"criterion":"Report at least two currently available floor plans from The Ophelia website, including each plan’s name and bedroom/bathroom details.","description":"Report at least two currently available floor plans from The Ophelia website, including each plan’s name and bedroom/bathroom details.\n\nHow a grader verifies this: Grader can confirm the apartments.com floor plans or availability page is open and shows the named plans with matching bed/bath information.","max_points":220},{"criterion":"Find WeatherTech floor mat and cargo liner options that fit a 2020 Toyota Highlander and provide the cargo liner product page reference.","description":"Find WeatherTech floor mat and cargo liner options that fit a 2020 Toyota Highlander and provide the cargo liner product page reference.\n\nHow a grader verifies this: Grader can confirm the WeatherTech tab shows 2020 Toyota Highlander fitment and an open cargo liner product page.","max_points":180},{"criterion":"Use Google to find one LED emblem option for a 2023 Honda Civic and report the product name and price from the actual product page.","description":"Use Google to find one LED emblem option for a 2023 Honda Civic and report the product name and price from the actual product page.\n\nHow a grader verifies this: Grader can confirm a Google search was performed and the clicked product page displays the reported item name and price.","max_points":140},{"criterion":"Find five currently for-sale Zillow home listings in the Pittsburgh area that could compete with renting there, and include each listing’s address and 
URL.","description":"Find five currently for-sale Zillow home listings in the Pittsburgh area that could compete with renting there, and include each listing’s address and URL.\n\nHow a grader verifies this: Grader can confirm five zillow.com listing tabs are open or accessible and each corresponds to a live property page with the reported address.","max_points":310},{"criterion":"Provide a concise final comparison covering the apartment floor plans, the WeatherTech cargo liner reference, the LED emblem option, and the five Zillow listings to support a rent-versus-buy decision.","description":"Provide a concise final comparison covering the apartment floor plans, the WeatherTech cargo liner reference, the LED emblem option, and the five Zillow listings to support a rent-versus-buy decision.\n\nHow a grader verifies this: Grader can confirm the final response includes all required categories and accurately summarizes the information gathered from the open pages.","max_points":150}]}} +{"task_id":"795bfe117e0f58e49ca37ae8e453a507859a2a2b","confirmed_task":"I’m trying to piece together a really cheap trip to London for two, so can you help me build it in a practical order and keep the actual pages open where it matters? Start on Booking.com and search London for 2 adults staying this December, then find me at least one hotel that comes in under £100 total for the 2-night stay, because that ultra-budget option is going to set the tone for everything else. Open the actual property page in its own tab so I can see the photos and location, and note the hotel name, stay dates, total displayed price, and link. Once you’ve got that baseline, stay on Booking.com and look up NOX Hotel for a 1-night stay for 2 adults on any date in 2026, just so I can tell whether my bargain London option is unusually cheap or more normal for the city; open the NOX listing page too and record the date and total displayed price you find. 
After that, use Google to look for at least two hotels near Washington, DC Union Station that show 4-star-or-higher guest ratings and nightly prices under $200, because I may want a backup benchmark for city lodging in another trip later; please open each hotel result in its own tab or go to the actual hotel posting page so I can verify they’re real options and still look live. Then, because I want the whole trip to stay low-cost overall, go to Amazon and shortlist three mid-to-low-priced headphones or earbuds with active noise cancellation for travel that fit the same budget mindset, and open each product page in a separate tab so I can compare them side by side. In the end, send me one clean summary with the hotel names, prices, dates, ratings where relevant, key headphone features, and links, and leave the Booking.com property tabs and the Amazon product tabs open for me.","website":"https://www.google.com","level":"medium","reference_length":5,"categories":["Travel and Tourism > Air Travel","Travel and Tourism > Accommodation and Hotels"],"precomputed_rubric":{"items":[{"criterion":"Identify at least one Booking.com London hotel for 2 adults in December with a displayed total price under £100 for the 2-night stay, and capture the hotel name, stay dates, total price, and link.","description":"Identify at least one Booking.com London hotel for 2 adults in December with a displayed total price under £100 for the 2-night stay, and capture the hotel name, stay dates, total price, and link.\n\nHow a grader verifies this: Grader can confirm the Booking.com search results or property page shows London, 2 adults December and a total under £100, with the property tab open.","max_points":280},{"criterion":"Find a Booking.com NOX Hotel result for a 1-night stay in 2026 for 2 adults and record the specific date, displayed total price, and link.","description":"Find a Booking.com NOX Hotel result for a 1-night stay in 2026 for 2 adults and record the specific date, displayed 
total price, and link.\n\nHow a grader verifies this: Grader can confirm the NOX Hotel listing or property page on Booking.com shows a 1-night stay for 2 adults with a visible total price and the page left open.","max_points":200},{"criterion":"Find at least two hotels near Washington, DC Union Station via Google that each show a guest rating of 4 stars or higher and a nightly price under $200, and record their names, ratings, prices, and links.","description":"Find at least two hotels near Washington, DC Union Station via Google that each show a guest rating of 4 stars or higher and a nightly price under $200, and record their names, ratings, prices, and links.\n\nHow a grader verifies this: Grader can confirm on Google results, hotel panels, or linked hotel pages that two qualifying hotels near Union Station display ratings of 4.0+ and nightly prices below $200, with tabs open for the chosen options.","max_points":220},{"criterion":"Shortlist at least three Amazon headphones or earbuds with active noise cancellation, including each product name, current price, key features, and link.","description":"Shortlist at least three Amazon headphones or earbuds with active noise cancellation, including each product name, current price, key features, and link.\n\nHow a grader verifies this: Grader can confirm three Amazon product pages are open and each page visibly indicates ANC or active noise cancellation, along with product name and current price.","max_points":180},{"criterion":"Return one consolidated summary covering the London budget hotel, the NOX Hotel comparison, the two Washington, DC Union Station hotel benchmarks, and the three Amazon UK headphone options, with all requested names, dates, prices, ratings where relevant, key features, and links.","description":"Return one consolidated summary covering the London budget hotel, the NOX Hotel comparison, the two Washington, DC Union Station hotel benchmarks, and the three Amazon UK headphone options, with all 
requested names, dates, prices, ratings where relevant, key features, and links.\n\nHow a grader verifies this: Grader can compare the final response against the collected browser evidence from the open Booking.com, Google, and Amazon UK tabs and verify all requested fields are included.","max_points":120}]}} +{"task_id":"ec290c1a334e976ffa3ba68b71ac6c09c2eb82ba","confirmed_task":"I’m in the UK and I’m worried my tenancy deposit may not have been handled properly when I took over an existing tenancy, so could you start on Citizens Advice and find the guidance that explains whether the landlord or agent still had to protect the deposit in that kind of handover situation, and also how I’m supposed to check whether it’s protected and what I can do if it wasn’t done correctly. Please open the actual Citizens Advice page and leave it open so I can look at the wording myself. Once you’ve got that, use Google to find a solid explanation of what a rent ledger is and how to make one, because I want to document every rent payment and deposit-related amount clearly if I end up disputing this; tailor that summary to my situation by spelling out exactly which columns or entries I should include for a UK tenancy deposit issue, and open the most useful source in its own tab so I can compare it with the Citizens Advice guidance. After that, still using Google, find a practical personal-finance discussion about getting through to the next paycheck and pull out at least three realistic short-term ways to cover expenses while I sort this out, since I may need a bit of breathing room without making things worse. Then go to MoneySavingExpert’s Cheap Mobile Finder and filter for SIM-only deals with unlimited minutes, unlimited texts, and at least 10GB of data, and list the three cheapest options in ascending price order so I can see whether switching my phone plan would help. 
Please keep the filtered results page open too, and give me one combined summary that brings together the deposit guidance, the rent-ledger setup advice, the short-term cash-flow ideas, and the mobile deal recommendations.","website":"https://www.google.com","level":"medium","reference_length":5,"categories":["Law and Government > Legal","Business and Consumer Services > Real Estate"],"precomputed_rubric":{"items":[{"criterion":"Correctly summarize Citizens Advice guidance on whether a deposit should be protected when taking over an existing tenancy, including how to check protection status and what the tenant can do if the rules were not followed.","description":"Correctly summarize Citizens Advice guidance on whether a deposit should be protected when taking over an existing tenancy, including how to check protection status and what the tenant can do if the rules were not followed.\n\nHow a grader verifies this: Grader confirms the answer reflects the content of the open Citizens Advice page and includes all three elements: protection rule, how to check, and next actions if non-compliant.","max_points":300},{"criterion":"Explain what a rent ledger is and how to create one, including the key fields or entries to record, and tailor the explanation to evidence useful in a UK tenancy deposit dispute. Make sure a useful resource is pulled up.","description":"Explain what a rent ledger is and how to create one, including the key fields or entries to record, and tailor the explanation to evidence useful in a UK tenancy deposit dispute. 
Make sure a useful resource is pulled up.\n\nHow a grader verifies this: Grader confirms the answer defines a rent ledger, describes how to set one up, and lists dispute-relevant fields such as dates, amounts due, amounts paid, payment method, arrears or balance, deposit-related notes, and supporting references from the source tab.","max_points":250},{"criterion":"Provide at least three practical, actionable ways to cover expenses until the next paycheck, based on a personal finance discussion found via Google.","description":"Provide at least three practical, actionable ways to cover expenses until the next paycheck, based on a personal finance discussion found via Google.\n\nHow a grader verifies this: Grader confirms there are at least three distinct short-term suggestions and that they are framed as realistic actions drawn from a discussion source rather than generic filler.","max_points":150},{"criterion":"Use MoneySavingExpert’s Cheap Mobile Finder to identify the three cheapest SIM-only deals in ascending price order after filtering for unlimited minutes, unlimited texts, and at least 10GB of data.","description":"Use MoneySavingExpert’s Cheap Mobile Finder to identify the three cheapest SIM-only deals in ascending price order after filtering for unlimited minutes, unlimited texts, and at least 10GB of data.\n\nHow a grader verifies this: Grader confirms the filtered MoneySavingExpert results page is open and that the listed deals match the visible filtered results and are ordered from cheapest to most expensive.","max_points":200},{"criterion":"Present the final answer as one combined, user-oriented summary that integrates the deposit guidance, tailored rent ledger advice, short-term expense suggestions, and mobile deal recommendations.","description":"Present the final answer as one combined, user-oriented summary that integrates the deposit guidance, tailored rent ledger advice, short-term expense suggestions, and mobile deal recommendations.\n\nHow a 
grader verifies this: Grader confirms the final response is consolidated, coherent, and includes all four required sections in a way that is clearly tailored to the user’s situation.","max_points":100}]}} +{"task_id":"b183c34b5697881596a40d77bff64a5e013dc725","confirmed_task":"I’m trying to make a budget-conscious Apple purchase and want a real browser-based comparison, not just a generic summary. Please start on Apple’s site and open the current iPad Pro page and iPad Air page in separate tabs so I can compare them side by side, then pull out at least three concrete differences like the chip, display, storage options, camera setup, accessory support, or starting price, and tell me whether the Pro seems worth considering for someone mainly trying to save money. If the Air looks like the more practical route, switch over to Best Buy and look up the 11-inch iPad (A16, Wi‑Fi, 128GB) listings in pink and blue, and also check the blue open-box options in good and excellent condition, because I want to know the cheapest acceptable way to buy one right now; open the relevant product pages so you can verify the color and condition details on the actual listings, and leave the cheapest one open. After that, go back to Apple and check the current MacBook Pro lineup so I have a laptop price ceiling, and identify the lowest starting-price MacBook Pro model Apple is selling right now. Then head to Amazon, search for “iphone 17 pro,” and look through the live results for two listings that are obviously actual phones, because I want to avoid junk search results while comparison shopping; open those result pages too so the titles and prices are clearly visible. 
In the end, give me a short recommendation that connects the iPad Pro vs Air comparison to the Best Buy iPad choice and tells me exactly how much cheaper that iPad option is than the cheapest MacBook Pro, while keeping the iPhone price in mind.","website":"https://www.google.com","level":"medium","reference_length":5,"categories":["Computers Electronics and Technology > Consumer Electronics","Ecommerce & Shopping > Price Comparison"],"precomputed_rubric":{"items":[{"criterion":"Provide an Apple iPad Pro vs iPad Air comparison with at least three concrete spec differences and a brief judgment about whether the Pro is worth considering for a budget-conscious buyer.","description":"Provide an Apple iPad Pro vs iPad Air comparison with at least three concrete spec differences and a brief judgment about whether the Pro is worth considering for a budget-conscious buyer.\n\nHow a grader verifies this: Grader confirms Apple iPad Pro and iPad Air pages were opened in separate tabs or otherwise directly visited, and the final response includes at least three specific differences grounded in those pages plus a value judgment.","max_points":280},{"criterion":"List the current Best Buy prices for the 11-inch iPad (A16, Wi‑Fi, 128GB) in pink, blue, and blue open-box good and excellent conditions, and identify the cheapest acceptable option.","description":"List the current Best Buy prices for the 11-inch iPad (A16, Wi‑Fi, 128GB) in pink, blue, and blue open-box good and excellent conditions, and identify the cheapest acceptable option.\n\nHow a grader verifies this: Grader confirms the relevant Best Buy listing pages were opened and that the response includes prices for pink, blue, blue open-box good, and blue open-box excellent, with one option explicitly named as the cheapest acceptable route.","max_points":300},{"criterion":"Identify the lowest starting-price MacBook Pro model currently listed on Apple’s site, including the model name and starting 
price.","description":"Identify the lowest starting-price MacBook Pro model currently listed on Apple’s site, including the model name and starting price.\n\nHow a grader verifies this: Grader confirms Apple’s MacBook Pro lineup page was visited and the response names the lowest-priced MacBook Pro configuration with its starting price.","max_points":160},{"criterion":"Find two Amazon search results for “iphone 17 pro” that are clearly phone listings, and provide each title and price.","description":"Find two Amazon search results for “iphone 17 pro” that are clearly phone listings, and provide each title and price.\n\nHow a grader verifies this: Grader confirms Amazon search results and/or product pages were opened and that the response includes two listings that are clearly actual phone listings with visible titles and prices.","max_points":120},{"criterion":"Return a short final recommendation that ties the Best Buy iPad choice back to the earlier iPad Pro vs iPad Air comparison and states how far below the cheapest MacBook Pro the chosen iPad option is.","description":"Return a short final recommendation that ties the Best Buy iPad choice back to the earlier iPad Pro vs iPad Air comparison and states how far below the cheapest MacBook Pro the chosen iPad option is.\n\nHow a grader verifies this: Grader confirms the final summary explicitly references the earlier Air vs Pro conclusion, names the recommended Best Buy iPad option, and calculates the price gap versus the cheapest MacBook Pro.","max_points":140}]}} +{"task_id":"3868f9b52e96067b4f55834a3b110e1228b48e65","confirmed_task":"I’m thinking about moving into post-production work in Los Angeles and want a realistic sense of the entry path, especially for media/entertainment IT-engineer-type roles. 
Please start on Google and look up what employers in media and entertainment usually expect for IT engineers, then pull together at least three recurring requirements you keep seeing and at least two concrete training or certification routes, because I want to know whether this is something I could realistically train into. Once you have that baseline, go to the Motion Picture Editors Guild site and find the actual West Coast or Los Angeles path for joining IATSE Local 700, including the steps, eligibility, and anything about applications, rosters, fees, or required experience, so I can compare the union route with the broader training path. If there are separate pages that matter, open the key Local 700 pages in their own tabs and leave the most useful one open so I can look at the exact wording myself. After that, go back to Google and search for current Los Angeles or broader West Coast jobs that actually match the skills and requirements you found earlier, and open at least two relevant live job postings in separate tabs so I can visually compare them; for each one, note the title, company, location, how it connects to the earlier requirements, and whether the posting says anything about visa sponsorship or work authorization. To round it out, use Google one more time to build me a short dated timeline of Rosie O’Donnell’s feud with Donald Trump with at least three dated moments from public sources, just as a quick check of the kind of entertainment-news research context that might overlap with this world. 
Please give me everything as a concise career brief with clear sections for training paths, Local 700 union entry, relevant current job examples, and the short timeline, and mention which pages you left open for me to review.","website":"https://www.google.com","level":"medium","reference_length":5,"categories":["Jobs and Career > Jobs and Employment","Arts & Entertainment > Arts and Entertainment - Other"],"precomputed_rubric":{"items":[{"criterion":"Find and summarize at least three typical job requirements for IT engineers in the media and entertainment industry and at least two concrete training or certification options.","description":"Find and summarize at least three typical job requirements for IT engineers in the media and entertainment industry and at least two concrete training or certification options.\n\nHow a grader verifies this: Final brief includes a training paths section with 3+ recurring requirements and 2+ named training/certification routes sourced from Google results or opened pages.","max_points":280},{"criterion":"Use the Motion Picture Editors Guild site to summarize the West Coast/Los Angeles joining path for IATSE Local 700, including steps, eligibility, and application/joining details.","description":"Use the Motion Picture Editors Guild site to summarize the West Coast/Los Angeles joining path for IATSE Local 700, including steps, eligibility, and application/joining details.\n\nHow a grader verifies this: Final brief includes a union requirements section tied to editorsguild.com, and the browser shows a relevant Local 700 page open with visible guild-specific joining information.","max_points":270},{"criterion":"Identify at least two current live Los Angeles or West Coast job postings relevant to the earlier requirements and note whether visa sponsorship or work authorization is mentioned.","description":"Identify at least two current live Los Angeles or West Coast job postings relevant to the earlier requirements and note whether 
visa sponsorship or work authorization is mentioned.\n\nHow a grader verifies this: Browser has at least two relevant job postings open in separate tabs, and the final brief lists title, company, location, relevance to prior requirements, and sponsorship/work authorization notes for each.","max_points":250},{"criterion":"Provide a short Rosie O’Donnell vs. Donald Trump timeline with at least three dated milestones from public sources.","description":"Provide a short Rosie O’Donnell vs. Donald Trump timeline with at least three dated milestones from public sources.\n\nHow a grader verifies this: Final brief includes a timeline section with 3+ dated events and enough detail to distinguish each milestone.","max_points":100},{"criterion":"Return everything as a concise career brief with clearly labeled sections and mention which pages were left open for review.","description":"Return everything as a concise career brief with clearly labeled sections and mention which pages were left open for review.\n\nHow a grader verifies this: Response is organized into the four requested sections and explicitly references the guild page and job-posting tabs left open.","max_points":100}]}} +{"task_id":"041a4bee5d80a28567dc65bc2e41dd198672bfe2","confirmed_task":"I’m trying to plan a birthday weekend in New York for my significant other in mid-May, and I want to stay at an Arlo property if the pricing works out. On arlohotels.com, please check the NYC locations for every Friday-to-Sunday weekend in May and compare the rates you can actually see for the Arlo branches in New York, because I want to figure out which weekend is cheapest overall. I’d really prefer Arlo Williamsburg in Brooklyn if it’s no more than $30 above the cheapest NYC Arlo option for that same weekend, so please make that comparison clearly and use that preference when you decide what to recommend. 
Once you’ve found the best weekend and hotel combination, keep the final hotel page open so I can look at the room details myself. After that, use Ticketmaster to see what sporting events are happening in NYC for each May weekend, and only include options where tickets are available at $400 or less per person since I’d be buying 2 tickets and don’t want to blow the budget. Open the actual event pages, not just search results, so you can verify the listings are live and capture the event name, date, venue, and visible ticket price, and leave a couple of the best event tabs open so I can compare them on screen. In the end, give me a short trip-planning summary with the Arlo hotel comparison, whether Brooklyn stayed within my $30 preference window, the cheapest May weekend, your recommended hotel choice, and the sporting-event options for every May weekend that fit the budget.","website":"https://www.google.com","level":"medium","reference_length":5,"categories":["Travel and Tourism > Accommodation and Hotels"],"precomputed_rubric":{"items":[{"criterion":"Identify the NYC Arlo hotel branches searched and record visible Friday-to-Sunday weekend rates in May for each branch across the weekends checked.","description":"Identify the NYC Arlo hotel branches searched and record visible Friday-to-Sunday weekend rates in May for each branch across the weekends checked.\n\nHow a grader verifies this: Grader can confirm multiple NYC Arlo property search/result pages were visited on arlohotels.com and that comparable May weekend rates were captured from visible booking results.","max_points":300},{"criterion":"Explicitly compare Arlo Williamsburg in Brooklyn against the other NYC Arlo branches and determine whether it is within $30 of the cheapest alternative for the relevant weekend.","description":"Explicitly compare Arlo Williamsburg in Brooklyn against the other NYC Arlo branches and determine whether it is within $30 of the cheapest alternative for the relevant 
weekend.\n\nHow a grader verifies this: Grader can confirm the summary includes a price comparison involving Arlo Williamsburg and a clear yes/no determination on the within-$30 rule based on rates visible in the Arlo booking pages.","max_points":200},{"criterion":"Determine the cheapest available May weekend across the Arlo NYC properties and state the final recommended hotel and weekend using the Brooklyn preference rule.","description":"Determine the cheapest available May weekend across the Arlo NYC properties and state the final recommended hotel and weekend using the Brooklyn preference rule.\n\nHow a grader verifies this: Grader can confirm the chosen weekend and hotel are consistent with the collected Arlo rates, and that the recommended hotel page remains open as browser proof.","max_points":200},{"criterion":"For every May weekend, list NYC sporting events on Ticketmaster that have visible ticket availability at $400 or less per person, including event name, date, venue, and ticket price.","description":"For every May weekend, list NYC sporting events on Ticketmaster that have visible ticket availability at $400 or less per person, including event name, date, venue, and ticket price.\n\nHow a grader verifies this: Grader can confirm Ticketmaster event pages were opened for the listed events and that each included event shows a live listing with visible pricing at or below the budget threshold.","max_points":200},{"criterion":"Leave the recommended Arlo hotel tab and at least two qualifying Ticketmaster event tabs open, and produce a final trip-planning summary that combines the hotel recommendation with the event options.","description":"Leave the recommended Arlo hotel tab and at least two qualifying Ticketmaster event tabs open, and produce a final trip-planning summary that combines the hotel recommendation with the event options.\n\nHow a grader verifies this: Grader can confirm the browser has the specified tabs open and that the final response 
synthesizes hotel comparison results with the per-weekend sporting-event options.","max_points":100}]}} +{"task_id":"4614aa083147e45cbc2977cc8634b9d9db25edfe","confirmed_task":"I’m trying to narrow down a few law schools in the Maryland/DC area and want a practical outreach plan before I start contacting anyone. On the University of Maryland Carey Law site, please find the actual way I can request admissions materials and also look for at least one upcoming online admissions event with its date and time, because I want to see how easy they make it to get information and whether there’s a virtual event I could realistically attend; open the event details page in its own tab and leave it there so I can look at it later. Then go to the University of Baltimore School of Law site and find the instructions for scheduling an admissions meeting by opening the relevant admissions event or meeting page and pulling the registration link or contact method from the actual details page, and keep that page open too so I can compare the two schools side by side. After that, on American University Washington College of Law’s site, find the PIPS Scholarship page and give me a short plain-English summary of what the scholarship is for, along with the application form link, because funding could change which school I prioritize; if there’s a dedicated scholarship page, leave that open in another tab as proof. 
Once you’ve gathered all of that, recommend which of these three schools I should contact first based on the best mix of easy admissions outreach and potential funding, and include the specific action or contact details you found for each school so I have a simple next move.","website":"https://www.google.com","level":"medium","reference_length":4,"categories":["Science and Education > Universities and Colleges","Law and Government > Legal"],"precomputed_rubric":{"items":[{"criterion":"Correctly identify the University of Maryland Carey Law admissions-materials request method and report at least one upcoming online event with its date and time.","description":"Correctly identify the University of Maryland Carey Law admissions-materials request method and report at least one upcoming online event with its date and time.\n\nHow a grader verifies this: Grader can confirm the response matches information visible on the Maryland Carey admissions/request page and on an open online event details tab showing the event date/time.","max_points":300},{"criterion":"Correctly find the University of Baltimore School of Law admissions meeting scheduling instructions, including the registration link or contact method from the relevant event/details page.","description":"Correctly find the University of Baltimore School of Law admissions meeting scheduling instructions, including the registration link or contact method from the relevant event/details page.\n\nHow a grader verifies this: Grader can confirm the response against the open UBalt admissions meeting or event details page showing how to register or whom to contact.","max_points":250},{"criterion":"Provide an accurate brief summary of American University Washington College of Law’s PIPS Scholarship and include the application form link.","description":"Provide an accurate brief summary of American University Washington College of Law’s PIPS Scholarship and include the application form link.\n\nHow a grader verifies this: 
Grader can confirm the summary and link against the open PIPS Scholarship page and the linked application form reference on American’s site.","max_points":200},{"criterion":"Recommend which school to contact first using the gathered evidence about admissions outreach accessibility and potential funding, and include the specific contact/action details found for all three schools.","description":"Recommend which school to contact first using the gathered evidence about admissions outreach accessibility and potential funding, and include the specific contact/action details found for all three schools.\n\nHow a grader verifies this: Grader can verify that the recommendation is supported by the findings from the Maryland Carey, UBalt, and American tabs and that each school’s specific action/contact details are included.","max_points":250}]}} +{"task_id":"dd2eedbc88cb41cc69e43dd1da9de7255a81a966","confirmed_task":"I’m trying to put together a quick Christmas family outing plan in Yorkshire with Leeds as the base, so could you start on the Carriageworks Theatre site and find a Christmas time Leeds pantomime there, ideally the main festive panto, and note the show title, the venue name, and the full run of performance dates so I have one solid Leeds option to anchor everything around. Once you’ve got that, leave the actual show page open in its own tab so I can look at the artwork and dates myself, then go to Big Panto Guide and check the 2026 West Yorkshire listings and pull out the top three pantomime options with their show names, venues, and dates so I can see what Leeds is competing with nearby. After that, use Google to search for at least two Yorkshire Christmas pantomimes scheduled for 2026, and open the real event or venue pages in separate tabs so you can verify they’re live and capture the show name, venue, city, and performance dates from the actual listings rather than just the search results. 
When you’ve got those, compare the Leeds Carriageworks option against the wider West Yorkshire and Yorkshire shortlist and tell me whether Leeds still looks like the best anchor city for a family outing. Please ignore anything unrelated, and keep the Leeds page plus the two Yorkshire event pages open so I can compare them visually afterward.","website":"https://www.google.com","level":"medium","reference_length":5,"categories":["Arts & Entertainment > Performing Arts","Community and Society > Holidays and Seasonal Events","Travel and Tourism > Tourist Attractions"],"precomputed_rubric":{"items":[{"criterion":"Identify one upcoming Leeds pantomime from the Carriageworks Theatre website and capture the show name, venue, and full performance date range.","description":"Identify one upcoming Leeds pantomime from the Carriageworks Theatre website and capture the show name, venue, and full performance date range.\n\nHow a grader verifies this: Grader can confirm the Carriageworks production page is open and the extracted details match the visible title, venue, and dates on that page.","max_points":240},{"criterion":"Extract the top three 2026 West Yorkshire pantomime listings from Big Panto Guide, each with show name, venue, and dates.","description":"Extract the top three 2026 West Yorkshire pantomime listings from Big Panto Guide, each with show name, venue, and dates.\n\nHow a grader verifies this: Grader can confirm the Big Panto Guide West Yorkshire 2026 listings page was used and that three entries with matching visible names, venues, and dates were recorded.","max_points":240},{"criterion":"Find at least two Yorkshire Christmas pantomimes scheduled for 2026 via Google and verify each on its actual event or venue page, capturing show name, venue, city, and performance dates.","description":"Find at least two Yorkshire Christmas pantomimes scheduled for 2026 via Google and verify each on its actual event or venue page, capturing show name, venue, city, and 
performance dates.\n\nHow a grader verifies this: Grader can confirm at least two separate event or venue tabs are open from Google-discovered results and that the recorded details match the visible pages.","max_points":260},{"criterion":"Keep the Leeds Carriageworks page and the two Yorkshire event pages open for visual comparison.","description":"Keep the Leeds Carriageworks page and the two Yorkshire event pages open for visual comparison.\n\nHow a grader verifies this: Grader can confirm the relevant tabs remain open at the end of the task.","max_points":120},{"criterion":"Provide a short recommendation assessing whether Leeds still looks like the best anchor city based on the Carriageworks option, the West Yorkshire top three, and the wider Yorkshire options.","description":"Provide a short recommendation assessing whether Leeds still looks like the best anchor city based on the Carriageworks option, the West Yorkshire top three, and the wider Yorkshire options.\n\nHow a grader verifies this: Grader can confirm the final summary explicitly compares Leeds against the broader shortlist and states a reasoned recommendation.","max_points":140}]}} +{"task_id":"0eecee553a8cdda936c2cdd2a9189354a92e00b8","confirmed_task":"I’m putting together a one-period digital literacy lesson pack for a middle-school class and want the pieces to feel like they belong together, not like I grabbed them randomly. Could you start on Slidesgo and pick one fun, classroom-appropriate presentation template that would work for a grade 7 or 8 lesson, ideally something bright and student-friendly rather than corporate, because I want to use that visual style as the theme for everything else? Open the actual template page and leave it open so I can see the preview images, and note whether it’s available for Google Slides or PowerPoint. 
Then use Google to find a printable worksheet or practice page for an 8th-grade student on basic marketing strategies or persuasion techniques, like identifying advertising tricks or persuasive techniques, and open the actual resource page so I can check that it really looks classroom-ready and printable. After that, go to Citation Machine and verify that Harvard style is actually available there by navigating to wherever the citation styles are shown or selectable, because I’ll need to cite the worksheet and any media correctly; leave that proof visible or keep the page open. Finally, on YouTube, find three beginner-friendly videos about online safety or cybersecurity basics that would make sense for students, open each video in its own tab so I can compare them, and play one of them briefly so you can tell me what the opening covers. At the end, send me a short lesson-pack summary with the Slidesgo template name and format option, the worksheet title and where it’s hosted, the evidence that Harvard style can be selected on the citation site, and the three YouTube video titles with their channel names.","website":"https://www.google.com","level":"medium","reference_length":5,"categories":["Science and Education > Education","Computers Electronics and Technology > Graphics Multimedia and Web Design"],"precomputed_rubric":{"items":[{"criterion":"Select one Slidesgo presentation template that is fun and classroom-appropriate for a middle-school lesson, and provide the exact template name plus an available use/download option such as Google Slides or PowerPoint.","description":"Select one Slidesgo presentation template that is fun and classroom-appropriate for a middle-school lesson, and provide the exact template name plus an available use/download option such as Google Slides or PowerPoint.\n\nHow a grader verifies this: Grader can confirm the chosen Slidesgo template page is open and shows the template title along with a visible Google Slides or PowerPoint 
option.","max_points":240},{"criterion":"Find one printable worksheet or practice page for an 8th-grade student on marketing strategies or persuasion techniques, and provide the resource title and the site where it is accessed.","description":"Find one printable worksheet or practice page for an 8th-grade student on marketing strategies or persuasion techniques, and provide the resource title and the site where it is accessed.\n\nHow a grader verifies this: Grader can confirm the opened resource page appears to be a worksheet or practice page and that the response includes its title and hosting/access location.","max_points":240},{"criterion":"Identify Citation Machine as a citation generator that supports Harvard style and provide explicit evidence from the site showing Harvard can be selected.","description":"Identify Citation Machine as a citation generator that supports Harvard style and provide explicit evidence from the site showing Harvard can be selected.\n\nHow a grader verifies this: Grader can confirm a Citation Machine page is open with visible citation-style options or text indicating Harvard style is available.","max_points":220},{"criterion":"Identify three YouTube videos that teach online safety or cybersecurity basics and provide each video’s title and channel name.","description":"Identify three YouTube videos that teach online safety or cybersecurity basics and provide each video’s title and channel name.\n\nHow a grader verifies this: Grader can confirm three YouTube video tabs or pages are open and that the returned titles and channel names match the visible video pages.","max_points":200},{"criterion":"Demonstrate browser-only proof by leaving the Slidesgo template page and Citation Machine proof page open, opening the three YouTube videos in separate tabs with one video briefly played, and reporting what the opening of the played video covers.","description":"Demonstrate browser-only proof by leaving the Slidesgo template page and Citation 
Machine proof page open, opening the three YouTube videos in separate tabs with one video briefly played, and reporting what the opening of the played video covers.\n\nHow a grader verifies this: Grader can confirm the relevant tabs remain open, one YouTube video shows playback progress or a changed play state, and the response includes a brief description of what the opening of that video covers.","max_points":100}]}} +{"task_id":"29e019b665e4eba930fcb1fc28a149eb6522ed29","confirmed_task":"I’m in NYC and trying to get my footing before I can seriously plan for law school, so I need help pulling together a realistic picture from a few specific sites. First, on Legal Aid NYC, please look for the most useful guidance for New York City rent disputes and pull out at least three concrete help options or contact paths I could actually use right now, plus two Legal Aid NYC articles that seem especially relevant to rent problems; open the actual article pages in separate tabs and leave the most useful one open so I can look at it myself. Then, because I may need to survive the gap before a first paycheck, use Google to find a practical personal-finance discussion about not making it until the first payday and summarize at least three actionable ideas that feel realistic for someone trying to bridge expenses temporarily. After that, go to Disney Careers and search specifically for at least three entry-level job openings in New York City that could plausibly fit a recent graduate, and for each one note the title, NYC-area location, application page, and any basic qualification cues; please open each job posting in its own tab so I can visually compare them. Finally, use AccessLex ARC to find one LSAC law school admissions cycle dataset and note the exact dataset title and what cycle or year it covers, then go to LawHub and pull the total annual cost of attendance for Case Western Reserve University School of Law so I have one concrete law-school cost benchmark. 
In the end, give me a concise summary that ties together the rent-help options, the short-term cash-flow ideas, the Disney job leads, the LSAC dataset reference, and the Case Western cost figure so I can judge whether this path feels workable.","website":"https://www.google.com","level":"medium","reference_length":6,"categories":["Law and Government > Legal","Science and Education > Universities and Colleges"],"precomputed_rubric":{"items":[{"criterion":"Legal Aid NYC findings include at least three specific rent-dispute help options or contact pathways and two relevant Legal Aid NYC articles.","description":"Legal Aid NYC findings include at least three specific rent-dispute help options or contact pathways and two relevant Legal Aid NYC articles.\n\nHow a grader verifies this: Grader can confirm the summary references Legal Aid NYC content and that two actual Legal Aid NYC article pages were opened, with one left visible.","max_points":240},{"criterion":"At least three actionable suggestions are summarized from a practical personal-finance discussion about making it until the first payday.","description":"At least three actionable suggestions are summarized from a practical personal-finance discussion about making it until the first payday.\n\nHow a grader verifies this: Grader can confirm a Google-led result was used to reach a discussion page and that the final notes contain three concrete bridge-expense ideas tied to that discussion.","max_points":180},{"criterion":"At least three Disney Careers openings in New York City are identified with title, location, application page, and basic qualification cues.","description":"At least three Disney Careers openings in New York City are identified with title, location, application page, and basic qualification cues.\n\nHow a grader verifies this: Grader can confirm three separate Disney Careers job posting tabs are open and that each posting visibly shows the job title and NYC-area 
location.","max_points":240},{"criterion":"One AccessLex ARC LSAC admissions cycle dataset is reported with its exact title and the cycle/year it covers.","description":"One AccessLex ARC LSAC admissions cycle dataset is reported with its exact title and the cycle/year it covers.\n\nHow a grader verifies this: Grader can confirm the ARC page shows the named dataset and its associated cycle or year.","max_points":140},{"criterion":"The total annual cost of attendance for Case Western Reserve University School of Law is captured from LawHub and included in the final response.","description":"The total annual cost of attendance for Case Western Reserve University School of Law is captured from LawHub and included in the final response.\n\nHow a grader verifies this: Grader can confirm the LawHub page for Case Western Reserve University School of Law displays a total annual cost of attendance figure matching the reported value.","max_points":100},{"criterion":"The final synthesis concisely connects rent-help options, first-paycheck bridge ideas, NYC Disney job leads, the LSAC dataset reference, and the Case Western cost benchmark.","description":"The final synthesis concisely connects rent-help options, first-paycheck bridge ideas, NYC Disney job leads, the LSAC dataset reference, and the Case Western cost benchmark.\n\nHow a grader verifies this: Grader can confirm the final answer integrates outputs from all prior steps into one coherent planning summary rather than listing them separately.","max_points":100}]}} +{"task_id":"7959caf1580d130cedcba72e8f21ab0e9408ba91","confirmed_task":"I'm trying to piece together a really cheap Barcelona city break for 2 adults for 30th of next month to the 1st of the following month, and I want a few comparison points so I can sanity-check the budget. 
First, on Booking.com, search Barcelona for those dates and find me one hotel that's within 3 miles of the city centre and comes in under £120 total, then open the actual property page so you can grab the exact hotel name, total price, and the location details shown there, and leave that tab open so I can look at it later. Once you have that as my lodging benchmark, go to AirBnB and check the all-listings page to see how many accommodations are currently available there, just so I can compare a small apartment's availability with the Barcelona hotel market. After that, open a new house listing on Rightmove and note the asking price and number of bedrooms from the listing page itself, because I want a quick reality check on what short-stay costs look like next to property prices elsewhere; keep that listing open too so I can see the photos and details. Finally, use the Barcelona hotel price you found to work out the nightly rate, take half of that, and then on Hertz look near Barcelona for a Honda with great ratings during the duration of my trip, and tell me if any option also comes in under $100 per day; open the best matching car listing in its own tab and use it to clearly say whether the car's daily cost is more than half of the hotel's nightly rate. 
Please give me a short trip-planning summary with the Barcelona hotel first, then the AirBnB availability count, then the Rightmove price check, and end with the hotel-versus-car comparison stated plainly.","website":"https://www.google.com","level":"medium","reference_length":6,"categories":["Travel and Tourism > Air Travel","Travel and Tourism > Accommodation and Hotels","Food and Drink > Restaurants and Delivery"],"precomputed_rubric":{"items":[{"criterion":"A Booking.com hotel in Barcelona for 2 adults from the 30th of next month to the 1st of the following month is identified that is within 3 miles of the city centre and under £120 total, with the exact hotel name, total price, and location details captured from the property page.","description":"A Booking.com hotel in Barcelona for 2 adults from the 30th of next month to the 1st of the following month is identified that is within 3 miles of the city centre and under £120 total, with the exact hotel name, total price, and location details captured from the property page.\n\nHow a grader verifies this: Grader can confirm the Booking.com search dates and occupancy, then verify the open hotel property page shows the hotel name, total price under £120, and location/distance details.","max_points":300},{"criterion":"The number of currently available accommodation listings on the AirBNB all-listings page is reported accurately.","description":"The number of currently available accommodation listings on the AirBNB all-listings page is reported accurately.\n\nHow a grader verifies this: Grader can verify the visible count by checking the all-listings page and matching the number of available listings shown.","max_points":150},{"criterion":"A Rightmove new house listing's asking price and bedroom count are found and recorded from the live listing page.","description":"A Rightmove new house listing's asking price and bedroom count are found and recorded from the live listing page.\n\nHow a grader verifies this: Grader 
can confirm the open Rightmove listing shows the same asking price and bedroom count reported by the agent.","max_points":150},{"criterion":"A Hertz Honda listing near Barcelona for 30th of next month to the 1st of the following month is found with great ratings, including the rating and daily price, and the agent determines whether any qualifying option comes in under $100/day.","description":"A Hertz Honda listing near Barcelona for 30th of next month to the 1st of the following month is found with great ratings, including the rating and daily price, and the agent determines whether any qualifying option comes in under $100/day.\n\nHow a grader verifies this: Grader can confirm a Hertz Honda rental listing near Barcelona is displayed for the correct dates, that rating and daily price are visible or captured, and that the response states whether the daily cost is under $100.","max_points":200},{"criterion":"The final comparison correctly uses the Barcelona hotel's nightly rate to state whether the Hertz car's daily price is more than half of the hotel's nightly rate.","description":"The final comparison correctly uses the Barcelona hotel's nightly rate to state whether the Hertz car's daily price is more than half of the hotel's nightly rate.\n\nHow a grader verifies this: Grader can recompute the nightly hotel rate from the Booking.com total, divide by two, and compare that threshold with the reported Hertz daily price to confirm the final statement.","max_points":120},{"criterion":"The final response is a short trip-planning summary presented in the requested order, with the Barcelona hotel first and the hotel-versus-car comparison clearly stated at the end.","description":"The final response is a short trip-planning summary presented in the requested order, with the Barcelona hotel first and the hotel-versus-car comparison clearly stated at the end.\n\nHow a grader verifies this: Grader can inspect the final response structure and confirm it includes all 
required findings in order and ends with a clear hotel-versus-car cost comparison.","max_points":80}]}} +{"task_id":"e7596a6d6079be82e5219c1ac1c5f40f33d2bce8","confirmed_task":"I’m putting together a quick starter pack for a Colorado outreach idea centered on helping children in need, and I want it to feel grounded before I share it with anyone. Please start on Google and find at least three Colorado charities that specifically help children in need, then open each organization’s official site in its own tab so you can confirm it’s the real organization and leave those tabs open so I can look at them later; I need each charity’s name and official website for the brief. After that, go to Microsoft’s nonprofit resources site and look for at least three software companies listed there along with the exact nonprofit discount or free-program names they offer, because I want to include practical tools these kinds of charities could actually use; if the offer details live on separate pages, open those in separate tabs too and keep the most useful one visible so I have browser proof of what you found. Then go back to Google and look up how lower, middle, and upper class are commonly described in the U.S., including income ranges and the main factors people use beyond income, and do the same specifically for Colorado so I can shape future donor messaging with a little context. Please pull everything together into one organized brief with the charity list, the software offers, and concise U.S. 
and Colorado class summaries, including a plain-language definition of middle class.","website":"https://www.google.com","level":"medium","reference_length":7,"categories":["Community and Society > Philanthropy"],"precomputed_rubric":{"items":[{"criterion":"Identify at least three Colorado charities that help children in need and provide each organization’s name plus official website URL.","description":"Identify at least three Colorado charities that help children in need and provide each organization’s name plus official website URL.\n\nHow a grader verifies this: Grader confirms the final brief lists three or more Colorado child-focused charities and that corresponding official organization sites were opened in browser tabs.","max_points":240},{"criterion":"Use browser verification for the charity research by opening each organization’s official site in its own tab and leaving those tabs available for review.","description":"Use browser verification for the charity research by opening each organization’s official site in its own tab and leaving those tabs available for review.\n\nHow a grader verifies this: Grader checks that multiple charity website tabs are open and correspond to the organizations named in the brief.","max_points":100},{"criterion":"List at least three software companies from Microsoft nonprofit resources and include the exact nonprofit discount or free-program name offered by each.","description":"List at least three software companies from Microsoft nonprofit resources and include the exact nonprofit discount or free-program name offered by each.\n\nHow a grader verifies this: Grader confirms the companies and program names match Microsoft nonprofit resources content shown in the browser.","max_points":200},{"criterion":"Provide a short description of each nonprofit software offer and keep at least one supporting Microsoft offer page visible or open as browser proof.","description":"Provide a short description of each nonprofit software 
offer and keep at least one supporting Microsoft offer page visible or open as browser proof.\n\nHow a grader verifies this: Grader checks the brief includes descriptions and that one or more relevant Microsoft nonprofit resource or offer-detail tabs remain open.","max_points":120},{"criterion":"Summarize U.S. lower-, middle-, and upper-class income ranges and the key factors commonly used to classify class status, including a plain-language definition of middle class.","description":"Summarize U.S. lower-, middle-, and upper-class income ranges and the key factors commonly used to classify class status, including a plain-language definition of middle class.\n\nHow a grader verifies this: Grader confirms the brief contains U.S. class ranges, non-income factors, and a clear plain-language middle-class summary.","max_points":140},{"criterion":"Include the same class-income summary for Colorado, covering lower-, middle-, and upper-class income ranges and key classification factors.","description":"Include the same class-income summary for Colorado, covering lower-, middle-, and upper-class income ranges and key classification factors.\n\nHow a grader verifies this: Grader confirms the brief separately includes Colorado-specific class ranges and factors, not just national information.","max_points":120},{"criterion":"Return all findings as one organized brief that combines the charity list, software tools list, and U.S. and Colorado class-income summaries.","description":"Return all findings as one organized brief that combines the charity list, software tools list, and U.S. 
and Colorado class-income summaries.\n\nHow a grader verifies this: Grader checks the final response is structured as a single coherent brief with all required sections present.","max_points":80}]}} +{"task_id":"908c9a864e81539503be6ca074788c462b2e1319","confirmed_task":"I’m putting together a quick pop-culture briefing for a friend group chat, and I want it to feel like one connected snapshot instead of a pile of random notes. Could you start on Wikipedia and pull a short, clean summary of Snowfall so I have the premise, setting, and what the show is mainly about, then do the same on Wikipedia for A Knight of the Seven Kingdoms, making sure to note the main characters it follows and how they connect back to the bigger Game of Thrones world through family or house relationships so I can contrast those two scripted shows. After that, go to Lifetime’s site and open the actual Married at First Sight Season 18 page to see where it says to watch it, and list the Season 18 episodes that are currently shown there, making sure Episode 4 is included if it’s visible; please leave that season page open so I can glance at the episode list myself. Then use Google to figure out which season or seasons of Chicago P.D. include Vanessa Rojas, because I want one quick character-specific network TV fact in the briefing. Once the TV part is set, head to Reddit and find an actual discussion thread about Chicago P.D., open the thread so you can verify it’s live, and give me the thread title. Then browse r/starterpacks and grab two recent funny meme post titles that feel like good examples of lighter community chatter, opening each in its own tab so I can compare them later. 
In the end, give me one compact briefing that ties all of that together naturally.","website":"https://www.google.com","level":"medium","reference_length":7,"categories":["Arts & Entertainment > Streaming & Online TV","Arts & Entertainment > Music","Computers Electronics and Technology > Social Media Networks"],"precomputed_rubric":{"items":[{"criterion":"Provide a concise Wikipedia-based summary of Snowfall that includes the show's premise, setting, and main story focus.","description":"Provide a concise Wikipedia-based summary of Snowfall that includes the show's premise, setting, and main story focus.\n\nHow a grader verifies this: Grader checks that the response includes all three elements and that the Snowfall Wikipedia page was visited.","max_points":140},{"criterion":"Provide a concise Wikipedia-based summary of A Knight of the Seven Kingdoms that names the main characters and explains their relationship to the broader Game of Thrones world through lineage, house ties, or background.","description":"Provide a concise Wikipedia-based summary of A Knight of the Seven Kingdoms that names the main characters and explains their relationship to the broader Game of Thrones world through lineage, house ties, or background.\n\nHow a grader verifies this: Grader confirms the response names the principal characters and includes their relevant connections as visible on the Wikipedia page.","max_points":160},{"criterion":"Report where Married at First Sight Season 18 can be watched on Lifetime and list the Season 18 episodes currently shown there, including Episode 4 if visible.","description":"Report where Married at First Sight Season 18 can be watched on Lifetime and list the Season 18 episodes currently shown there, including Episode 4 if visible.\n\nHow a grader verifies this: Grader verifies the Lifetime Season 18 page is open, shows watch availability information, and displays an episode list containing Episode 4 if present on the 
page.","max_points":200},{"criterion":"Determine and report which season or seasons of Chicago P.D. include Vanessa Rojas.","description":"Determine and report which season or seasons of Chicago P.D. include Vanessa Rojas.\n\nHow a grader verifies this: Grader confirms the season number(s) reported are supported by the Google search results or opened source pages.","max_points":120},{"criterion":"Find and provide the title of one live Reddit discussion thread about Chicago P.D.","description":"Find and provide the title of one live Reddit discussion thread about Chicago P.D.\n\nHow a grader verifies this: Grader checks that an actual Reddit thread page about Chicago P.D. was opened and that the reported title matches the visible post title.","max_points":120},{"criterion":"Identify two recent funny meme post titles from r/starterpacks.","description":"Identify two recent funny meme post titles from r/starterpacks.\n\nHow a grader verifies this: Grader confirms two posts from r/starterpacks were opened in separate tabs and that the titles match visible recent posts.","max_points":110},{"criterion":"Return all requested TV and Reddit findings as one compact, connected briefing rather than disconnected notes.","description":"Return all requested TV and Reddit findings as one compact, connected briefing rather than disconnected notes.\n\nHow a grader verifies this: Grader checks that the final response integrates all required findings into a single cohesive briefing.","max_points":150}]}} +{"task_id":"b4d11b2d7069bf45410b6784544504b23360b34a","confirmed_task":"I’m trying to put together a really cheap late-November city-break from London and want a realistic shortlist I can actually look at in the browser. 
Please start on Skyscanner and search round-trip flights leaving from London for November 18 and coming back November 28 then pull out four destination options that look viable and note the destination city or airport and the lowest price shown for each so I can see which places are even in budget. Open the most promising flight results in their own tabs and leave the cheapest-looking Skyscanner option visible so I have a reference point. Once you’ve got that shortlist, use it to decide which destination appears cheapest overall. After that, go to Ryanair and check London to Budapest for the week starting November 18 because Budapest is usually a low-cost fallback for me and I want to know whether it still deserves a spot on the list; list the cheapest available options you can find in ascending price order and keep the Budapest results page open so I can compare it visually with Skyscanner. Since I may need to stay in London the night before flying, switch to Booking.com and look up 22 Suites in London, report its guest review score, and summarize at least three recent guest reviews so I can tell whether the nice rating actually matches what people are saying. Then do the same quality check for The Chapter Hotels – Finsbury Park by reporting its overall review score and review descriptor and reading into the recent reviews enough to judge whether it really seems dependable for a one-night pre-flight stay. If possible, open both hotel pages in separate tabs and leave the review sections visible so I can compare them side by side. 
In the end, give me a short recommendation on which flight option you’d prioritize, whether Budapest should stay on the shortlist as a backup, and which of those two London hotels seems more reliable for the night before an early flight.","website":"https://www.google.com","level":"medium","reference_length":7,"categories":["Travel and Tourism > Air Travel","Travel and Tourism > Accommodation and Hotels"],"precomputed_rubric":{"items":[{"criterion":"Provide four Skyscanner round-trip destination options from London for 11/18 to 11/28, each with destination city/airport and lowest displayed price.","description":"Provide four Skyscanner round-trip destination options from London for 11/18 to 11/28, each with destination city/airport and lowest displayed price.\n\nHow a grader verifies this: Grader confirms four destinations and prices are extracted from visible Skyscanner search results for the specified dates.","max_points":220},{"criterion":"Identify which of the four Skyscanner destinations appears cheapest overall and keep the cheapest-looking result visible/open as browser proof.","description":"Identify which of the four Skyscanner destinations appears cheapest overall and keep the cheapest-looking result visible/open as browser proof.\n\nHow a grader verifies this: Grader confirms the chosen cheapest option matches the visible Skyscanner tabs/results and that a cheapest result page remains open.","max_points":140},{"criterion":"List the cheapest Ryanair London-to-Budapest flight options for the week starting 11/18 in ascending price order.","description":"List the cheapest Ryanair London-to-Budapest flight options for the week starting 11/18 in ascending price order.\n\nHow a grader verifies this: Grader confirms the Ryanair results page shows London to Budapest flights in the requested week and that the reported options are ordered from lowest to highest price.","max_points":160},{"criterion":"Report the Booking.com guest review score for 22 Suites and 
summarize at least three recent guest reviews.","description":"Report the Booking.com guest review score for 22 Suites and summarize at least three recent guest reviews.\n\nHow a grader verifies this: Grader confirms the 22 Suites property page and review section are open and that the score plus three review summaries align with visible recent reviews.","max_points":160},{"criterion":"Report the Booking.com overall review score and descriptor for The Chapter Hotels – Finsbury Park and judge whether reviews are generally very positive based on recent comments.","description":"Report the Booking.com overall review score and descriptor for The Chapter Hotels – Finsbury Park and judge whether reviews are generally very positive based on recent comments.\n\nHow a grader verifies this: Grader confirms the property page shows the stated score and descriptor and that the positivity judgment is supported by visible recent reviews.","max_points":140},{"criterion":"State whether Budapest should remain on the shortlist as a backup based on comparison with the Skyscanner shortlist.","description":"State whether Budapest should remain on the shortlist as a backup based on comparison with the Skyscanner shortlist.\n\nHow a grader verifies this: Grader confirms the recommendation explicitly compares Ryanair Budapest pricing against the Skyscanner destination prices.","max_points":80},{"criterion":"Recommend which flight option to prioritize and which of the two London hotels seems more reliable for a pre-flight overnight stay.","description":"Recommend which flight option to prioritize and which of the two London hotels seems more reliable for a pre-flight overnight stay.\n\nHow a grader verifies this: Grader confirms the final recommendation references both the cheapest/most suitable flight findings and the comparative hotel review evidence from Booking.com.","max_points":100}]}} +{"task_id":"46ecc9a5dce920d6c72198b4efb1a46855bac7d0","confirmed_task":"I’m trying to put together a 
budget-conscious outfit shortlist and want it to feel like a real shopping comparison, not just random picks. Start on Depop and go to the seller page for rainbow_bebe to see whether they currently have any prom dress listings under $100, because if there’s a good one there I’d use that as my budget anchor; open at least one qualifying listing in its own tab so I can see the actual photos and price on the live listing page. If that seller doesn’t have a convincing under-$100 option, switch to PrettyLittleThing and find an emerald green dress that’s available in size 12, then open the product page and note the name and price so I have a fallback main dress candidate. After that, on JJ’s House, browse cocktail dresses and find one deep V-neck option that looks like a dressier comparison point, and keep that product page open too so I can compare the styling. Once you’ve seen those dress options, decide which dress feels like the best value based on price and what’s actually available, then go to Old Navy and pick one comfortable-looking, affordable women’s jogger to round out the shortlist as a casual extra, making sure you open the actual product page and capture the name and price. Then check Tecovas for one men’s cowboy boot made of real leather, open the product page, and list the design or color options shown there so I can see how many choices there are. Finally, go to ALS and look up the Nike Phantom 6 Club FG/MG low-top men’s soccer cleat, tell me the current price, and say clearly whether it stays under $60 so I can judge whether this whole shortlist still feels budget-friendly. 
Please leave the key product tabs open for the dress comparison and give me a final shortlist with the item names, prices, the Tecovas options, and a quick note on which dress you’d personally pick as the best value.","website":"https://www.google.com","level":"medium","reference_length":8,"categories":["Ecommerce & Shopping > Ecommerce and Shopping - Other","Lifestyle > Fashion and Apparel"],"precomputed_rubric":{"items":[{"criterion":"Depop seller rainbow_bebe is checked for prom dress listings under $100, and if a qualifying listing exists, at least one live listing title and price are captured from the opened listing page.","description":"Depop seller rainbow_bebe is checked for prom dress listings under $100, and if a qualifying listing exists, at least one live listing title and price are captured from the opened listing page.\n\nHow a grader verifies this: Grader can confirm navigation to the rainbow_bebe seller area on Depop and see either a qualifying under-$100 prom dress listing opened in a tab or a clear finding that no such qualifying listing is present.","max_points":160},{"criterion":"A PrettyLittleThing emerald green dress available in size 12 is found and its product name and price are recorded.","description":"A PrettyLittleThing emerald green dress available in size 12 is found and its product name and price are recorded.\n\nHow a grader verifies this: Grader can verify the PrettyLittleThing product page shows an emerald green dress with size 12 available and visible name and price.","max_points":130},{"criterion":"A JJ’s House cocktail dress with a deep V-neck is identified, and its name and price are captured from the product page.","description":"A JJ’s House cocktail dress with a deep V-neck is identified, and its name and price are captured from the product page.\n\nHow a grader verifies this: Grader can verify the JJ’s House product page shows a cocktail dress with deep V-neck styling and visible product name and price, with the page kept 
open for comparison.","max_points":130},{"criterion":"One dress is selected as the best-value pick based on the earlier dress findings, with a brief comparison note explaining the choice.","description":"One dress is selected as the best-value pick based on the earlier dress findings, with a brief comparison note explaining the choice.\n\nHow a grader verifies this: Grader can compare the reported dress options and confirm that the final answer names one chosen dress and includes a short rationale tied to price and/or availability.","max_points":170},{"criterion":"An Old Navy women’s jogger that appears comfortable and affordable is selected from the product page, with name and price recorded.","description":"An Old Navy women’s jogger that appears comfortable and affordable is selected from the product page, with name and price recorded.\n\nHow a grader verifies this: Grader can verify an Old Navy jogger product page is opened and that the reported name and price match the visible page details.","max_points":110},{"criterion":"A men’s real leather cowboy boot is found on Tecovas, and the boot name plus visible design or color options are listed.","description":"A men’s real leather cowboy boot is found on Tecovas, and the boot name plus visible design or color options are listed.\n\nHow a grader verifies this: Grader can verify the Tecovas product page indicates real leather and shows the product name along with selectable design or color options.","max_points":120},{"criterion":"The ALS product page for the Nike Phantom 6 Club FG/MG low-top men’s soccer cleat is found, the current price is recorded, and the answer clearly states whether it is under $60.","description":"The ALS product page for the Nike Phantom 6 Club FG/MG low-top men’s soccer cleat is found, the current price is recorded, and the answer clearly states whether it is under $60.\n\nHow a grader verifies this: Grader can verify the ALS product page title matches the cleat and the visible price 
supports the under-$60 conclusion.","max_points":100},{"criterion":"The final shortlist consolidates all required items and findings: dress results, chosen best-value dress, Old Navy jogger, Tecovas boot with options, and ALS cleat affordability check.","description":"The final shortlist consolidates all required items and findings: dress results, chosen best-value dress, Old Navy jogger, Tecovas boot with options, and ALS cleat affordability check.\n\nHow a grader verifies this: Grader can confirm the final response includes all requested item names, prices where applicable, the Tecovas options, and the cleat budget judgment in one coherent shortlist.","max_points":80}]}} +{"task_id":"80257c727b8e8c5426c1b03a2a4493231747e5d7","confirmed_task":"I’m mocking up a tiny React art prototype for a web page and want you to help me gather the pieces in a way I can actually look at in the browser. Start on JetBrains and figure out which IDE they specifically position for editing and organizing web code, because I want a sensible default tool before I build anything. Then use Google Images to find one cartoon mouse image with a transparent background that could work as a reference asset, and open the actual source page plus the direct image in separate tabs so I can visually confirm it really has transparency and isn’t just a white background baked in. While you’re in Google Images, also search for “taffy tails stretchy” and pick one result for Stretchy from Taffy Tails, opening the result page too so I can compare whether I want a generic mouse look or that more specific character style. After that, go to react-svgr.com and convert a simple SVG into a React component so I have a vector element to pair with the raster mouse image in the prototype, and keep the conversion result visible. 
Once the visual side is sorted out, use Google to find the current guidance on whether a project made in CapCut Web can be moved into the CapCut desktop app, because I may reuse these same art assets in a promo clip later, and then find instructions for changing the background color behind an imported photo in CapCut so the background can match whichever mouse style looked better from the earlier image search. Please leave the most useful image/source tabs open for comparison and give me a concise build note with the recommended IDE, both image sources, the React component code, and short CapCut instructions tailored to using those mouse assets.","website":"https://www.google.com","level":"medium","reference_length":7,"categories":["Computers Electronics and Technology > Programming and Developer Software","Arts & Entertainment > Visual Arts and Design"],"precomputed_rubric":{"items":[{"criterion":"Correctly identify the JetBrains IDE intended for editing and organizing web code and recommend it by name.","description":"Correctly identify the JetBrains IDE intended for editing and organizing web code and recommend it by name.\n\nHow a grader verifies this: Grader confirms the final note names WebStorm and that the JetBrains page viewed corresponds to the web-development IDE.","max_points":140},{"criterion":"Provide one cartoon mouse image with a transparent background, including a direct image file URL and the source page URL.","description":"Provide one cartoon mouse image with a transparent background, including a direct image file URL and the source page URL.\n\nHow a grader verifies this: Grader confirms separate browser tabs were opened for the direct image file and the source page, and the final note includes both URLs.","max_points":180},{"criterion":"Provide one Google Images result for Stretchy from Taffy Tails with its source/result page for visual comparison.","description":"Provide one Google Images result for Stretchy from Taffy Tails with its 
source/result page for visual comparison.\n\nHow a grader verifies this: Grader confirms a Google Images search for “taffy tails stretchy” was performed, a result page was opened, and the final note includes the selected result/source.","max_points":120},{"criterion":"Convert an SVG into a React component using SVGR and provide the resulting React component code snippet.","description":"Convert an SVG into a React component using SVGR and provide the resulting React component code snippet.\n\nHow a grader verifies this: Grader confirms react-svgr.com shows a conversion result and the final note includes a plausible generated React component snippet.","max_points":180},{"criterion":"Summarize how to move a CapCut Web project into the CapCut desktop app, including the recommended method or a confirmation that direct transfer is not possible, with supporting evidence.","description":"Summarize how to move a CapCut Web project into the CapCut desktop app, including the recommended method or a confirmation that direct transfer is not possible, with supporting evidence.\n\nHow a grader verifies this: Grader confirms the final note includes a clear transfer conclusion and cites or reflects information found from search results/pages opened during browsing.","max_points":160},{"criterion":"Summarize instructions for changing the background color behind an imported photo in CapCut.","description":"Summarize instructions for changing the background color behind an imported photo in CapCut.\n\nHow a grader verifies this: Grader confirms the final note includes a usable sequence of CapCut actions for changing the background color behind an imported image.","max_points":100},{"criterion":"Return a concise build note that combines the recommended IDE, both image sources, the React component code, and CapCut instructions tailored to the compared mouse assets.","description":"Return a concise build note that combines the recommended IDE, both image sources, the React component 
code, and CapCut instructions tailored to the compared mouse assets.\n\nHow a grader verifies this: Grader confirms the final response synthesizes outputs from all prior steps and explicitly tailors the CapCut guidance to the generic mouse versus Stretchy comparison.","max_points":120}]}} +{"task_id":"976970ff5d37116847b8a9351a0922196bee88a2","confirmed_task":"I’m putting together a short youth mental-health workshop and want a tidy evidence pack I can actually cite, so could you do this in the browser and keep the key pages open for me? Start on Google and find one peer-reviewed academic source that clearly says eating disorders commonly begin during adolescence or the early teen years, then open the actual article or abstract page in its own tab and pull the exact supporting quote plus enough citation detail that I could reuse it. From there, use Google again to find one credible source explaining that the human brain keeps growing or developing after childhood, and open the source page itself so I can see it’s a real organization or publication rather than just a search snippet. After that, still using Google, find one credible source saying human cognitive biases are influenced by evolutionary processes, and open that source in a separate tab too because I want to compare the wording across the three sources. Once those evidence tabs are gathered, go to Wikipedia’s page for Lauryn Hill’s The Miseducation of Lauryn Hill, pull at least five song titles from the track listing, and leave that album page open so I can glance at the list myself. 
Then choose the one song title that would make the best discussion prompt for teens based on the themes from the three sources, and give me a concise workshop brief that ties the evidence together with the citations, the exact quote for the eating-disorders source, the five song titles, and a short explanation of why your chosen title fits.","website":"https://www.google.com","level":"medium","reference_length":7,"categories":["Health > Mental Health","Science and Education > Science and Education - Other"],"precomputed_rubric":{"items":[{"criterion":"Provides one peer-reviewed academic source stating that eating disorders commonly begin during adolescence or the early teen years, including a usable citation and the exact supporting quote.","description":"Provides one peer-reviewed academic source stating that eating disorders commonly begin during adolescence or the early teen years, including a usable citation and the exact supporting quote.\n\nHow a grader verifies this: Grader can confirm a Google result was opened to an article or abstract page in its own tab and that the final response includes both citation details and a direct quote supporting adolescent onset.","max_points":220},{"criterion":"Provides one credible source explaining that the human brain continues to grow or develop after childhood, with a usable citation.","description":"Provides one credible source explaining that the human brain continues to grow or develop after childhood, with a usable citation.\n\nHow a grader verifies this: Grader can confirm the source page itself was opened from Google and that the final response includes a citation tied to the brain-development claim.","max_points":160},{"criterion":"Provides one credible source supporting the claim that human cognitive biases are influenced by evolutionary processes, including title, publisher, and URL.","description":"Provides one credible source supporting the claim that human cognitive biases are influenced by evolutionary 
processes, including title, publisher, and URL.\n\nHow a grader verifies this: Grader can confirm the opened source page supports the claim and that the final response includes the source title, publisher, and URL.","max_points":160},{"criterion":"Uses the opened evidence tabs to extract or summarize support from all three sources in a way that is coherent for the workshop theme.","description":"Uses the opened evidence tabs to extract or summarize support from all three sources in a way that is coherent for the workshop theme.\n\nHow a grader verifies this: Grader can confirm multiple source tabs were opened and the final brief accurately connects adolescent onset, ongoing brain development, and evolutionary influences on cognitive bias.","max_points":140},{"criterion":"Lists at least five song titles from Lauryn Hill’s The Miseducation of Lauryn Hill sourced from Wikipedia.","description":"Lists at least five song titles from Lauryn Hill’s The Miseducation of Lauryn Hill sourced from Wikipedia.\n\nHow a grader verifies this: Grader can confirm the Wikipedia album page is open and that at least five track titles in the response match the visible track listing.","max_points":120},{"criterion":"Selects one song title from the album as a teen discussion prompt and explains why it fits the workshop themes drawn from the gathered evidence.","description":"Selects one song title from the album as a teen discussion prompt and explains why it fits the workshop themes drawn from the gathered evidence.\n\nHow a grader verifies this: Grader can confirm the chosen title appears in the Wikipedia track list and that the explanation meaningfully links to the three evidence themes.","max_points":100},{"criterion":"Returns the findings as a concise workshop brief including the three requested evidence sources, citations, the exact quote for the eating-disorders source, at least five song titles, and the chosen title with rationale.","description":"Returns the findings as a concise 
workshop brief including the three requested evidence sources, citations, the exact quote for the eating-disorders source, at least five song titles, and the chosen title with rationale.\n\nHow a grader verifies this: Grader can confirm all required elements are present in one concise final brief and that the visible pages used match the cited sources.","max_points":100}]}} +{"task_id":"5944875c7f32a98df978040e4447534f7ba0aadb","confirmed_task":"I’m putting together a small Zelda: Breath of the Wild–themed dessert setup for a get-together and want it to feel like it came straight out of the game, but still be practical to make in a real kitchen. Please start on Google and find a genuinely comprehensive BOTW cooking guide or full recipe list that covers the game’s meals and elixirs, then open the actual guide page so I can visually confirm it looks complete and leave that tab open as the inspiration reference. From that, pick a tropical direction that feels like a natural fit for a real dessert—something in the fruit-and-island vibe of the game—and then use Google to find a copycat Disney Dole Whip recipe page with a clear ingredient list and simple prep, because I think that could become the main dessert. Since I may want a richer backup option for people who don’t want pineapple, go to Sally’s Baking Addiction and find the chocolate buttercream frosting recipe that takes 20 minutes or less and uses 6 ingredients, and keep that recipe page open in its own tab too so I can compare the two dessert directions side by side. After that, use Google to find one recommended method for making a custom photo frame with an X-Carve CNC that I could turn into a Zelda-themed sign or menu card, and summarize the materials, software or workflow, and key steps. 
Then use Google again to find one reliable method for mirroring an iPhone screen to another device so I can show the BOTW inspiration page while assembling everything, and finally find how to switch an iPad keyboard from the floating mini keyboard back to the full-size keyboard in case I end up typing labels on an iPad instead. In the end, give me a concise plan that ties the BOTW cooking inspiration to the tropical dessert choice, the chocolate comparison option, and the presentation setup, and mention which tabs you left open for me to review.","website":"https://www.google.com","level":"medium","reference_length":7,"categories":["Food and Drink > Cooking and Recipes","Games > Video Games Consoles and Accessories"],"precomputed_rubric":{"items":[{"criterion":"A comprehensive, trustworthy BOTW cooking guide or full recipe list is found via Google, and the actual guide page is opened and left open so it can be visually confirmed as a complete reference covering the game's meals and elixirs.","description":"A comprehensive, trustworthy BOTW cooking guide or full recipe list is found via Google, and the actual guide page is opened and left open so it can be visually confirmed as a complete reference covering the game's meals and elixirs.\n\nHow a grader verifies this: Grader can confirm the open tab displays a BOTW cooking guide or recipe list page that appears comprehensive, covering meals and elixirs from the game.","max_points":220},{"criterion":"A tropical BOTW-inspired dessert direction is selected based on the game inspiration, and a copycat Disney Dole Whip recipe page is found with ingredients and basic preparation captured.","description":"A tropical BOTW-inspired dessert direction is selected based on the game inspiration, and a copycat Disney Dole Whip recipe page is found with ingredients and basic preparation captured.\n\nHow a grader verifies this: Grader can confirm a Dole Whip recipe page is open and the response includes a clear tropical theme 
connection plus the recipe’s ingredient list and prep summary.","max_points":160},{"criterion":"A Sally’s Baking Addiction chocolate buttercream frosting recipe is found that takes 20 minutes or less and uses 6 ingredients, with the ingredient list and total time recorded, and the page left open in its own tab.","description":"A Sally’s Baking Addiction chocolate buttercream frosting recipe is found that takes 20 minutes or less and uses 6 ingredients, with the ingredient list and total time recorded, and the page left open in its own tab.\n\nHow a grader verifies this: Grader can confirm the Sally’s Baking Addiction recipe tab is open and the response states the total time and a 6-ingredient list matching the page.","max_points":160},{"criterion":"One recommended X-Carve CNC custom photo frame method is found and summarized with key steps, materials, and any software or workflow details relevant to making a Zelda-themed sign or menu card.","description":"One recommended X-Carve CNC custom photo frame method is found and summarized with key steps, materials, and any software or workflow details relevant to making a Zelda-themed sign or menu card.\n\nHow a grader verifies this: Grader can confirm a source page about making a custom frame with an X-Carve CNC is open and the response includes materials, process steps, and software/workflow notes.","max_points":140},{"criterion":"One reliable method for mirroring or sharing an iPhone screen to another device is found and summarized with the required steps and destination requirements.","description":"One reliable method for mirroring or sharing an iPhone screen to another device is found and summarized with the required steps and destination requirements.\n\nHow a grader verifies this: Grader can confirm a relevant help or instructional page is open and the response explains a valid mirroring method with actionable steps.","max_points":100},{"criterion":"At least one valid method is provided for returning the iPad 
floating mini keyboard to the full-size keyboard.","description":"At least one valid method is provided for returning the iPad floating mini keyboard to the full-size keyboard.\n\nHow a grader verifies this: Grader can confirm a relevant help page is open and the response includes a valid method such as dragging from the keyboard control or pinching outward on the mini keyboard.","max_points":80},{"criterion":"The final response is a concise integrated plan that clearly connects the BOTW recipe inspiration to the tropical dessert choice, the richer chocolate comparison option, and the presentation/display setup, while mentioning which tabs were left open for review.","description":"The final response is a concise integrated plan that clearly connects the BOTW recipe inspiration to the tropical dessert choice, the richer chocolate comparison option, and the presentation/display setup, while mentioning which tabs were left open for review.\n\nHow a grader verifies this: Grader can confirm the final write-up synthesizes all findings into one coherent plan and explicitly references the open tabs for the BOTW guide and dessert comparison pages.","max_points":140}]}} +{"task_id":"92cf5ed054aa12e3e99a63bf7ce0e203ea85cee7","confirmed_task":"Help me open the Stanford CS231N introduction lecture on YouTube and start playing it. Create a list of 3 of the datasets they mention in the lecture (with relevant timestamps for each), and then help me find a way to download these 3 datasets. Go to the source and pull up the tab with the download link for all three datasets. 
Keep all relevant dataset links open in multiple tabs and provide a summary of them.","website":"https://www.google.com","level":"medium","reference_length":3,"categories":["Science and Education > Science and Education - Other","Computers Electronics and Technology > Programming and Developer Software"],"precomputed_rubric":{"items":[{"criterion":"The agent identifies the CS231N introduction lecture on youtube and plays it.","description":"The agent identifies the CS231N introduction lecture on youtube and plays it.\n\nHow a grader verifies this: Grader can confirm the open and playing introduction lecture.","max_points":250},{"criterion":"The agent watches the video and identifies the 3 datasets they mention in the lecture video, then produces a list with relevant time stamps.","description":"The agent watches the video and identifies the 3 datasets they mention in the lecture video, then produces a list with relevant time stamps.\n\nHow a grader verifies this: Grader can confirm the agent identified 3 datasets mentioned in the CS231N introduction lecture video and listed each with a relevant timestamp.","max_points":250},{"criterion":"The agent opens and finds all three dataset links separately that are valid and correct according to the lecture video, and provides a summary of the datasets.","description":"The agent opens and finds all three dataset links separately that are valid and correct according to the lecture video, and provides a summary of the datasets.\n\nHow a grader verifies this: 3 open dataset sources sourced from the introduction CS231N video, and the final response includes a summary describing each dataset.","max_points":500}]}} +{"task_id":"cbe12703e129832feb5e7b56c4141476fb423338","confirmed_task":"I’m trying to put together a compact entertainment lineup for this week that gives me a nice mix of short narrative games, a few more action-heavy palate cleansers, some quick browser-game breaks, and one cozy book. 
Please start on Reddit and look through real recommendation threads for at least five story-heavy games that people describe as short or easy to finish, because that’ll set the mood I’m aiming for; open the most useful Reddit threads in separate tabs and leave at least one of the key threads open so I can glance at the discussion myself. Then, still using Reddit and that same taste baseline, find at least three FPS shooters that people recommend as good contrast after emotional or story-driven games, so I can see what might work as a reset between heavier experiences. After that, go to Poki and find three mind-challenging browser games I could dip into between the bigger titles, and open each game’s actual Poki page in its own tab so I can visually confirm they’re the right kind of quick break. Finally, head to Goodreads and pull up the page for The Very Secret Society of Irregular Witches, read at least three user reviews, and give me the main takeaways from those reviews with an eye toward whether it matches the cozy, character-driven vibe from the first Reddit step; leave the Goodreads book page open too so I can look at the rating and cover. 
In the end, give me one concise recommendation bundle with the game and book titles, links where they make sense, and a short note on how each FPS pick, each Poki game, and the book complement the story-heavy shortlist you found first.","website":"https://www.google.com","level":"medium","reference_length":5,"categories":["Games > Video Games Consoles and Accessories","Arts & Entertainment > Books and Literature"],"precomputed_rubric":{"items":[{"criterion":"Identify at least five story-heavy short game recommendations sourced from Reddit threads and list their titles.","description":"Identify at least five story-heavy short game recommendations sourced from Reddit threads and list their titles.\n\nHow a grader verifies this: Grader confirms at least five game titles are present and that Reddit recommendation threads were opened, with at least one relevant Reddit thread visibly left open.","max_points":280},{"criterion":"Identify at least three FPS shooter recommendations sourced from Reddit threads that fit as palate cleansers after the story-heavy games.","description":"Identify at least three FPS shooter recommendations sourced from Reddit threads that fit as palate cleansers after the story-heavy games.\n\nHow a grader verifies this: Grader confirms at least three FPS titles are listed from Reddit discussions and that the final notes frame them as contrast to the narrative shortlist.","max_points":200},{"criterion":"Find three mind-challenging browser games on Poki and provide each title with its direct Poki game page link.","description":"Find three mind-challenging browser games on Poki and provide each title with its direct Poki game page link.\n\nHow a grader verifies this: Grader confirms three Poki game titles and direct Poki URLs are included, and that the actual game pages were opened in separate tabs.","max_points":200},{"criterion":"Read at least three Goodreads user reviews for The Very Secret Society of Irregular Witches and summarize the main 
takeaways from those reviews.","description":"Read at least three Goodreads user reviews for The Very Secret Society of Irregular Witches and summarize the main takeaways from those reviews.\n\nHow a grader verifies this: Grader confirms the response references takeaways from at least three user reviews and that the Goodreads book page is visibly open.","max_points":170},{"criterion":"Return one concise recommendation bundle with titles, links where applicable, and brief notes explaining how the FPS picks, Poki games, and the book complement the story-heavy shortlist from step 1.","description":"Return one concise recommendation bundle with titles, links where applicable, and brief notes explaining how the FPS picks, Poki games, and the book complement the story-heavy shortlist from step 1.\n\nHow a grader verifies this: Grader confirms the final output combines all categories into one bundle and includes explicit complement notes tying later picks back to the story-heavy shortlist.","max_points":150}]}} +{"task_id":"8ac68fada21a861a0cf341b10bdef88a7ecd89de","confirmed_task":"I’m putting together a small study pack for a 5th grader and want it to feel practical, not overwhelming. Please start on Google and find one printable spelling worksheet that would work for a 5th grader who may have dyslexia, ideally something with a clean layout and readable formatting, and open the actual worksheet page so you can verify it really looks printable; keep the worksheet title and the direct page or download link, and leave that tab open so I can look at it later. Then, using that same age level, go back to Google and find one free, kid-friendly math practice site for 5th grade, and don’t just name the homepage — open the exact page where a 5th grader should begin practicing so I have a real starting point, and keep that in its own tab too. 
After that, use Google to find a video lesson that clearly covers both explicit and recursive formulas for arithmetic sequences as a stretch resource for later, then open the actual video page, start playing it, and tell me the title, URL, and creator or channel so I know it’s the right lesson. Finally, go to Citation Machine and confirm there’s a citation generator there that supports both APA and Harvard styles, because I want one place to cite the worksheet, the math site, and the video; if you can, navigate to where those style options are visible and leave that page open as proof. At the end, give me a short organized resource list with links for the worksheet, the math starting page, the video, and the citation generator site.","website":"https://www.google.com","level":"medium","reference_length":5,"categories":["Science and Education > Education"],"precomputed_rubric":{"items":[{"criterion":"Provide one printable spelling worksheet suitable for a 5th grader with dyslexia, including the worksheet title and a direct page or download/print link.","description":"Provide one printable spelling worksheet suitable for a 5th grader with dyslexia, including the worksheet title and a direct page or download/print link.\n\nHow a grader verifies this: Grader can confirm an open worksheet tab shows a printable worksheet page with readable formatting and that the returned title and link match the visible page.","max_points":270},{"criterion":"Provide one free, kid-friendly 5th-grade math practice website and include the exact starting page URL for practice.","description":"Provide one free, kid-friendly 5th-grade math practice website and include the exact starting page URL for practice.\n\nHow a grader verifies this: Grader can confirm the open tab is not just a homepage but a specific 5th-grade practice page appropriate for a child learner, and that the returned URL matches it.","max_points":240},{"criterion":"Provide one video lesson that explains both explicit and 
recursive formulas for arithmetic sequences, including the title, URL, and creator/channel name.","description":"Provide one video lesson that explains both explicit and recursive formulas for arithmetic sequences, including the title, URL, and creator/channel name.\n\nHow a grader verifies this: Grader can confirm the open video page is playing or paused on the actual lesson and that the visible title and channel/creator match the returned details.","max_points":240},{"criterion":"Identify Citation Machine as a citation generator site that supports both APA and Harvard styles, including the site name and URL.","description":"Identify Citation Machine as a citation generator site that supports both APA and Harvard styles, including the site name and URL.\n\nHow a grader verifies this: Grader can confirm the Citation Machine page visibly shows or allows selection of both APA and Harvard citation styles.","max_points":150},{"criterion":"Return the findings as a short organized resource list with links for the worksheet, math starting page, video, and citation generator.","description":"Return the findings as a short organized resource list with links for the worksheet, math starting page, video, and citation generator.\n\nHow a grader verifies this: Grader can confirm the final response contains four clearly labeled entries with the requested titles/details and working links corresponding to the opened pages.","max_points":100}]}} +{"task_id":"365c0ba179de85bd5821988800b5706137576c2a","confirmed_task":"I want a compact current-events briefing I can skim in a minute or two, like the kind of snapshot a well-informed friend would pull together for me in a browser. 
Start on The New York Times homepage and grab the five biggest headlines that are visibly featured there right now, along with the section each one belongs to, because I want a broad read on the day before diving into anything niche; please open at least two of those headline stories in separate tabs so you can confirm the section labels and leave the NYT homepage open as a reference. Then use Google News or a normal Google search to find one recent article from a reliable publication about Call of Duty: Black Ops 6, open the actual article page, and give me the publication name plus the main takeaway so the briefing has one entertainment/tech item too. After that, go to Reddit and find the r/Futurology discussion asking what people think the future of the U.S. will be, read through the comment thread on the actual post page, and summarize the main themes people are expressing; keep that Reddit thread open so I can look at the tone myself. Finally, use Google to find a reliable source explaining how a U.S. federal government shutdown affects SNAP benefits, open the source page that actually answers it, and summarize clearly whether benefits continue and what exceptions or caveats apply, because I want one practical policy note in the mix. 
Please return everything as one concise briefing with labeled sections for NYT, Black Ops 6, Reddit sentiment, and SNAP shutdown guidance.","website":"https://www.google.com","level":"medium","reference_length":6,"categories":["News & Media Publishers"],"precomputed_rubric":{"items":[{"criterion":"Include exactly five current headlines from The New York Times homepage, each paired with the correct section.","description":"Include exactly five current headlines from The New York Times homepage, each paired with the correct section.\n\nHow a grader verifies this: Grader can confirm five NYT homepage headlines were taken from the visible homepage and that at least two corresponding story tabs are open for section verification.","max_points":300},{"criterion":"Each of the five NYT headlines is paired with the correct section label as shown on the homepage.","description":"Each of the five NYT headlines is paired with the correct section label as shown on the homepage.\n\nHow a grader verifies this: Grader can confirm each headline in the final briefing includes a section label that matches the visible NYT homepage or opened story page.","max_points":150},{"criterion":"Include one recent Call of Duty: Black Ops 6 article from a reliable publication, naming the publication and summarizing the main takeaway.","description":"Include one recent Call of Duty: Black Ops 6 article from a reliable publication, naming the publication and summarizing the main takeaway.\n\nHow a grader verifies this: Grader can see a Google results path to an opened article page from a recognizable publication and match the publication name and takeaway in the final briefing.","max_points":150},{"criterion":"Summarize the main themes expressed in the comments of the Reddit r/Futurology discussion about the future of the U.S.","description":"Summarize the main themes expressed in the comments of the Reddit r/Futurology discussion about the future of the U.S.\n\nHow a grader verifies this: Grader 
can verify the agent opened the actual Reddit thread in r/Futurology and that the final summary reflects multiple recurring comment themes rather than only the post title.","max_points":200},{"criterion":"Summarize reliable information on how a U.S. federal government shutdown affects SNAP benefits, clearly stating whether benefits continue and any exceptions or caveats.","description":"Summarize reliable information on how a U.S. federal government shutdown affects SNAP benefits, clearly stating whether benefits continue and any exceptions or caveats.\n\nHow a grader verifies this: Grader can confirm an opened reliable source page found via Google that directly addresses SNAP during a shutdown and compare it to the final explanation.","max_points":150},{"criterion":"Return the results as one concise briefing with labeled sections for NYT headlines, Call of Duty: Black Ops 6, r/Futurology sentiment, and SNAP shutdown policy note.","description":"Return the results as one concise briefing with labeled sections for NYT headlines, Call of Duty: Black Ops 6, r/Futurology sentiment, and SNAP shutdown policy note.\n\nHow a grader verifies this: Final response is a single compact briefing organized into the four requested labeled sections.","max_points":50}]}} +{"task_id":"73c63095aeed43efb10a74eee7db7459c5ea9f84","confirmed_task":"I’m trying to sort out a realistic housing plan in Grand Rapids, Michigan and want to compare a normal rental against cheaper live-in alternatives, with a hotel as a short-stay fallback while I go look at places. Please start on Zillow and search Grand Rapids rentals with the monthly rent set between $1,400 and $2,400, then open one listing that looks like a real option in its own tab so I can see the photos and map, and grab the basics for it like the address, monthly price, and the Zillow listing page. 
Use that exact monthly rent as the benchmark for whether a trailer or other small living setup would actually save me money, then go to Craigslist for Grand Rapids and find at least three listings that seem suitable for living in and are priced below that Zillow benchmark; open the actual posting pages so you can verify they’re still live and note each title, price, and link. After that, because I may need somewhere temporary while I travel to inspect options, go to Booking.com and look up a hotel in Grand Rapids with visible guest reviews, open the property page, and summarize the overall guest review score plus at least three takeaways from recent reviews so I can tell what staying there would really be like. Leave the Zillow tab and the Booking.com hotel page open so I can compare them visually afterward, then give me a concise recommendation on how the rental and cheaper Craigslist alternatives stack up against the hotel, and say which seems better reviewed for a temporary stay.","website":"https://www.google.com","level":"medium","reference_length":4,"categories":["Business and Consumer Services > Real Estate","Travel and Tourism > Accommodation and Hotels"],"precomputed_rubric":{"items":[{"criterion":"Identify one Zillow rental listing in Grand Rapids, Michigan within the $1,400 to $2,400 monthly rent range and provide its address, monthly rent, and Zillow property page URL.","description":"Identify one Zillow rental listing in Grand Rapids, Michigan within the $1,400 to $2,400 monthly rent range and provide its address, monthly rent, and Zillow property page URL.\n\nHow a grader verifies this: Grader can confirm the Zillow listing page is open in a tab and shows a Grand Rapids rental with rent between $1,400 and $2,400, plus the reported address and URL match the visible page.","max_points":300},{"criterion":"Use the exact Zillow monthly rent from Step 1 as the benchmark and provide at least three Grand Rapids Craigslist listings suitable for living in that 
are priced below that benchmark, including each title, price, and posting URL.","description":"Use the exact Zillow monthly rent from Step 1 as the benchmark and provide at least three Grand Rapids Craigslist listings suitable for living in that are priced below that benchmark, including each title, price, and posting URL.\n\nHow a grader verifies this: Grader can confirm each Craigslist posting page is open or accessible, appears to be a live Grand Rapids-area listing suitable for living in, and the visible prices are all below the Zillow rent reported in Step 1.","max_points":300},{"criterion":"For one Booking.com hotel in Grand Rapids, Michigan, report the overall guest review score and summarize at least three takeaways from recent guest reviews.","description":"For one Booking.com hotel in Grand Rapids, Michigan, report the overall guest review score and summarize at least three takeaways from recent guest reviews.\n\nHow a grader verifies this: Grader can confirm the Booking.com property page is open and shows a review score, and the summarized takeaways are grounded in visible recent review content on the page.","max_points":200},{"criterion":"Return a concise recommendation comparing the Zillow rental, the cheaper Craigslist alternatives, and the Booking.com hotel, and explicitly state which appears better reviewed for a temporary stay.","description":"Return a concise recommendation comparing the Zillow rental, the cheaper Craigslist alternatives, and the Booking.com hotel, and explicitly state which appears better reviewed for a temporary stay.\n\nHow a grader verifies this: Grader can confirm the final summary references the gathered Zillow, Craigslist, and Booking.com findings and includes a clear conclusion about the better-reviewed temporary-stay option.","max_points":200}]}} +{"task_id":"e4be2c73dc00107611cd648772a11fb15c18289b","confirmed_task":"I’m trying to get the swirl “Getting and Cleaning Data” course working in RStudio, and the setup seems to 
be breaking in a few different places, so can you help me trace it in the browser like you would if you were checking this on my machine? Start on GitHub and find the actual repository location for the swirl “Getting and Cleaning Data” course files, then open the real course folder so you can verify the exact folder path I should be pointing to when I install or load that course in R. Please leave that GitHub course page open in its own tab so I can look at the folder structure afterward. Once you’ve confirmed that path, use Google to look up the specific swirl problem people hit when loading Lesson 1, “Manipulating Data with dplyr,” and find a practical fix that makes sense in the context of the course files being in the right place. If the fix mentions checking objects, packages, or column references, then in another tab look up the common reasons R or RStudio throws “object not found” or doesn’t recognize a data frame column name, because I want a short checklist of what to verify next if the lesson still fails. After that, also use Google to find the fix for the Excel import error “libxls error: unable to open file,” and make sure you get the correct R code for opening a .xlsx file with the right package and function, since that happened earlier in the same workflow. 
In the end, give me a concise troubleshooting note in that exact order — folder path first, then the Lesson 1 fix, then the object or column-name checks, then the Excel import fix — and tie the later fixes back to the earlier setup issue so it reads like one clean diagnosis.","website":"https://www.google.com","level":"medium","reference_length":5,"categories":["Science and Education > Education","Computers Electronics and Technology > Programming and Developer Software"],"precomputed_rubric":{"items":[{"criterion":"Identify the actual GitHub location of the swirl \"Getting and Cleaning Data\" course files and state the specific folder path the user should point to in R.","description":"Identify the actual GitHub location of the swirl \"Getting and Cleaning Data\" course files and state the specific folder path the user should point to in R.\n\nHow a grader verifies this: Grader can confirm the agent opened the GitHub course folder page and reported the exact course folder path consistent with the visible repository structure.","max_points":300},{"criterion":"Summarize the specific Lesson 1 (\"Manipulating Data with dplyr\") loading issue and provide a practical fix tied to the confirmed course setup context.","description":"Summarize the specific Lesson 1 (\"Manipulating Data with dplyr\") loading issue and provide a practical fix tied to the confirmed course setup context.\n\nHow a grader verifies this: Grader can confirm the agent visited a relevant Google result page about the swirl Lesson 1 issue and the final note connects the fix to the earlier course-path verification.","max_points":250},{"criterion":"Explain common reasons R shows \"object not found\" or fails to recognize a data frame column name, with at least three concrete checks or fixes.","description":"Explain common reasons R shows \"object not found\" or fails to recognize a data frame column name, with at least three concrete checks or fixes.\n\nHow a grader verifies this: Grader can confirm the 
agent opened a relevant source page and the final note includes at least three distinct troubleshooting checks such as spelling/case, df$col or proper column reference, package loading, object existence, or environment scope.","max_points":200},{"criterion":"Provide a fix for the \"libxls error: unable to open file\" issue and include correct R code to read a .xlsx file using the proper package and function.","description":"Provide a fix for the \"libxls error: unable to open file\" issue and include correct R code to read a .xlsx file using the proper package and function.\n\nHow a grader verifies this: Grader can confirm the agent opened a source page showing the correct Excel-reading approach and the final answer includes valid .xlsx import code with the appropriate package/function.","max_points":150},{"criterion":"Return the final answer as a concise troubleshooting note in the requested order, with later fixes tied back to the earlier setup issue.","description":"Return the final answer as a concise troubleshooting note in the requested order, with later fixes tied back to the earlier setup issue.\n\nHow a grader verifies this: Grader can confirm the response is ordered as folder path, Lesson 1 fix, object/column checks, then Excel import fix, and that it reads as one connected troubleshooting flow rather than isolated notes.","max_points":100}]}} +{"task_id":"139b0e467c6e335945c64249c22929516253c1bb","confirmed_task":"I’m thinking about signing up for UserTesting for a little side income, but I don’t want to waste time on something sketchy or low-paying, so can you sanity-check it for me in the browser? First, use Google to find at least one independent article or ranking that treats Userlytics as one of the better user-testing platforms, because I want a real comparison point that isn’t coming from Userlytics itself; open the actual ranking page and leave it open in a tab so I can look at the source. 
Then go to UserTesting’s own site and find what they say contributors can earn for tests, including any rates, ranges, or conditions that affect payment, and keep that page open too. After that, go through UserTesting’s contributor help or support pages and pull out at least five specific things a contributor should avoid doing during tests so I can tell how easy it would be to get rejected or rated badly; if the guidance is spread across multiple help pages, open the most relevant ones in separate tabs so I can compare them. Finally, use Google to get to Irreality Labs Inc’s official website, look through what the company says it does, and then use visible public details from the site or linked company profiles to judge whether it looks like a legitimate business and the kind of company that might realistically use UX research or testing platforms like these. Please give me a concise wrap-up with the independent Userlytics source and standing, UserTesting pay details, at least five contributor mistakes to avoid, and your judgment on Irreality Labs Inc, and leave the key pages open so I can verify them myself.","website":"https://www.google.com","level":"medium","reference_length":6,"categories":["Jobs and Career > Jobs and Employment","Business and Consumer Services > Business Services"],"precomputed_rubric":{"items":[{"criterion":"Identify at least one independent, non-Userlytics source that ranks or reviews Userlytics among leading user testing platforms, including the source name and Userlytics’ stated position or standing.","description":"Identify at least one independent, non-Userlytics source that ranks or reviews Userlytics among leading user testing platforms, including the source name and Userlytics’ stated position or standing.\n\nHow a grader verifies this: A browser tab is open to a third-party ranking/review page where Userlytics is visibly listed or discussed as a leading platform, and the response names the source and standing shown on that 
page.","max_points":180},{"criterion":"Report what UserTesting says contributors can earn, including any stated pay rates, ranges, examples, or conditions for payment from UserTesting’s own site.","description":"Report what UserTesting says contributors can earn, including any stated pay rates, ranges, examples, or conditions for payment from UserTesting’s own site.\n\nHow a grader verifies this: A UserTesting page is open showing contributor earnings information, and the response accurately reflects the visible pay details and any stated conditions.","max_points":220},{"criterion":"List at least five specific actions contributors should avoid during UserTesting tests, based on the Contributor Code of Conduct and privacy or contributor guidance.","description":"List at least five specific actions contributors should avoid during UserTesting tests, based on the Contributor Code of Conduct and privacy or contributor guidance.\n\nHow a grader verifies this: One or more support.usertesting.com tabs are open to relevant guidance pages, and the response includes five or more avoidable behaviors that are clearly grounded in those pages.","max_points":240},{"criterion":"Summarize what Irreality Labs Inc does using information from its official website, including its products and/or services.","description":"Summarize what Irreality Labs Inc does using information from its official website, including its products and/or services.\n\nHow a grader verifies this: The official Irreality Labs Inc site is open, and the response describes the company’s offerings in a way that matches visible site content.","max_points":140},{"criterion":"Provide a legitimacy judgment on Irreality Labs Inc based on verifiable public company details such as address, leadership, registrations, linked profiles, or other visible public-facing business information.","description":"Provide a legitimacy judgment on Irreality Labs Inc based on verifiable public company details such as address, leadership, 
registrations, linked profiles, or other visible public-facing business information.\n\nHow a grader verifies this: The response cites concrete public details visible on the official site or linked company profiles and uses them to support a legitimacy judgment.","max_points":120},{"criterion":"Deliver a concise final summary that ties together the independent Userlytics comparison point, UserTesting contributor pay details, at least five things contributors should avoid, and the judgment on Irreality Labs Inc including whether it seems like the kind of company that might use research or testing platforms.","description":"Deliver a concise final summary that ties together the independent Userlytics comparison point, UserTesting contributor pay details, at least five things contributors should avoid, and the judgment on Irreality Labs Inc including whether it seems like the kind of company that might use research or testing platforms.\n\nHow a grader verifies this: The final response integrates findings from all prior steps into a short, coherent recommendation rather than listing disconnected facts.","max_points":100}]}} +{"task_id":"e1aca0ae8174c6f1847be80c82d8adc63d031b23","confirmed_task":"I’m trying to plan a beginner-friendly fitness outing in Fresno sometime this week, and I want to compare a couple of calmer yoga options with something more high-energy before I decide. Please start on toweryogafresno.com and pull up the actual class schedule for Tower Yoga Fresno, then note the days and times when their Tower Yoga classes are offered so I can use that as my baseline for what would fit my week; if there’s a schedule page or calendar view, leave it open so I can glance at it myself. 
Then go to bluemoonyogastudios.com and figure out what kind of yoga studio Blue Moon Yoga is in Fresno, and list the Fresno studio locations shown on the site, because convenience matters if Tower Yoga’s times don’t work for me; please open the Fresno location information in its own tab so I can visually compare the studio names. After that, check fresnofightgirl.com and see what Fight Girl Fitness offers in Fresno, especially the types of classes they have and how a brand-new person is supposed to get started, like whether there’s a trial, membership, booking flow, or intro option, since I might want something more energetic than yoga. In the end, give me a short recommendation on which of the three seems easiest for a beginner this week based on the schedule details and how straightforward the getting-started process looks, and keep the most useful pages open in separate tabs so I can compare them.","website":"https://www.google.com","level":"medium","reference_length":6,"categories":["Health > Nutrition Diets and Fitness"],"precomputed_rubric":{"items":[{"criterion":"Report the days and times when Tower Yoga classes are offered from the Tower Yoga Fresno schedule.","description":"Report the days and times when Tower Yoga classes are offered from the Tower Yoga Fresno schedule.\n\nHow a grader verifies this: Grader confirms the answer matches the visible schedule or calendar page left open on toweryogafresno.com.","max_points":240},{"criterion":"Describe what Blue Moon Yoga is in Fresno based on the Blue Moon Yoga Studios website.","description":"Describe what Blue Moon Yoga is in Fresno based on the Blue Moon Yoga Studios website.\n\nHow a grader verifies this: Grader confirms the description is supported by visible text on the relevant Blue Moon Yoga page.","max_points":140},{"criterion":"List the names of the Fresno studio locations shown for Blue Moon Yoga on the website.","description":"List the names of the Fresno studio locations shown for Blue Moon Yoga 
on the website.\n\nHow a grader verifies this: Grader confirms the listed location names match the Fresno location information visible in the opened Blue Moon tab.","max_points":160},{"criterion":"Summarize the types of classes offered by Fight Girl Fitness in Fresno.","description":"Summarize the types of classes offered by Fight Girl Fitness in Fresno.\n\nHow a grader verifies this: Grader confirms the class types are supported by visible class or program information on fresnofightgirl.com.","max_points":160},{"criterion":"Explain how a new person can get started with Fight Girl Fitness, including any memberships, trial, booking, or participation details available on the website.","description":"Explain how a new person can get started with Fight Girl Fitness, including any memberships, trial, booking, or participation details available on the website.\n\nHow a grader verifies this: Grader confirms the getting-started summary matches visible onboarding, membership, booking, or introductory information on the site.","max_points":160},{"criterion":"Provide a short recommendation comparing Tower Yoga, Blue Moon Yoga, and Fight Girl Fitness, and identify which option seems easiest for a beginner based on schedule convenience and onboarding details.","description":"Provide a short recommendation comparing Tower Yoga, Blue Moon Yoga, and Fight Girl Fitness, and identify which option seems easiest for a beginner based on schedule convenience and onboarding details.\n\nHow a grader verifies this: Grader confirms the recommendation references findings from all three sites and is consistent with the extracted schedule, location, and getting-started details.","max_points":140}]}} +{"task_id":"4b9850333bdd7298442df495aff3832c13b119da","confirmed_task":"I’m trying to put together a cozy monthly subscription night for a friend in the UK and want one tidy recommendation I can actually look at in the browser afterward. 
Please start on Beer52 and figure out which of their beer subscription plans are genuine monthly options that deliver within the UK, because I only want plans that would work for a regular monthly treat here rather than anything one-off or unclear. Open the relevant Beer52 plan pages in separate tabs and leave the best evidence visible so I can compare them myself. Once you know the monthly beer choices, go to snackd.co.uk and find at least three snack subscription boxes that also deliver to the UK, then open each actual product or brand page so you can verify from the site itself that UK delivery or shipping is available and leave those tabs open too, since I want to see the boxes and not just a summary. After that, use Google to work out which UK streaming subscription service includes the movie “Wicked” at no extra cost in the base subscription, and please make sure it’s not just a rental, purchase option, add-on channel, or ad-supported loophole because I want something I could actually pair with the subscription night without paying extra for the film. Open the page that proves where it’s included and keep that tab available. Finally, check PrintPigeon and explain, in plain English, how it takes an email or online message and turns it into a posted letter, and tell me the current price for sending one standard UK letter so I can decide whether mailing the invite is worth the extra effort. 
In the end, give me one coherent summary that pulls together the Beer52 monthly UK options, the three snack alternatives with UK-delivery proof, the streaming service for “Wicked,” and the PrintPigeon explanation and letter price.","website":"https://www.google.com","level":"medium","reference_length":5,"categories":["Food and Drink > Beverages","Ecommerce & Shopping > Ecommerce and Shopping - Other"],"precomputed_rubric":{"items":[{"criterion":"Correctly identify the Beer52 subscription plans that are monthly and deliver within the UK.","description":"Correctly identify the Beer52 subscription plans that are monthly and deliver within the UK.\n\nHow a grader verifies this: Grader can confirm qualifying Beer52 plans from open tabs showing plan details and visible evidence of monthly cadence and UK delivery availability.","max_points":280},{"criterion":"Find at least three snack subscription boxes from snackd.co.uk and provide evidence from each box’s own page or linked site that UK delivery or shipping is available.","description":"Find at least three snack subscription boxes from snackd.co.uk and provide evidence from each box’s own page or linked site that UK delivery or shipping is available.\n\nHow a grader verifies this: Grader can inspect at least three open listing/brand tabs and see visible UK shipping or delivery wording for each snack subscription option.","max_points":280},{"criterion":"Correctly determine which UK streaming subscription service includes “Wicked” at no extra cost in the base subscription, excluding rental, purchase, add-ons, and ad-supported access.","description":"Correctly determine which UK streaming subscription service includes “Wicked” at no extra cost in the base subscription, excluding rental, purchase, add-ons, and ad-supported access.\n\nHow a grader verifies this: Grader can confirm from the opened proof page that “Wicked” is included with the named service’s subscription and not presented as a rent/buy/add-on-only 
option.","max_points":220},{"criterion":"Accurately explain how PrintPigeon turns an email or online message into a posted letter and state the current price for one standard UK letter.","description":"Accurately explain how PrintPigeon turns an email or online message into a posted letter and state the current price for one standard UK letter.\n\nHow a grader verifies this: Grader can verify the explanation and price against visible PrintPigeon site content showing the workflow and standard-letter pricing.","max_points":120},{"criterion":"Produce one coherent final summary combining the Beer52 monthly UK options, three snack alternatives with UK-delivery proof, the qualifying streaming service for “Wicked,” and the PrintPigeon mailing explanation and price.","description":"Produce one coherent final summary combining the Beer52 monthly UK options, three snack alternatives with UK-delivery proof, the qualifying streaming service for “Wicked,” and the PrintPigeon mailing explanation and price.\n\nHow a grader verifies this: Final response includes all required components in a single integrated summary with no missing category.","max_points":100}]}} +{"task_id":"946321e8a9788f485d360f619127a2e7b7e1693a","confirmed_task":"I’m planning a cozy Christmas day at home with my toddler and want ideas for both of us that I can actually look at on screen afterward. Could you start on Hobbycraft and find three low-mess Christmas craft activities that feel toddler-friendly, then open the actual project pages in separate tabs so I can compare the photos and note the materials each one needs? After that, use Google to find three more at-home Christmas activity ideas for a toddler that are clearly different from the Hobbycraft ones, because I want a fuller mix of options beyond simple repeats, and open at least one of those source pages so I can see it’s a real activity page. 
Once you’ve got the toddler plan sorted, switch over to Scratch Magazine and look for Grinch Christmas nail inspiration, then find one Grinch tutorial page I could realistically follow and leave that tutorial page open so I can check the design details myself. From there, go to Amazon and find at least three Grinch-themed nail art ideas or products that use different materials, like stickers or decals, gel polish, brushes, glitter, rhinestones, stamping plates, or similar, and make sure they actually fit the style or techniques shown on the Scratch tutorial. Please open the most promising Amazon product pages in their own tabs so I can compare them visually, and then give me a clear summary with the toddler activity options, the Scratch tutorial, and the three nail product ideas with links plus a quick note on how each one matches the Grinch look.","website":"https://www.google.com","level":"medium","reference_length":5,"categories":["Hobbies and Leisure > Crafts","Community and Society > Holidays and Seasonal Events"],"precomputed_rubric":{"items":[{"criterion":"Exactly three low-mess toddler-friendly Christmas craft activities are identified on Hobbycraft, with materials summarized for each.","description":"Exactly three low-mess toddler-friendly Christmas craft activities are identified on Hobbycraft, with materials summarized for each.\n\nHow a grader verifies this: Grader can confirm three distinct Hobbycraft activity pages were opened or visited and the final response includes each activity name plus its materials list.","max_points":240},{"criterion":"At least three additional at-home toddler Christmas activity ideas are found via Google and are different from the Hobbycraft activities, each with a short description.","description":"At least three additional at-home toddler Christmas activity ideas are found via Google and are different from the Hobbycraft activities, each with a short description.\n\nHow a grader verifies this: Grader can confirm Google 
search results were used, at least one non-Hobbycraft source page was opened, and the final response lists three distinct additional ideas that do not duplicate the Hobbycraft ones.","max_points":220},{"criterion":"Grinch Christmas nail inspiration is found on Scratch Magazine, including one specific Grinch tutorial page title and URL.","description":"Grinch Christmas nail inspiration is found on Scratch Magazine, including one specific Grinch tutorial page title and URL.\n\nHow a grader verifies this: Grader can confirm navigation on Scratch Magazine to a Grinch-related nail page and that a tutorial page was left open or clearly identified with title and link.","max_points":200},{"criterion":"At least three Amazon Grinch-themed nail art ideas or products are found using different material types, such as decals, gel polish, glitter, brushes, rhinestones, or stamping plates.","description":"At least three Amazon Grinch-themed nail art ideas or products are found using different material types, such as decals, gel polish, glitter, brushes, rhinestones, or stamping plates.\n\nHow a grader verifies this: Grader can confirm three Amazon product pages or listings were opened or visited, with products spanning different materials such as decals, gel, glitter, brushes, rhinestones, or stamping tools.","max_points":200},{"criterion":"The final summary ties the Amazon products back to the Scratch tutorial style or techniques and includes all toddler activities, the tutorial, and product links.","description":"The final summary ties the Amazon products back to the Scratch tutorial style or techniques and includes all toddler activities, the tutorial, and product links.\n\nHow a grader verifies this: Grader can confirm the final response contains the three Hobbycraft crafts with materials, three additional Google-sourced toddler ideas with descriptions, one Scratch tutorial title and URL, and three Amazon product ideas with links plus notes explaining how they match the 
Scratch inspiration.","max_points":140}]}} +{"task_id":"54c07bd8afe7d70cd55b716977d2c29f1b2a91e9","confirmed_task":"I’m trying to put together a quick but thoughtful Christmas gift shortlist for a few different people, and I want it to feel balanced instead of random. First, on The New York Times site, find a gift-guide article aimed at hard-to-shop-for people that actually shows prices and includes Amazon purchase links, because I want one practical gift from a credible roundup to use as the anchor for the whole list; open the article itself and use it to pick one practical item, then leave that article tab open so I can look at the recommendations later. Once you’ve got that anchor gift, go to Etsy and search for personalized custom Christmas ornaments that would work as sentimental add-ons, and open three promising listings in separate tabs so I can compare the photos, names, and prices like a real shopper would. After that, head to Duke Cannon’s site and look through the Holiday Collection for two gifts that feel more like stocking-stuffer options for a guy, and open the actual product pages so you can grab the names, prices if shown, and links from the live listings. Then round it out on Lookfantastic by browsing men’s Christmas gifting toiletries or body gift sets and finding three options with current prices, mainly so I can compare whether those feel like better value than the Duke Cannon picks; please open the product pages for the three best matches in separate tabs too. 
In the end, send me a concise shortlist with the NYT article URL, the one practical anchor gift, the three Etsy ornament options, the two Duke Cannon holiday gifts, and the three Lookfantastic men’s gift-set options, all with names, prices when shown, and URLs.","website":"https://www.google.com","level":"medium","reference_length":5,"categories":["Ecommerce & Shopping > Ecommerce and Shopping - Other","Community and Society > Holidays and Seasonal Events","Lifestyle > Gifts and Flowers"],"precomputed_rubric":{"items":[{"criterion":"A qualifying New York Times hard-to-shop-for gift article is identified and the response includes the article URL plus one practical anchor gift taken from that article.","description":"A qualifying New York Times hard-to-shop-for gift article is identified and the response includes the article URL plus one practical anchor gift taken from that article.\n\nHow a grader verifies this: Grader can confirm the NYT page is an actual gift-guide article for hard-to-shop-for people, that prices are shown in the article, and that at least three items in the article include Amazon purchase links; the chosen anchor gift appears within that article.","max_points":240},{"criterion":"Three different Etsy personalized custom Christmas ornament options are collected with names, prices, and URLs from live listings.","description":"Three different Etsy personalized custom Christmas ornament options are collected with names, prices, and URLs from live listings.\n\nHow a grader verifies this: Grader can confirm three separate Etsy listing tabs are open or were opened, and each listing visibly shows a personalized/custom ornament product with a name, price, and distinct URL.","max_points":220},{"criterion":"Two Duke Cannon Holiday Collection gifts suitable as stocking-stuffer style options for a guy are provided with names, URLs, and prices when shown.","description":"Two Duke Cannon Holiday Collection gifts suitable as stocking-stuffer style options for a 
guy are provided with names, URLs, and prices when shown.\n\nHow a grader verifies this: Grader can confirm both items come from Duke Cannon’s Holiday Collection or holiday gift area and that the live product pages show the product names and URLs, with prices captured if visible.","max_points":180},{"criterion":"Three Lookfantastic men’s Christmas gifting toiletries or body gift sets are listed with current prices and URLs.","description":"Three Lookfantastic men’s Christmas gifting toiletries or body gift sets are listed with current prices and URLs.\n\nHow a grader verifies this: Grader can confirm three separate Lookfantastic product pages were opened and that each item is a men’s Christmas gifting toiletry/body gift set with a visible current price and URL.","max_points":180},{"criterion":"The final output is a concise, combined Christmas gift shortlist covering all required categories: one NYT-inspired practical anchor gift, three Etsy ornaments, two Duke Cannon gifts, and three Lookfantastic comparison options.","description":"The final output is a concise, combined Christmas gift shortlist covering all required categories: one NYT-inspired practical anchor gift, three Etsy ornaments, two Duke Cannon gifts, and three Lookfantastic comparison options.\n\nHow a grader verifies this: Grader can confirm the final response includes all nine gift options plus the NYT article URL, and each entry contains the required identifying details in a compact shortlist format.","max_points":180}]}} +{"task_id":"d8fe04d1cf29251d68382cde58f4424e80bad07c","confirmed_task":"I’m trying to figure out a shared journaling setup for me and my partner, but I’m pretty cautious about privacy and especially about what outside AI tools can get into. 
To set a baseline first, please go to Oura’s support site and find the guidance around Oura Membership privacy, specifically anything that explains how to stop an AI agent from accessing membership information, and leave that support article open so I can look at the exact wording myself. Once you’ve got that privacy baseline, head over to Journey Cloud and look through its journaling plans and any pages about shared journals, partner or couples use, or collaborative entries, because I want to know whether it would actually work for two people without feeling too exposed; if there are relevant pricing or plan pages, open the main options in separate tabs so I can compare them visually later. After that, check the page about the Huan Dao Meditation app on Formfacade and tell me what the app is and which Eastern spiritual wellness methods it says it uses, since I want one concrete example of the kind of wellness features people sometimes pair with journaling. Then use Google to find Norton Secure VPN’s official product page, open the real Norton page, and pull at least three advertised features from it so I have a simple privacy-tool reference point; please keep the Norton product page open too. 
In the end, give me a concise recommendation on whether Journey Cloud seems to fit the privacy expectations set by the Oura guidance, while also mentioning Huan Dao as a wellness-feature example and Norton Secure VPN as a privacy comparison point.","website":"https://www.google.com","level":"medium","reference_length":5,"categories":["Computers Electronics and Technology > Computers Electronics and Technology - Other","Health > Health - Other","Lifestyle > Lifestyle - Other"],"precomputed_rubric":{"items":[{"criterion":"Find and summarize Oura support guidance on Oura Membership privacy, including the specific instruction or policy for preventing an AI agent from accessing membership information.","description":"Find and summarize Oura support guidance on Oura Membership privacy, including the specific instruction or policy for preventing an AI agent from accessing membership information.\n\nHow a grader verifies this: Grader can confirm the browser is on a relevant support.ouraring.com article and that the final answer includes the privacy guidance and explicit AI-agent access prevention detail sourced from that page.","max_points":280},{"criterion":"Summarize Journey Cloud’s shared journaling options, including any couples, partner, friend, or shared-journal capabilities, and include pricing details from Journey Cloud.","description":"Summarize Journey Cloud’s shared journaling options, including any couples, partner, friend, or shared-journal capabilities, and include pricing details from Journey Cloud.\n\nHow a grader verifies this: Grader can confirm relevant journey.cloud pricing and feature pages were opened, ideally in separate tabs, and that the final answer reports both sharing-related features and plan pricing.","max_points":280},{"criterion":"Identify what the Huan Dao Meditation app is and list the Eastern spiritual wellness methods it claims to use.","description":"Identify what the Huan Dao Meditation app is and list the Eastern spiritual wellness 
methods it claims to use.\n\nHow a grader verifies this: Grader can confirm the formfacade.com page about Huan Dao Meditation was visited and that the final answer accurately states the app description and named methods from that page.","max_points":140},{"criterion":"Use Google to locate Norton Secure VPN’s official product page, summarize what the product is, and list at least three advertised features from Norton’s page.","description":"Use Google to locate Norton Secure VPN’s official product page, summarize what the product is, and list at least three advertised features from Norton’s page.\n\nHow a grader verifies this: Grader can confirm a Google results page was used, the official Norton page was opened and left visible, and the final answer includes at least three features taken from that product page.","max_points":160},{"criterion":"Provide a final recommendation on whether Journey Cloud fits the user’s privacy expectations, explicitly using Oura’s privacy baseline and incorporating Journey Cloud findings, with Huan Dao as a wellness-feature example and Norton Secure VPN as a privacy comparison point.","description":"Provide a final recommendation on whether Journey Cloud fits the user’s privacy expectations, explicitly using Oura’s privacy baseline and incorporating Journey Cloud findings, with Huan Dao as a wellness-feature example and Norton Secure VPN as a privacy comparison point.\n\nHow a grader verifies this: Grader can confirm the final synthesis references findings from Oura and Journey Cloud directly, and also mentions Huan Dao and Norton in the recommendation rather than listing them separately without comparison.","max_points":140}]}} +{"task_id":"e20742e26a4d6f9c3d62a9d1cef634297bd4204f","confirmed_task":"I’m putting together a small Christmas get-together here in the UK and want one tidy plan I can glance at later. 
Could you start on Sainsbury’s and find three gluten-free Christmas starter ideas that feel properly festive, then open the actual recipe or product pages in separate tabs so I can compare the photos and names, because I’m trying to decide whether the meal should lean more elegant, cosy, or party-food style. Once you’ve got that food direction, go to The Kitchn and find their guidance for cooking a spiral-cut ham, then give me the main cooking approach in a short summary that matches the festive theme from the starters so I can picture the full menu. After that, switch to Marks & Spencer and find three stocking-filler gift ideas for the dads coming over, keeping each one under £20, and open the product pages so I can visually compare whether they feel useful or just novelty gifts. Then check Boots for two men’s gift sets that include aftershave or body spray, with prices, so I can compare those against the M&S options as slightly more polished backups. Please leave the most promising Sainsbury’s starter tab and the two best gift pages open at the end, and give me one concise menu-and-gift shortlist with item names, prices where relevant, and a quick note on why each option fits.","website":"https://www.google.com","level":"medium","reference_length":7,"categories":["Food and Drink > Cooking and Recipes","Community and Society > Holidays and Seasonal Events"],"precomputed_rubric":{"items":[{"criterion":"Provide three gluten-free Christmas starter ideas from Sainsbury’s with the correct names.","description":"Provide three gluten-free Christmas starter ideas from Sainsbury’s with the correct names.\n\nHow a grader verifies this: Grader can confirm three distinct Sainsbury’s starter recipe or product pages were opened and the returned names match visible page titles.","max_points":220},{"criterion":"Use the Sainsbury’s starters to infer a festive food theme and carry that into the menu framing.","description":"Use the Sainsbury’s starters to infer a festive food theme 
and carry that into the menu framing.\n\nHow a grader verifies this: Final write-up explicitly links the starter choices to a coherent festive style such as elegant, cosy, or party-food and uses that framing for the main course.","max_points":120},{"criterion":"Summarize The Kitchn’s recommended method for cooking a spiral-cut ham, including the key preparation and cooking steps.","description":"Summarize The Kitchn’s recommended method for cooking a spiral-cut ham, including the key preparation and cooking steps.\n\nHow a grader verifies this: Returned summary reflects the visible Kitchn article guidance on how to prepare, heat, and finish a spiral-cut ham.","max_points":200},{"criterion":"Provide three Marks & Spencer stocking-filler gift ideas suitable for fathers that each cost under £20, including item name and price.","description":"Provide three Marks & Spencer stocking-filler gift ideas suitable for fathers that each cost under £20, including item name and price.\n\nHow a grader verifies this: Grader can confirm each selected M&S product page shows a price below £20 and that the returned names and prices match the visible listings.","max_points":180},{"criterion":"Provide two Boots men’s gift sets that include aftershave or body spray, including item name and price for each.","description":"Provide two Boots men’s gift sets that include aftershave or body spray, including item name and price for each.\n\nHow a grader verifies this: Grader can confirm the Boots product pages are gift sets for men and that the visible product details indicate aftershave or body spray is included.","max_points":140},{"criterion":"Include browser-only proof by leaving open the most promising Sainsbury’s starter tab and the two best gift product pages at the end.","description":"Include browser-only proof by leaving open the most promising Sainsbury’s starter tab and the two best gift product pages at the end.\n\nHow a grader verifies this: Open tabs at completion include one 
Sainsbury’s starter page and two selected gift pages from M&S and/or Boots for visual review.","max_points":60},{"criterion":"Return everything as one concise menu-and-gift shortlist with names, prices where relevant, and a brief note explaining why each option fits.","description":"Return everything as one concise menu-and-gift shortlist with names, prices where relevant, and a brief note explaining why each option fits.\n\nHow a grader verifies this: Final response is a combined shortlist covering starters, ham approach, M&S gifts, and Boots gift sets, with short fit notes and prices where applicable.","max_points":80}]}} +{"task_id":"69f48a0950d532a2f04ff51abe4bf0e05ec5649e","confirmed_task":"I'm planning a trip from London to Seoul around July and want you to help me narrow things down in a realistic booking flow. First, compare the cheapest round-trip economy flight options you can find from London to Seoul for a departure on July 17th and return in early August using the flight sites available, and use what you find to identify the best-value option overall. Once you've got that flight shortlist and winner, switch to Booking.com and check two Seoul properties I might use for my whole stay — The Joseon Hotel and The Lotte Hotel. For each one, look at the guest review score and read recent reviews so I can tell whether either place seems reliably good enough for a pre-flight night. 
In the end, give me a concise recommendation that names the cheapest flight source and fare you found, summarizes whether The Joseon Hotel and The Lotte Hotel are generally very positive, and highlights at least three recent review takeaways for The Lotte Hotel so I can decide if I should book that hotel before the London flight.","website":"https://www.google.com","level":"medium","reference_length":5,"categories":["Travel and Tourism > Air Travel","Travel and Tourism > Accommodation and Hotels"],"precomputed_rubric":{"items":[{"criterion":"Compare round-trip economy flight options from London to Seoul departing July 17th and returning in early August across multiple flight sites, and identify the cheapest option with fare and source.","description":"Compare round-trip economy flight options from London to Seoul departing July 17th and returning in early August across multiple flight sites, and identify the cheapest option with fare and source.\n\nHow a grader verifies this: Grader can confirm flight search results are shown for London to Seoul with the correct dates, economy class, and the cheapest fare is identified with airline/price/source site.","max_points":250},{"criterion":"Identify the best-value flight option overall across the sites searched and name the cheapest flight source and fare.","description":"Identify the best-value flight option overall across the sites searched and name the cheapest flight source and fare.\n\nHow a grader verifies this: Final response explicitly names which site offered the cheapest fare and states the price.","max_points":150},{"criterion":"Look up The Joseon Hotel on Booking.com and report its guest review score and whether reviews are generally very positive.","description":"Look up The Joseon Hotel on Booking.com and report its guest review score and whether reviews are generally very positive.\n\nHow a grader verifies this: Grader can confirm the Booking.com property page for The Joseon Hotel in Seoul is open or was 
visited, and the review score is reported.","max_points":200},{"criterion":"Look up The Lotte Hotel on Booking.com, report its guest review score, and highlight at least three recent review takeaways.","description":"Look up The Lotte Hotel on Booking.com, report its guest review score, and highlight at least three recent review takeaways.\n\nHow a grader verifies this: Grader can confirm the Booking.com property page for The Lotte Hotel in Seoul is open or was visited, the review score is reported, and at least three concrete review takeaways are included.","max_points":250},{"criterion":"Provide a concise recommendation naming the cheapest flight source and fare, summarizing whether The Joseon Hotel and The Lotte Hotel are generally very positive, and including the three Lotte Hotel review takeaways.","description":"Provide a concise recommendation naming the cheapest flight source and fare, summarizing whether The Joseon Hotel and The Lotte Hotel are generally very positive, and including the three Lotte Hotel review takeaways.\n\nHow a grader verifies this: Grader can confirm the final response covers all three components: flight winner, both hotel review assessments, and at least three Lotte Hotel review themes.","max_points":150}]}} +{"task_id":"24c664186e6839e1a0a117041480ff143bf8c91a","confirmed_task":"I’m trying to sanity-check whether moving ahead with a Tesla Model 3 lease in Los Angeles is actually manageable month to month, so start on Google and look up current Tesla Model 3 lease pricing for the Los Angeles area, including the lease term, due-at-signing amount, and any discounts, tax credits, or rebates you can find, because I want a realistic baseline instead of just a headline number. 
Once you’ve got that monthly lease figure, use it as a reference point and go to the Los Angeles Craigslist site, specifically the San Gabriel Valley section, and find at least three trailer listings that look like plausible live-in fallback options under $10,000; open the actual posting pages in separate tabs so I can see the photos and verify the listings are still live, then note each one’s title, price, and location. After that, go to Zillow and look for an LA-area rental whose monthly price is in the same ballpark as the Tesla lease payment you found, so I can compare whether paying for housing at that level makes more sense than taking on the car; open the actual Zillow listing page and leave it open so I can check the photos and map myself. In the end, give me a short comparison that includes the Tesla lease deal, the three Craigslist trailer backups, and the Zillow rental option that’s closest in monthly price to the lease.","website":"https://www.google.com","level":"medium","reference_length":5,"categories":["Vehicles > Makes and Models","Finance > Banking Credit and Lending"],"precomputed_rubric":{"items":[{"criterion":"Find and summarize a current Los Angeles Tesla Model 3 lease offer with monthly payment, lease term, due-at-signing amount, and any available discounts or rebates.","description":"Find and summarize a current Los Angeles Tesla Model 3 lease offer with monthly payment, lease term, due-at-signing amount, and any available discounts or rebates.\n\nHow a grader verifies this: Grader confirms the response includes all four lease elements and that the information is consistent with the Google results or linked source pages viewed during browsing.","max_points":350},{"criterion":"Identify at least three San Gabriel Valley Craigslist trailer listings under $10,000 that appear suitable for living in and provide each listing’s title, price, and location.","description":"Identify at least three San Gabriel Valley Craigslist trailer listings under 
$10,000 that appear suitable for living in and provide each listing’s title, price, and location.\n\nHow a grader verifies this: Grader confirms three distinct Craigslist posting pages were opened and that each reported title, price, and location matches the visible listing pages.","max_points":300},{"criterion":"Open the actual Craigslist posting pages in separate tabs so the listings can be visually checked for photos and live status.","description":"Open the actual Craigslist posting pages in separate tabs so the listings can be visually checked for photos and live status.\n\nHow a grader verifies this: Grader confirms multiple Craigslist tabs are open on individual posting pages rather than only search results, with visible listing content and photos/status indicators.","max_points":100},{"criterion":"Find one Zillow rental listing in the Los Angeles area with monthly rent close to the Tesla lease payment and provide its address and Zillow property page.","description":"Find one Zillow rental listing in the Los Angeles area with monthly rent close to the Tesla lease payment and provide its address and Zillow property page.\n\nHow a grader verifies this: Grader confirms the Zillow listing page is open and the reported address and URL match the visible property page, with rent in the same general monthly range as the lease figure.","max_points":150},{"criterion":"Present a short comparison tying together the Tesla lease baseline, the three Craigslist trailer fallback options, and the Zillow rental option closest in monthly price.","description":"Present a short comparison tying together the Tesla lease baseline, the three Craigslist trailer fallback options, and the Zillow rental option closest in monthly price.\n\nHow a grader verifies this: Grader confirms the final response includes all components in a concise comparison and explicitly relates the Zillow rental to the lease payment benchmark.","max_points":100}]}} 
+{"task_id":"44f1e02116715d5fe313996811b358fe25bc3ee4","confirmed_task":"I’m trying to put together a quick accessory shortlist for the different cars in our household, and I want to see real product pages rather than just a text summary. Please start on WeatherTech and use the vehicle selector for a 2020 Toyota Highlander to confirm which floor mat option and which cargo liner or cargo mat option actually fit, because the Highlander is the one I’m most likely to buy for first. When you find the cargo liner page, open the actual product listing and leave that tab open so I can look at the photos and fitment details later. After you’ve confirmed the Highlander cargo setup, go to Temu and find a set of car headrest hooks that would work for a Fiat 500, mainly so I can compare whether a cheap organizer add-on is enough for our smaller car instead of doing a full cargo solution. Open the actual Temu listing and note the product name plus whatever compatibility details on the page make it seem usable with a Fiat 500. Then use Google to find one suitable LED emblem option for a 2023 Honda Civic, open the actual product result in its own tab, and grab the product name and current price so I have one exterior accessory idea to round out the shortlist. 
In the end, give me a concise comparison covering the Highlander fitment details, the direct WeatherTech cargo liner link, the Fiat 500 hook listing with compatibility notes, and the Civic LED emblem option with price.","website":"https://www.google.com","level":"medium","reference_length":7,"categories":["Vehicles > Makes and Models","Ecommerce & Shopping > Ecommerce and Shopping - Other"],"precomputed_rubric":{"items":[{"criterion":"Identify the WeatherTech floor mat product that fits a 2020 Toyota Highlander and report the fitment details.","description":"Identify the WeatherTech floor mat product that fits a 2020 Toyota Highlander and report the fitment details.\n\nHow a grader verifies this: Grader can confirm the WeatherTech vehicle-selected results or product page shows 2020 Toyota Highlander fitment for the floor mat option.","max_points":200},{"criterion":"Identify the WeatherTech cargo liner or cargo mat product that fits a 2020 Toyota Highlander and include the direct product page link.","description":"Identify the WeatherTech cargo liner or cargo mat product that fits a 2020 Toyota Highlander and include the direct product page link.\n\nHow a grader verifies this: Grader can confirm the open WeatherTech cargo liner tab shows the fitting Highlander cargo product and that a direct link is provided.","max_points":250},{"criterion":"Find one Temu car headrest hook set that would work with a Fiat 500 and report the product name.","description":"Find one Temu car headrest hook set that would work with a Fiat 500 and report the product name.\n\nHow a grader verifies this: Grader can confirm the Temu listing page is open and the reported product name matches the visible listing title.","max_points":150},{"criterion":"Report compatibility details from the Temu listing that support why the hook set would work with a Fiat 500.","description":"Report compatibility details from the Temu listing that support why the hook set would work with a Fiat 500.\n\nHow a 
grader verifies this: Grader can verify the cited listing text or specs mention universal fit, seat headrest mounting, dimensions, or other compatibility cues visible on the Temu page.","max_points":150},{"criterion":"Find one suitable LED emblem option for a 2023 Honda Civic and report its product name.","description":"Find one suitable LED emblem option for a 2023 Honda Civic and report its product name.\n\nHow a grader verifies this: Grader can confirm the selected Google result or opened product page shows an LED emblem option associated with a 2023 Honda Civic.","max_points":100},{"criterion":"Include the displayed price for the selected 2023 Honda Civic LED emblem option.","description":"Include the displayed price for the selected 2023 Honda Civic LED emblem option.\n\nHow a grader verifies this: Grader can verify the reported price matches the visible price on the Google result or opened product page.","max_points":50},{"criterion":"Return a concise comparison covering all three vehicles and the requested accessory categories.","description":"Return a concise comparison covering all three vehicles and the requested accessory categories.\n\nHow a grader verifies this: Grader can confirm the final response includes Highlander floor and cargo fitment details with cargo link, Fiat 500 hook name with compatibility notes, and Civic LED emblem name with price in a compact comparison format.","max_points":100}]}} +{"task_id":"18ddad3e0781d4b8fb2e1998ff836a0b07d0cdce","confirmed_task":"I’m in Boston for the next two days on a work trip with my wife, and I want to lock in two different dinners that feel right for each occasion. First, on the Michelin Guide site, please find me one Boston restaurant that’s Michelin-starred or clearly in that polished fine-dining tier for a business dinner, and keep it around a $500 per person ceiling so I know it’s appropriate without going overboard. 
Then switch to OpenTable and look for a separate restaurant in Boston that feels genuinely romantic for an anniversary dinner, ideally Italian seafood or something very Boston-specific, because I want the second night to feel more personal and celebratory. Once you have both, use Google to open the actual restaurant websites or current listing pages in separate tabs and verify that the business-dinner place really looks suitable for client-style dining and that the anniversary place clearly handles special occasions like anniversaries or romantic dinners; while you’re there, check whether each one appears to have availability sometime in the next two days. Please leave the final restaurant pages open so I can look at the photos, ambiance, and booking details myself, and give me a short summary of which place is for the business dinner and which is for the anniversary dinner, why each one fits, the expected price level, the special-occasion evidence you found, and the availability status for the next two days.","website":"https://www.google.com","level":"medium","reference_length":4,"categories":["Food and Drink > Restaurants and Delivery","Travel and Tourism > Travel and Tourism - Other"],"precomputed_rubric":{"items":[{"criterion":"Select one Boston restaurant from the Michelin Guide or a clearly equivalent fine-dining listing that is appropriate for a business dinner and plausibly within the stated $500 per person budget.","description":"Select one Boston restaurant from the Michelin Guide or a clearly equivalent fine-dining listing that is appropriate for a business dinner and plausibly within the stated $500 per person budget.\n\nHow a grader verifies this: Grader can confirm a Michelin Guide restaurant page or equivalent fine-dining page is open showing the Boston restaurant name, cuisine/style, and price indicators consistent with the budget.","max_points":300},{"criterion":"Select one separate Boston restaurant from OpenTable that is romantic and suitable 
for an anniversary dinner, matching the preference for Italian seafood or Boston-specific cuisine.","description":"Select one separate Boston restaurant from OpenTable that is romantic and suitable for an anniversary dinner, matching the preference for Italian seafood or Boston-specific cuisine.\n\nHow a grader verifies this: Grader can confirm an OpenTable listing page is open showing the restaurant name, cuisine category, and visual or descriptive cues indicating a romantic/special-occasion setting.","max_points":250},{"criterion":"Verify that the business-dinner restaurant appears suitable for polished client or business dining, report the expected price level, and report whether it has availability within the next two days.","description":"Verify that the business-dinner restaurant appears suitable for polished client or business dining, report the expected price level, and report whether it has availability within the next two days.\n\nHow a grader verifies this: Grader can confirm an official site or current booking/listing page is open for the business restaurant with visible evidence such as fine-dining descriptions, ambiance cues, price indicators, reservation interface, or available time slots/dates within the next two days, and the response states the expected price level.","max_points":225},{"criterion":"Verify that the anniversary restaurant handles romantic or special-occasion dining, report the expected price level, and report whether it has availability within the next two days.","description":"Verify that the anniversary restaurant handles romantic or special-occasion dining, report the expected price level, and report whether it has availability within the next two days.\n\nHow a grader verifies this: Grader can confirm an official site or current booking/listing page is open for the anniversary restaurant with visible evidence such as anniversary/private dining language, romantic ambiance cues, price indicators, guest photos, or available 
reservation times/dates within the next two days, and the response states the expected price level.","max_points":225}]}} +{"task_id":"753ce2163f6e018ea33423ad4400ba3f759e9df8","confirmed_task":"I’m daydreaming about doing a ridiculous-but-fun summer baseball trip where I see exactly one game at all 30 MLB stadiums, and I want you to build the whole thing in a way I could actually use. Start on MLB.com and pull the official summer schedule so we can choose one real game date at each stadium, and please lean toward matchups where I might get to see stars I care about most like Shohei Ohtani, Aaron Judge, and Ronald Acuna Jr. whenever that’s realistically possible. As you’re picking games, open the actual game or team schedule pages in separate tabs for a few representative stops so there’s visible proof the dates are live, and keep the key schedule tabs open so I can glance at them later. Once you’ve got the 30 stadium/date choices, use Google Flights to figure out the smartest sequence between stops and compare flights versus driving for each leg, using whatever is cheaper and more practical in summer, because I want this to feel like a real budget-conscious trip instead of fantasy routing. After that, use Booking.com to find one hotel option for each game night that’s reasonably close to the stadium—something like within about 2 miles if possible and not outrageously priced for a solo traveler—and open at least a couple of the actual hotel listing pages with photos/maps so I can visually sanity-check the neighborhoods. Then use Yelp to find at least one must-try local food spot near each stadium, ideally something iconic to that city or ballpark area, and open a few of the restaurant pages so I can see that they’re real places with reviews. 
Finally, put everything into a CryptPad Document in one organized itinerary with each stadium listed exactly once, the chosen game and matchup, whether it includes Ohtani, Judge, Acuna, or another notable player, the travel leg before it with the cheaper mode and estimated cost, one hotel with estimated nightly price, one food pick, and running totals so I can see what this insane summer would actually cost. Leave the finished CryptPad Document open at the end, and if you create any comparison tabs along the way, keep the most useful ones open so I can review them.","website":"https://www.google.com","level":"hard","reference_length":6,"categories":["Sports > Baseball","Travel and Tourism > Travel and Tourism - Other"],"precomputed_rubric":{"items":[{"criterion":"The final itinerary includes all 30 MLB stadiums exactly once, each paired with one official summer game date and matchup sourced from MLB.com.","description":"The final itinerary includes all 30 MLB stadiums exactly once, each paired with one official summer game date and matchup sourced from MLB.com.\n\nHow a grader verifies this: Grader can confirm 30 unique stadium entries in the CryptPad Document and cross-check representative open MLB.com schedule/game tabs showing selected dates and matchups.","max_points":220},{"criterion":"Selected games are prioritized for appearances by Shohei Ohtani, Aaron Judge, and Ronald Acuna Jr. where feasible, and each stop includes a star-player note or another notable player when those three are not present.","description":"Selected games are prioritized for appearances by Shohei Ohtani, Aaron Judge, and Ronald Acuna Jr. 
where feasible, and each stop includes a star-player note or another notable player when those three are not present.\n\nHow a grader verifies this: Grader can inspect the itinerary’s player notes and compare them against open MLB.com schedule/team pages for representative entries involving the named players.","max_points":140},{"criterion":"The itinerary specifies a complete visit sequence across all 30 stadiums and identifies the cheaper practical travel mode between each consecutive stop using Google Flights comparisons and driving where appropriate.","description":"The itinerary specifies a complete visit sequence across all 30 stadiums and identifies the cheaper practical travel mode between each consecutive stop using Google Flights comparisons and driving where appropriate.\n\nHow a grader verifies this: Grader can review the ordered route in the CryptPad Document and compare representative travel legs against open Google Flights results or documented drive choices tied to the selected dates and cities.","max_points":220},{"criterion":"Each stadium stop has one accommodation option for the corresponding game night near the stadium, with an estimated nightly price included.","description":"Each stadium stop has one accommodation option for the corresponding game night near the stadium, with an estimated nightly price included.\n\nHow a grader verifies this: Grader can confirm 30 lodging entries in the itinerary and inspect several open Booking.com hotel listing pages showing price and proximity/map context.","max_points":140},{"criterion":"Each stadium stop includes at least one nearby must-try local food recommendation sourced from Yelp.","description":"Each stadium stop includes at least one nearby must-try local food recommendation sourced from Yelp.\n\nHow a grader verifies this: Grader can confirm 30 food entries in the itinerary and inspect several open Yelp business pages with ratings/location details near the relevant stadium 
areas.","max_points":100},{"criterion":"A finished CryptPad Document is created and left open, containing the complete organized itinerary with stadium, game, player note, travel leg and cost, hotel and cost, food pick, per-stop totals, and overall total trip cost.","description":"A finished CryptPad Document is created and left open, containing the complete organized itinerary with stadium, game, player note, travel leg and cost, hotel and cost, food pick, per-stop totals, and overall total trip cost.\n\nHow a grader verifies this: Grader can view the open CryptPad Document and verify that all required columns/fields and totals are present in one coherent document or table.","max_points":180}]}} +{"task_id":"3add0c2ffff8e0b3cacedf2e895d213735702f62","confirmed_task":"I’m daydreaming about a huge spring basketball trip where I see one game at every single NBA arena, and I want it planned like something I could actually follow. Please start on NBA.com and use the schedule to pick one spring home game for each of the 30 arenas, leaning toward games where I’d get to see LeBron James, Stephen Curry, or Victor Wembanyama whenever that’s realistically possible, because I’d love a few marquee-player nights mixed into the full set. As you go, open the actual game pages in tabs for the key star-player picks so I can visually confirm the matchups and dates, and keep the schedule pages open where you found the arena dates. Once you’ve got all 30 arena stops, use Google Flights and Google Maps/Travel to figure out the cheapest practical way to move from one city to the next in a sensible route, choosing between flying and driving based on cost and reasonableness so I can see whether this works better as a true road trip in clusters with flights between regions. 
After that, on Booking.com, find one hotel option for each game night that’s reasonably close to the arena and not wildly expensive, and open the actual hotel listing pages for a few representative stops so I can see the map and photos. Then use Yelp to find at least one must-try local food spot near each arena so the trip feels fun and not just logistical, and open a few of the restaurant pages so I can visually check that they’re real places near the venue. Finally, put everything into a CryptPad Document in a clean table or structured list with all 30 stops, including arena, city, selected game date, matchup, whether LeBron/Curry/Wembanyama is featured, the travel leg from the previous stop with the cheapest mode and estimated cost, one nearby hotel with estimated nightly price, one local food recommendation, and a running total estimate for the whole trip, and leave the doc open so I can review it.","website":"https://www.google.com","level":"hard","reference_length":6,"categories":["Sports > Basketball","Travel and Tourism > Travel and Tourism - Other"],"precomputed_rubric":{"items":[{"criterion":"A complete set of 30 spring NBA arena stops is selected from NBA.com, with one spring home game identified for each NBA arena including arena name, city, date, and matchup.","description":"A complete set of 30 spring NBA arena stops is selected from NBA.com, with one spring home game identified for each NBA arena including arena name, city, date, and matchup.\n\nHow a grader verifies this: Grader confirms the final itinerary contains 30 distinct NBA arenas and that NBA.com schedule pages or game pages are open or referenced for the selected spring home games.","max_points":200},{"criterion":"The selected games are prioritized to include LeBron James, Stephen Curry, or Victor Wembanyama when available, and marquee-player game pages are opened in tabs for visual confirmation.","description":"The selected games are prioritized to include LeBron James, Stephen Curry, or 
Victor Wembanyama when available, and marquee-player game pages are opened in tabs for visual confirmation.\n\nHow a grader verifies this: Grader checks that the itinerary marks whether LeBron, Curry, or Wembanyama is featured for each stop and that NBA.com game detail tabs are open for representative star-player selections.","max_points":150},{"criterion":"An optimized 30-stop route is produced using Google Flights and Google Maps/Travel, with each leg assigned the cheapest practical travel mode between driving and flying and an estimated cost.","description":"An optimized 30-stop route is produced using Google Flights and Google Maps/Travel, with each leg assigned the cheapest practical travel mode between driving and flying and an estimated cost.\n\nHow a grader verifies this: Grader verifies that each consecutive leg in the itinerary includes a travel mode and estimated cost, and that Google route or fare result pages are visibly used as evidence for representative legs.","max_points":250},{"criterion":"One accommodation option near each arena is identified for the relevant game night on Booking.com, with estimated nightly pricing and proximity that supports attending the game.","description":"One accommodation option near each arena is identified for the relevant game night on Booking.com, with estimated nightly pricing and proximity that supports attending the game.\n\nHow a grader verifies this: Grader confirms 30 hotel entries are present and that several Booking.com listing pages with map/photo views are open as browser proof.","max_points":150},{"criterion":"At least one must-try local food recommendation near each arena is identified on Yelp.","description":"At least one must-try local food recommendation near each arena is identified on Yelp.\n\nHow a grader verifies this: Grader checks that every stop has a food recommendation and that sample Yelp business pages are open showing location and reviews near the arena 
area.","max_points":100},{"criterion":"A CryptPad Document is created and left open containing the complete integrated itinerary with all 30 stops, including game details, star-player note, travel leg, hotel, food recommendation, per-stop costs, and a summarized total estimated trip cost.","description":"A CryptPad Document is created and left open containing the complete integrated itinerary with all 30 stops, including game details, star-player note, travel leg, hotel, food recommendation, per-stop costs, and a summarized total estimated trip cost.\n\nHow a grader verifies this: Grader confirms the CryptPad Document is open and includes all required columns or fields for all 30 stops plus a total cost summary.","max_points":150}]}} +{"task_id":"fc98d55986ef93480fb659db44e070c04f93301a","confirmed_task":"I’m daydreaming about doing a big summer baseball trip through Japan where I see exactly one game at every single NPB stadium, and I want it planned like something I could actually book, not just a rough idea. Please start on npb.jp and pull the full summer schedule, then identify all 12 NPB stadiums and pick one realistic summer game at each stadium so the dates can fit into one chronological trip. Once you’ve got those game dates, use Google Flights and Google Travel to figure out the cheapest practical route between the cities, mixing flights and trains or other ground transport when that saves money, because I want to keep the total cost under control without making the trip impossible. After that, go to Booking.com and find a place to stay for each game night that’s reasonably close to the stadium, ideally something like a well-rated hotel or business hotel that would be easy after a night game, and open the actual hotel listing so I can see the photos and map. 
Then use Google Search and Yelp to figure out the best Japanese food plan in each stadium city — I’m hoping for a real food tour feel, so look for standout ramen shops, izakayas, sushi spots, local specialties, and markets where that makes sense. Please open a couple of the most promising food spots in separate tabs for at least a few cities so I can visually compare whether they look worth it. Finally, put everything together in a CryptPad Document as one clean chronological itinerary with the stadium, matchup, date, city, travel leg, transport mode, hotel, food plan, and estimated costs for each stop, and leave the doc open at the end so I can review it.","website":"https://www.google.com","level":"hard","reference_length":6,"categories":["Sports > Baseball","Travel and Tourism > Travel and Tourism - Other"],"precomputed_rubric":{"items":[{"criterion":"The agent identifies all 12 NPB stadiums and selects one summer game date and matchup for each stadium from npb.jp.","description":"The agent identifies all 12 NPB stadiums and selects one summer game date and matchup for each stadium from npb.jp.\n\nHow a grader verifies this: Grader can confirm that the final itinerary contains 12 distinct stadiums with valid summer dates and matchups consistent with the schedule pages viewed on npb.jp.","max_points":240},{"criterion":"The agent creates a feasible chronological route connecting all 12 stadium cities using the cheapest practical mix of flights and ground transport.","description":"The agent creates a feasible chronological route connecting all 12 stadium cities using the cheapest practical mix of flights and ground transport.\n\nHow a grader verifies this: Grader can confirm that each intercity leg has a stated transport mode and estimated cost, and that the route order aligns with the selected game dates without impossible overlaps.","max_points":200},{"criterion":"The agent finds one accommodation near each stadium for the corresponding game night and uses 
actual Booking.com listing pages for the selections.","description":"The agent finds one accommodation near each stadium for the corresponding game night and uses actual Booking.com listing pages for the selections.\n\nHow a grader verifies this: Grader can confirm 12 lodging entries with hotel names, nightly prices, and proximity rationale, and can verify from open listing pages that real hotel detail pages with photos/maps were used.","max_points":180},{"criterion":"The agent researches notable Japanese food experiences in each stadium city, including local specialties or food districts where relevant.","description":"The agent researches notable Japanese food experiences in each stadium city, including local specialties or food districts where relevant.\n\nHow a grader verifies this: Grader can confirm that each city has food context beyond generic dining, such as a local specialty, market, neighborhood, or cuisine angle sourced from Google research.","max_points":120},{"criterion":"The agent builds a detailed food itinerary for each city with named venues such as ramen shops, izakayas, sushi spots, or markets, using Yelp pages where available.","description":"The agent builds a detailed food itinerary for each city with named venues such as ramen shops, izakayas, sushi spots, or markets, using Yelp pages where available.\n\nHow a grader verifies this: Grader can confirm that each stop includes specific venue names and meal ideas, and that at least some restaurant pages were opened in separate tabs for visual comparison.","max_points":120},{"criterion":"The agent compiles a complete integrated itinerary in CryptPad Document with games, travel, hotels, food plans, and itemized costs in chronological order.","description":"The agent compiles a complete integrated itinerary in CryptPad Document with games, travel, hotels, food plans, and itemized costs in chronological order.\n\nHow a grader verifies this: Grader can confirm the CryptPad Document is open and 
contains all 12 stops with the required fields combined into one coherent schedule.","max_points":140}]}} +{"task_id":"56f6e52a7d77ae7831e527f64e6544b1f929904b","confirmed_task":"I'm starting to get serious about going on the academic job market, and I want a really thorough browser-based sweep of faculty openings at the top 50 U.S. computer science schools, not just a quick skim of a couple of job boards. Please start by opening CryptPad Sheets and create a spreadsheet called \"CS Faculty Job Search\" with columns for university, department or school, job title, area, rank, deadline, posting link, and a short verification note, because I want a clean tracker I can review later. Then use CSRankings to pull the top 50 U.S. CS schools and keep that page open as the master checklist so I can see exactly which schools were covered. From there, go school by school and actually visit the relevant CS department pages, engineering school faculty hiring pages, and university jobs or careers sites as needed to verify whether there are any tenure-track, tenured, or open-rank faculty jobs in computer science, machine learning, AI, data science, robotics, vision, NLP, or closely related areas. If you find a relevant opening, open the actual posting page in its own tab and read it directly rather than relying on search snippets, and if you had to use a department hiring page or university careers page to confirm it, leave that verification page open too so I can inspect the trail myself. If a school seems to have no relevant opening, I still want you to verify that as carefully as possible from the department or university hiring information before moving on, because the absence is useful too. As you go, keep the spreadsheet updated so each row corresponds to a real posting page you currently have open in a tab, with the university, department or school, title, area, rank, deadline if listed, link, and a brief note saying how you verified it. 
At the end, do a completeness pass against the full top-50 checklist, make sure the important tabs are still open, and add a short summary report with the main patterns you noticed and any recommendations about where the strongest cluster of relevant openings seems to be.","website":"https://www.google.com","level":"hard","reference_length":7,"categories":["Jobs and Career > Jobs and Employment","Science and Education > Universities and Colleges"],"precomputed_rubric":{"items":[{"criterion":"A CryptPad Sheet titled \"CS Faculty Job Search\" exists and includes the required columns for university, department or school, job title, area, rank, deadline, posting link, and verification note.","description":"A CryptPad Sheet titled \"CS Faculty Job Search\" exists and includes the required columns for university, department or school, job title, area, rank, deadline, posting link, and verification note.\n\nHow a grader verifies this: Grader can see the spreadsheet open in the browser with the exact title and visible header row containing all required fields.","max_points":100},{"criterion":"A credible top-50 U.S. computer science school list is identified from CSRankings and kept available as the master checklist.","description":"A credible top-50 U.S. 
computer science school list is identified from CSRankings and kept available as the master checklist.\n\nHow a grader verifies this: Grader can see an open CSRankings tab showing the rankings source used for the top 50 schools.","max_points":120},{"criterion":"All 50 schools from the checklist are systematically checked using department, school, and/or university hiring pages as needed.","description":"All 50 schools from the checklist are systematically checked using department, school, and/or university hiring pages as needed.\n\nHow a grader verifies this: Spreadsheet completion note and verification notes indicate coverage of all 50 schools, and open tabs/history show school-by-school checking from original institutional sources.","max_points":240},{"criterion":"Every relevant faculty opening found is verified from the actual posting page and each posting is opened in its own browser tab.","description":"Every relevant faculty opening found is verified from the actual posting page and each posting is opened in its own browser tab.\n\nHow a grader verifies this: Grader can inspect open tabs for actual posting pages and confirm the recorded links point to live original postings rather than summaries or search results.","max_points":180},{"criterion":"When a posting or absence required additional confirmation, the corresponding department hiring page or university careers page is also kept open as verification evidence.","description":"When a posting or absence required additional confirmation, the corresponding department hiring page or university careers page is also kept open as verification evidence.\n\nHow a grader verifies this: Open tabs include supporting department or careers pages for cases where the verification note says those pages were used.","max_points":100},{"criterion":"The spreadsheet contains one row per verified posting, and each row accurately includes all required fields plus a brief note on how the posting was verified.","description":"The 
spreadsheet contains one row per verified posting, and each row accurately includes all required fields plus a brief note on how the posting was verified.\n\nHow a grader verifies this: Grader can compare spreadsheet rows against open posting tabs and see complete, populated fields with matching links and verification notes.","max_points":160},{"criterion":"A final completeness review is performed confirming coverage of all 50 schools, consistency between spreadsheet rows and open tabs, and a concise summary of schools with no verified relevant openings plus overall patterns or recommendations.","description":"A final completeness review is performed confirming coverage of all 50 schools, consistency between spreadsheet rows and open tabs, and a concise summary of schools with no verified relevant openings plus overall patterns or recommendations.\n\nHow a grader verifies this: Grader can see a final summary/completion note in the spreadsheet and confirm the important tabs remain open for inspection.","max_points":100}]}} +{"task_id":"40735c71648c0ca0e6291d534685853acf1122c1","confirmed_task":"I’m trying to map out a summer concert run and I’d love your help doing it in a real browser so I can actually look at the pages afterward. Please use Google to find the official tour or ticket pages for Zedd, Bad Bunny, and TWICE, and for each artist open the real official tour page in its own tab and pull out every U.S. show happening during the summer months, with the city, venue, date, and the actual ticket or event link. I want at least 10 total concert options across the three artists, and I’d like them organized in a CryptPad Sheets spreadsheet titled Summer Concert Plan so I can compare everything in one place. Once you’ve got the concert list, look across the dates and figure out a realistic sequence where I could attend at least one show from each artist in the same summer without impossible timing. 
Then use Google Flights to check real flight options for each leg of that route, making sure the dates and airports line up with the concert schedule, and open the flight results you used in tabs so I can review them. After that, use Booking.com to find a hotel in each concert city for the nights I’d need, and open the actual hotel listing pages so I can see the photos, prices, and location. Then use Google Maps or Google Search to find a couple of food options near each venue or hotel, because I want this to feel like a full trip and not just a list of shows. Please finish the spreadsheet with the concert options plus the chosen three-concert itinerary, including the verified flights, hotels, and food picks, and keep the official artist pages and the key booking tabs open so I can compare everything visually.","website":"https://www.google.com","level":"hard","reference_length":12,"categories":["Arts & Entertainment > Music","Ecommerce & Shopping > Tickets"],"precomputed_rubric":{"items":[{"criterion":"Zedd's official tour or ticket page is found and opened in a browser tab.","description":"Zedd's official tour or ticket page is found and opened in a browser tab.\n\nHow a grader verifies this: A visible tab shows Zedd's official site or official ticket page with tour/event information.","max_points":60},{"criterion":"Qualifying U.S. summer Zedd concerts are extracted with city, venue, date, and ticket/event link.","description":"Qualifying U.S. 
summer Zedd concerts are extracted with city, venue, date, and ticket/event link.\n\nHow a grader verifies this: Zedd entries appear in the working notes or spreadsheet with all required fields and correspond to the open official page.","max_points":80},{"criterion":"Bad Bunny's official tour or ticket page is found and opened in a browser tab.","description":"Bad Bunny's official tour or ticket page is found and opened in a browser tab.\n\nHow a grader verifies this: A visible tab shows Bad Bunny's official site or official ticket page with tour/event information.","max_points":60},{"criterion":"Qualifying U.S. summer Bad Bunny concerts are extracted with city, venue, date, and ticket/event link.","description":"Qualifying U.S. summer Bad Bunny concerts are extracted with city, venue, date, and ticket/event link.\n\nHow a grader verifies this: Bad Bunny entries appear in the working notes or spreadsheet with all required fields and match the open official page.","max_points":80},{"criterion":"TWICE's official tour or ticket page is found and opened in a browser tab.","description":"TWICE's official tour or ticket page is found and opened in a browser tab.\n\nHow a grader verifies this: A visible tab shows TWICE's official site or official ticket page with tour/event information.","max_points":60},{"criterion":"Qualifying U.S. summer TWICE concerts are extracted with city, venue, date, and ticket/event link.","description":"Qualifying U.S. summer TWICE concerts are extracted with city, venue, date, and ticket/event link.\n\nHow a grader verifies this: TWICE entries appear in the working notes or spreadsheet with all required fields and match the open official page.","max_points":80},{"criterion":"A CryptPad Sheets spreadsheet titled 'Summer Concert Plan' is created with at least 10 total U.S. summer concert entries across the three artists.","description":"A CryptPad Sheets spreadsheet titled 'Summer Concert Plan' is created with at least 10 total U.S. 
summer concert entries across the three artists.\n\nHow a grader verifies this: The spreadsheet title is visible and the sheet contains at least 10 rows of concert options with artist, city, venue, date, and ticket link columns filled.","max_points":140},{"criterion":"A feasible same-summer itinerary is selected that includes at least one concert from Zedd, Bad Bunny, and TWICE.","description":"A feasible same-summer itinerary is selected that includes at least one concert from Zedd, Bad Bunny, and TWICE.\n\nHow a grader verifies this: The sheet clearly marks or lists a three-concert route with one selected show per artist and no obvious date conflicts.","max_points":120},{"criterion":"Real flight options are verified for each leg of the selected itinerary and aligned to the concert dates.","description":"Real flight options are verified for each leg of the selected itinerary and aligned to the concert dates.\n\nHow a grader verifies this: Google Flights results tabs are open and the sheet records airports, airlines, dates, times, and links matching the chosen route.","max_points":100},{"criterion":"Hotels are verified for each stop in the selected itinerary using Booking.com listing pages.","description":"Hotels are verified for each stop in the selected itinerary using Booking.com listing pages.\n\nHow a grader verifies this: Booking.com hotel listing tabs are open for each city and the sheet records hotel names, stay dates, and booking links.","max_points":80},{"criterion":"Food options are included for each stop in the selected itinerary.","description":"Food options are included for each stop in the selected itinerary.\n\nHow a grader verifies this: The sheet contains nearby dining choices tied to each concert city, venue, or hotel, sourced from Google Search or Maps.","max_points":50},{"criterion":"The final sheet contains the full verified itinerary and the official artist pages plus key booking tabs remain open for visual review.","description":"The final 
sheet contains the full verified itinerary and the official artist pages plus key booking tabs remain open for visual review.\n\nHow a grader verifies this: The completed sheet shows concert options and the chosen travel plan with flights, hotels, and food, while official tour pages and booking/result tabs are still visible in the browser.","max_points":90}]}} +{"task_id":"ea8661bcf1150ea65b11aa0f2041cd4b1047d90f","confirmed_task":"I’m trying to build a serious law school research sheet for myself, and I want to base it on a major ranking rather than random lists. Please start on U.S. News and use its law school rankings to pull the top 20 U.S. law schools, including each school’s city and state, so I have a clean shortlist to work from. Then, for each of those 20 schools, go to the school’s official law school website and open the JD admissions page in its own tab so I can visually compare the official pages later. On each admissions page, verify the current JD application deadline, whether they accept the LSAT, GRE, or both for JD applicants, and the application fee if it’s listed on the admissions or application requirements pages. After that, stay on the official school sites and look for at least one real funding opportunity per school when possible—things like merit scholarships, named scholar programs, public interest fellowships, or other law student funding—so that we end up with at least 20 official funding opportunities total across the 20 schools. When you find one, open the actual program page in a new tab and verify the eligibility requirements and the funding amount or benefit if the school gives one, because I want this based only on official pages I could click through myself. Once you’ve gathered everything, create a CryptPad Sheets spreadsheet titled Top Law Schools and Fellowships and enter one row for each school with the school name, location, application deadline, LSAT/GRE policy, application fee, and the official admissions link. 
Then add the funding opportunities in the same sheet or a clearly labeled second tab with the school name, scholarship or fellowship name, eligibility criteria, funding amount or benefit if listed, and the official program link. Please leave the key admissions tabs and several of the funding tabs open so I can spot-check them in the browser, and finish with a short written summary in the sheet or an adjacent CryptPad Document explaining which schools seem to offer the most generous funding and whether the programs you found lean more toward public interest, leadership, or academic excellence.","website":"https://www.google.com","level":"hard","reference_length":10,"categories":["Science and Education > Universities and Colleges","Law and Government > Legal"],"precomputed_rubric":{"items":[{"criterion":"A top-20 list of U.S. law schools is taken from U.S. News and includes school names and locations.","description":"A top-20 list of U.S. law schools is taken from U.S. News and includes school names and locations.\n\nHow a grader verifies this: Grader can see U.S. 
News ranking page used as the source and confirm 20 schools with city/state recorded in the spreadsheet.","max_points":100},{"criterion":"The official JD admissions page for each of the 20 schools is opened in separate tabs from the schools' official domains.","description":"The official JD admissions page for each of the 20 schools is opened in separate tabs from the schools' official domains.\n\nHow a grader verifies this: Browser shows multiple official admissions tabs open, and spreadsheet admissions links point to official law school pages.","max_points":100},{"criterion":"The JD application deadline is verified and recorded for each of the 20 schools.","description":"The JD application deadline is verified and recorded for each of the 20 schools.\n\nHow a grader verifies this: Each school row contains a deadline, and spot-checking open admissions tabs confirms the recorded dates or deadline language.","max_points":90},{"criterion":"The LSAT/GRE policy is verified and recorded for each of the 20 schools.","description":"The LSAT/GRE policy is verified and recorded for each of the 20 schools.\n\nHow a grader verifies this: Each school row includes whether LSAT, GRE, or both are accepted, and spot-checking official admissions pages matches the entries.","max_points":90},{"criterion":"The JD application fee is verified and recorded for each of the 20 schools when listed on official pages.","description":"The JD application fee is verified and recorded for each of the 20 schools when listed on official pages.\n\nHow a grader verifies this: Each school row includes an application fee or a clearly indicated official absence/unavailability, supported by official admissions or application pages.","max_points":80},{"criterion":"At least 20 official law student funding opportunities are identified across the selected schools.","description":"At least 20 official law student funding opportunities are identified across the selected schools.\n\nHow a grader verifies this: 
Spreadsheet contains 20 or more funding entries tied to official school programs, with school names and program names populated.","max_points":130},{"criterion":"Each identified funding opportunity has an official program page opened and verified for eligibility requirements and funding amount or benefit if listed.","description":"Each identified funding opportunity has an official program page opened and verified for eligibility requirements and funding amount or benefit if listed.\n\nHow a grader verifies this: Representative funding tabs remain open, and funding rows include eligibility details plus amount/benefit information or note that no amount was listed officially.","max_points":130},{"criterion":"A CryptPad Sheets file titled 'Top Law Schools and Fellowships' is created and populated with all 20 school admissions records.","description":"A CryptPad Sheets file titled 'Top Law Schools and Fellowships' is created and populated with all 20 school admissions records.\n\nHow a grader verifies this: Grader can see the spreadsheet title and 20 school rows containing school name, location, deadline, LSAT/GRE policy, application fee, and admissions link.","max_points":100},{"criterion":"The spreadsheet includes at least 20 funding opportunity entries with school name, program name, eligibility, funding amount or benefit if listed, and official link.","description":"The spreadsheet includes at least 20 funding opportunity entries with school name, program name, eligibility, funding amount or benefit if listed, and official link.\n\nHow a grader verifies this: Funding sheet or section contains 20 or more complete entries with clickable official links and corresponding details.","max_points":100},{"criterion":"A final synthesis compares which schools appear most generous and categorizes program emphasis as public interest, leadership, or academic excellence.","description":"A final synthesis compares which schools appear most generous and categorizes program emphasis 
as public interest, leadership, or academic excellence.\n\nHow a grader verifies this: A written summary is present in the spreadsheet or adjacent CryptPad Document and references patterns visible in the compiled funding data.","max_points":80}]}} +{"task_id":"ef766b69020befdc8e208f47401cb6bce5e9b931","confirmed_task":"I’m planning a pretty big house renovation in the Dallas–Fort Worth area and want to build a solid shortlist of contractors before I start calling people, so could you help me research this in the browser and keep the evidence visible? Please start in CryptPad Document and create a spreadsheet-style document titled DFW Renovation Contractors with sections for Landscapers, Plumbers, Electricians, and a Final Summary, because I want everything in one place while I compare options. Then use Google Search to find landscaping companies that clearly serve Dallas or Fort Worth, and for each one open the actual company website or business profile page in its own tab so I can inspect the details later; as you go, verify that the company really does landscaping work, note the rating or review score if one is shown, and confirm from the site or profile that the service area includes Dallas, Fort Worth, or the broader DFW area. I need at least 10 landscapers that meet those checks, and once you’ve verified them, record the company name, service type, location, rating if available, and the direct link to the company page in the document. After that, do the same thing for plumbers serving Dallas or Fort Worth, again making sure each qualifying company has its own tab left open on the actual page and that the document captures the same fields for at least 10 plumbers. Then repeat the process for electricians in the same area, with at least 10 verified entries and each company page still open in a tab so I can visually compare them afterward. 
When all three categories are filled out, do a cleanup pass in the document and across the open tabs to make sure every listed contractor still has a matching tab open, every entry really serves Dallas or Fort Worth, and every row has the required details. Finally, add a short summary explaining which companies seem to have the strongest reputation and the broadest Dallas/Fort Worth coverage based on the ratings, reviews, and service-area evidence you found, and leave the document plus the contractor tabs open so I can review everything on screen.","website":"https://www.google.com","level":"hard","reference_length":9,"categories":["Home and Garden > Home Improvement and Maintenance","Heavy Industry and Engineering > Construction and Maintenance"],"precomputed_rubric":{"items":[{"criterion":"A CryptPad document titled 'DFW Renovation Contractors' exists and is organized with sections for Landscapers, Plumbers, Electricians, and Final Summary, including fields for company name, service type, location, rating if available, and company-page link.","description":"A CryptPad document titled 'DFW Renovation Contractors' exists and is organized with sections for Landscapers, Plumbers, Electricians, and Final Summary, including fields for company name, service type, location, rating if available, and company-page link.\n\nHow a grader verifies this: Grader can see the document title and the structured sections or tables with the required columns/fields visible in CryptPad Document.","max_points":80},{"criterion":"At least 10 qualifying landscapers serving Dallas, Fort Worth, or DFW are identified from Google Search and each has an actual company website or business profile page open in its own tab.","description":"At least 10 qualifying landscapers serving Dallas, Fort Worth, or DFW are identified from Google Search and each has an actual company website or business profile page open in its own tab.\n\nHow a grader verifies this: Grader can inspect open tabs and confirm 
there are at least 10 landscaper company/profile pages corresponding to qualifying businesses with visible service and service-area evidence.","max_points":120},{"criterion":"The document contains complete recorded details for at least 10 verified landscapers: company name, service type, location, rating if available, and direct link to the company page.","description":"The document contains complete recorded details for at least 10 verified landscapers: company name, service type, location, rating if available, and direct link to the company page.\n\nHow a grader verifies this: Grader can review the Landscapers section in the document and confirm at least 10 entries with all required fields populated and matching the open tabs.","max_points":120},{"criterion":"At least 10 qualifying plumbers serving Dallas, Fort Worth, or DFW are identified from Google Search and each has an actual company website or business profile page open in its own tab.","description":"At least 10 qualifying plumbers serving Dallas, Fort Worth, or DFW are identified from Google Search and each has an actual company website or business profile page open in its own tab.\n\nHow a grader verifies this: Grader can inspect open tabs and confirm there are at least 10 plumber company/profile pages corresponding to qualifying businesses with visible service and service-area evidence.","max_points":120},{"criterion":"The document contains complete recorded details for at least 10 verified plumbers: company name, service type, location, rating if available, and direct link to the company page.","description":"The document contains complete recorded details for at least 10 verified plumbers: company name, service type, location, rating if available, and direct link to the company page.\n\nHow a grader verifies this: Grader can review the Plumbers section in the document and confirm at least 10 entries with all required fields populated and matching the open tabs.","max_points":120},{"criterion":"At 
least 10 qualifying electricians serving Dallas, Fort Worth, or DFW are identified from Google Search and each has an actual company website or business profile page open in its own tab.","description":"At least 10 qualifying electricians serving Dallas, Fort Worth, or DFW are identified from Google Search and each has an actual company website or business profile page open in its own tab.\n\nHow a grader verifies this: Grader can inspect open tabs and confirm there are at least 10 electrician company/profile pages corresponding to qualifying businesses with visible service and service-area evidence.","max_points":120},{"criterion":"The document contains complete recorded details for at least 10 verified electricians: company name, service type, location, rating if available, and direct link to the company page.","description":"The document contains complete recorded details for at least 10 verified electricians: company name, service type, location, rating if available, and direct link to the company page.\n\nHow a grader verifies this: Grader can review the Electricians section in the document and confirm at least 10 entries with all required fields populated and matching the open tabs.","max_points":120},{"criterion":"A validation pass confirms every listed contractor has a corresponding open tab, clearly serves Dallas or Fort Worth, and the final list still contains at least 10 valid entries in each service type after any replacements.","description":"A validation pass confirms every listed contractor has a corresponding open tab, clearly serves Dallas or Fort Worth, and the final list still contains at least 10 valid entries in each service type after any replacements.\n\nHow a grader verifies this: Grader can compare the document entries against the open tabs and confirm that each recorded contractor is backed by a visible page and meets the service-area requirement.","max_points":100},{"criterion":"The Final Summary identifies which contractors appear 
strongest based on reputation and Dallas/Fort Worth service coverage using evidence from the verified company pages and profiles.","description":"The Final Summary identifies which contractors appear strongest based on reputation and Dallas/Fort Worth service coverage using evidence from the verified company pages and profiles.\n\nHow a grader verifies this: Grader can read the Final Summary section and see comparative conclusions tied to ratings/reviews and service-area coverage from the collected entries and open tabs.","max_points":100}]}} +{"task_id":"7aab821efa9c268801d21ad8cf2ca60a82c699b3","confirmed_task":"I'm seriously thinking about applying to MBA programs in the U.S., and I want a solid research sheet I can actually use to decide where to apply and where funding might be strongest. Please start by using U.S. News and then Poets&Quants to build a defensible list of the top 20 full-time MBA programs in the United States, mainly so I have a realistic shortlist based on major rankings rather than just reputation. Once that list looks settled, go to each school's official MBA admissions site and open every program's admissions page in its own tab so I can visually inspect the official pages later, and for each one capture the application deadline, the GMAT/GRE or test-waiver policy, the application fee, the program length, and the admissions URL. After that, use the official business school or university financial aid pages for those same schools to find MBA-specific fellowships, scholarships, or named funding programs, and whenever you find one, open the actual official funding page in its own tab and verify the eligibility rules and the funding amount if the page lists one. I want at least 20 schools and at least 20 total funding opportunities across them, so if a school's main admissions page is vague, keep digging on the official school domain until you find the clearest scholarship or fellowship source. 
Then create a spreadsheet or document titled Top MBA Programs and Fellowships and record, for each school, the school name, MBA program name, application deadline, GMAT/GRE policy, application fee, program length, and admissions link, followed by any associated fellowship or scholarship names, eligibility criteria, funding amount if listed, and the link to the official funding page. Please keep the official admissions tabs and the official funding tabs open so I can compare them side by side afterward, and before you finish, do one pass through the open tabs to make sure every row in the sheet has matching browser proof. At the end, add a short summary telling me which schools seem to offer the biggest funding opportunities and whether the awards you found are mostly merit-based, diversity-focused, leadership-oriented, need-based, or something else, because I want a quick sense of where I should spend the most application effort.","website":"https://www.google.com","level":"hard","reference_length":8,"categories":["Science and Education > Universities and Colleges","Science and Education > Business Training"],"precomputed_rubric":{"items":[{"criterion":"A defensible final list of 20 top U.S. full-time MBA programs is established using both U.S. News and Poets&Quants.","description":"A defensible final list of 20 top U.S. full-time MBA programs is established using both U.S. 
News and Poets&Quants.\n\nHow a grader verifies this: Grader can confirm that the document includes 20 schools and that the selected set is based on evidence gathered from both ranking sites.","max_points":120},{"criterion":"Official admissions pages are opened in separate tabs for all 20 selected schools and the required admissions details are captured for each school.","description":"Official admissions pages are opened in separate tabs for all 20 selected schools and the required admissions details are captured for each school.\n\nHow a grader verifies this: Grader can inspect open official admissions tabs and match them to document rows containing deadline, GMAT/GRE policy, application fee, program length, and admissions URL for all 20 schools.","max_points":220},{"criterion":"MBA-specific funding opportunities are identified on official school domains across the selected schools, with at least 20 total fellowships or scholarships found.","description":"MBA-specific funding opportunities are identified on official school domains across the selected schools, with at least 20 total fellowships or scholarships found.\n\nHow a grader verifies this: Grader can count at least 20 funding entries in the document and confirm they are tied to official school sources rather than third-party summaries.","max_points":160},{"criterion":"Each listed fellowship or scholarship is verified on its own official page with eligibility criteria and funding amount recorded when available.","description":"Each listed fellowship or scholarship is verified on its own official page with eligibility criteria and funding amount recorded when available.\n\nHow a grader verifies this: Grader can inspect open official funding tabs and confirm that each listed award has a matching page and includes eligibility details plus funding amount if the page provides one.","max_points":160},{"criterion":"Coverage is complete, meaning all 20 MBA programs are documented and the total verified funding 
opportunities reach at least 20 after gap-filling.","description":"Coverage is complete, meaning all 20 MBA programs are documented and the total verified funding opportunities reach at least 20 after gap-filling.\n\nHow a grader verifies this: Grader can verify final counts in the document and see that missing or unclear schools were supplemented with additional official-source research.","max_points":100},{"criterion":"A document or spreadsheet titled 'Top MBA Programs and Fellowships' is created and organized with all required school and funding fields.","description":"A document or spreadsheet titled 'Top MBA Programs and Fellowships' is created and organized with all required school and funding fields.\n\nHow a grader verifies this: Grader can open the created file and confirm the exact title and the presence of school-level admissions data plus funding-level details and links.","max_points":100},{"criterion":"Browser-proof is preserved by keeping official admissions and funding pages open for the documented entries.","description":"Browser-proof is preserved by keeping official admissions and funding pages open for the documented entries.\n\nHow a grader verifies this: Grader can inspect the browser state and confirm that official admissions tabs and official funding tabs remain open and correspond to the entries in the document.","max_points":60},{"criterion":"The final document includes a comparative summary of the largest funding opportunities and the main funding categories observed.","description":"The final document includes a comparative summary of the largest funding opportunities and the main funding categories observed.\n\nHow a grader verifies this: Grader can read the concluding section and confirm it identifies schools with stronger funding and categorizes awards into patterns such as merit, diversity, leadership, need-based, or other.","max_points":80}]}} +{"task_id":"e7ae8abcf742d5ba2ef4eef88d16bbe26df978e2","confirmed_task":"I'm trying to get 
a realistic picture of the top U.S. medical schools before I go too far down the application rabbit hole, and I want this organized in a way I can actually review later. Please start by using U.S. News to pull together a top-20 list of U.S. medical schools, then cross-check that list on Times Higher Education so the schools we keep are broadly supported by major rankings rather than coming from just one source. Once that shortlist looks solid, go to each school's official MD admissions site and open the actual admissions page in its own tab so I can see the real pages later, and from those official pages verify the application deadline, whether the MCAT is required, and a brief program overview with the school name, university, and location. After that, stay on the official university or medical school sites and look for at least one funding opportunity, fellowship, scholarship, or special training program tied to medical students for each school where possible — things like merit scholarships, leadership fellowships, research fellowships, or service and specialized pathway programs. When you find a relevant program, open the actual program page in a new tab and verify the eligibility details and any funding amount if it's listed, because I want browser-proof tabs open to the official sources rather than just a summary. Then create a CryptPad document titled Top Medical Schools and Fellowships and fill it in with one structured entry for each of the 20 schools, including the admissions link and the associated program links, and make sure there are at least 20 total funding or special-program entries across the whole document. 
At the end, leave the document open and also keep a representative set of the admissions and program tabs open so I can visually spot-check them, then add a short summary explaining which schools seem to offer the strongest funding support and whether the programs you found lean more toward research, leadership, or service, along with your top recommendations for where I should focus first.","website":"https://www.google.com","level":"hard","reference_length":9,"categories":["Science and Education > Universities and Colleges","Health > Medicine"],"precomputed_rubric":{"items":[{"criterion":"A top-20 list of U.S. medical schools is established from U.S. News and used as the initial candidate set.","description":"A top-20 list of U.S. medical schools is established from U.S. News and used as the initial candidate set.\n\nHow a grader verifies this: Visible evidence from U.S. News ranking pages and a recorded list of 20 candidate schools reflected in the working notes or final document.","max_points":80},{"criterion":"The selected schools are cross-validated against Times Higher Education and a final top-20 set is chosen based on major rankings support.","description":"The selected schools are cross-validated against Times Higher Education and a final top-20 set is chosen based on major rankings support.\n\nHow a grader verifies this: Visible THE ranking page or search results are used to confirm overlap or support for the selected schools, and the final 20-school set is consistent in the document.","max_points":80},{"criterion":"Official MD admissions pages are opened for all 20 selected schools in separate tabs.","description":"Official MD admissions pages are opened for all 20 selected schools in separate tabs.\n\nHow a grader verifies this: Browser shows official university or medical school admissions tabs for the selected schools, and the document includes official admissions links for each school.","max_points":140},{"criterion":"Application deadline and MCAT 
requirement are accurately verified from official sources for all 20 schools.","description":"Application deadline and MCAT requirement are accurately verified from official sources for all 20 schools.\n\nHow a grader verifies this: Each school entry in the document contains a deadline and MCAT requirement sourced from official admissions pages, with values matching the visible source tabs.","max_points":140},{"criterion":"Program overview information, school name, university, and location are accurately captured for all 20 schools from official pages.","description":"Program overview information, school name, university, and location are accurately captured for all 20 schools from official pages.\n\nHow a grader verifies this: Each school entry includes identifying details and a concise overview that align with the official program or school overview pages.","max_points":100},{"criterion":"Funding opportunities or special programs relevant to medical students are identified across the selected schools.","description":"Funding opportunities or special programs relevant to medical students are identified across the selected schools.\n\nHow a grader verifies this: The document contains associated scholarships, fellowships, or special programs tied to the schools, sourced from official university pages.","max_points":100},{"criterion":"At least 20 funding opportunities or special programs are individually verified on official program pages, including eligibility and funding information when listed.","description":"At least 20 funding opportunities or special programs are individually verified on official program pages, including eligibility and funding information when listed.\n\nHow a grader verifies this: There are at least 20 program entries with official links, and representative open tabs show program pages containing eligibility details and funding amounts where available.","max_points":140},{"criterion":"A CryptPad Document titled 'Top Medical Schools and 
Fellowships' is created and contains complete structured entries for all 20 schools and their associated programs.","description":"A CryptPad Document titled 'Top Medical Schools and Fellowships' is created and contains complete structured entries for all 20 schools and their associated programs.\n\nHow a grader verifies this: The CryptPad Document title matches exactly, the document is open in the browser, and it includes 20 school records with admissions and program details plus links.","max_points":140},{"criterion":"The final document includes a synthesis comparing which schools appear to provide the most funding support and whether programs emphasize research, leadership, or service, while confirming the minimum counts and leaving proof tabs open.","description":"The final document includes a synthesis comparing which schools appear to provide the most funding support and whether programs emphasize research, leadership, or service, while confirming the minimum counts and leaving proof tabs open.\n\nHow a grader verifies this: The bottom of the CryptPad Document contains a written summary and count confirmation, and the browser still shows the document plus representative admissions and program tabs open.","max_points":80}]}} +{"task_id":"feba3355ecae6838d521294bbca8e50cf99f0a53","confirmed_task":"I'm getting serious about applying to computer science PhD programs in machine learning and AI, and I want a realistic shortlist of schools plus specific professors who might actually be good advisor matches. Please start on CSRankings and use it to pull together the top 25 U.S. computer science PhD programs as the base list, because I want something credible and standardized rather than a random blog ranking. 
Then open CryptPad Sheets and create a spreadsheet called \"ML PhD Programs and Advisors\" with columns for university, professor name, research area, whether they appear to be accepting students, the exact evidence or wording you found, professor page link, and lab or research group link if there is one. After that, go school by school through the CS department faculty directories for those top programs and look specifically for faculty working in machine learning, artificial intelligence, data science, NLP, robotics, or computer vision, since those are the areas I'm most likely to apply in. For each promising professor, open the actual faculty profile or personal website in a new tab, and if they have a lab page open that too in another tab so I can visually inspect the pages later; I especially want you to look for signs like \"accepting PhD students,\" \"recruiting,\" application guidance for prospective students, active lab rosters, recent projects, or anything else that suggests they are actively supervising graduate students right now. Please record at least 20 professors across the top schools, but try to spread them across the list instead of clustering everything at just MIT, Stanford, Berkeley, and CMU, because I want a broad application strategy. As you go, keep the relevant professor and lab tabs open so I can review the evidence on screen, and if a professor does not explicitly say they are accepting students, note that clearly rather than guessing. Once the sheet looks complete, do a consistency pass to make sure the links work, the research areas match what is actually on the pages, and the accepting-students status is backed by visible evidence. 
Then add a short summary section in the sheet or a companion doc explaining which universities seem to have the biggest clusters of ML/AI faculty who appear open to advising new PhD students, so I can see where my odds and fit might be strongest.","website":"https://www.google.com","level":"hard","reference_length":8,"categories":["Science and Education > Universities and Colleges","Computers Electronics and Technology > Programming and Developer Software"],"precomputed_rubric":{"items":[{"criterion":"A credible top-25 list of U.S. computer science PhD programs is identified on CSRankings and used as the basis for the research.","description":"A credible top-25 list of U.S. computer science PhD programs is identified on CSRankings and used as the basis for the research.\n\nHow a grader verifies this: Grader can see CSRankings open with the ranking view and a corresponding list of 25 U.S. universities reflected in the working materials.","max_points":120},{"criterion":"A CryptPad Sheets titled \"ML PhD Programs and Advisors\" is created with the required columns: university, professor name, research area, accepting students status, evidence text, professor page link, and lab or research group link.","description":"A CryptPad Sheets titled \"ML PhD Programs and Advisors\" is created with the required columns: university, professor name, research area, accepting students status, evidence text, professor page link, and lab or research group link.\n\nHow a grader verifies this: Grader can see the spreadsheet title and header row visible in CryptPad Sheets.","max_points":100},{"criterion":"Faculty directories are opened for the target universities and ML/AI-aligned faculty are identified from those directories.","description":"Faculty directories are opened for the target universities and ML/AI-aligned faculty are identified from those directories.\n\nHow a grader verifies this: Grader can see multiple university CS faculty directory tabs open and relevant faculty 
entries visible on those pages.","max_points":130},{"criterion":"Professor personal pages, faculty profiles, or lab pages are opened in separate tabs and used to verify both research alignment and evidence of active advising or student recruitment.","description":"Professor personal pages, faculty profiles, or lab pages are opened in separate tabs and used to verify both research alignment and evidence of active advising or student recruitment.\n\nHow a grader verifies this: Grader can see professor and lab tabs open with visible text such as research topics, lab information, or statements about accepting or recruiting students.","max_points":180},{"criterion":"The spreadsheet contains at least 20 professor entries across the top programs, each with all required fields completed.","description":"The spreadsheet contains at least 20 professor entries across the top programs, each with all required fields completed.\n\nHow a grader verifies this: Grader can count at least 20 filled rows in the sheet and confirm each row includes university, professor, research area, accepting status, evidence, and links.","max_points":200},{"criterion":"The collected set shows broad coverage across the top-25 schools rather than being concentrated in only a small number of universities, and relevant professor or lab pages remain open in tabs.","description":"The collected set shows broad coverage across the top-25 schools rather than being concentrated in only a small number of universities, and relevant professor or lab pages remain open in tabs.\n\nHow a grader verifies this: Grader can see entries spanning a meaningful range of universities in the sheet and multiple supporting tabs still open for inspection.","max_points":100},{"criterion":"The recorded data is internally consistent and links, research areas, and accepting-status entries are validated against the source pages.","description":"The recorded data is internally consistent and links, research areas, and 
accepting-status entries are validated against the source pages.\n\nHow a grader verifies this: Spot checks of several rows against the open tabs show matching research areas, working links, and accepting-status claims supported by visible evidence text.","max_points":90},{"criterion":"A final summary identifies which universities appear to have the largest clusters of ML/AI faculty accepting PhD students, along with key patterns or recommendations.","description":"A final summary identifies which universities appear to have the largest clusters of ML/AI faculty accepting PhD students, along with key patterns or recommendations.\n\nHow a grader verifies this: Grader can see a summary section in the sheet or companion doc that names universities, describes cluster strength, and provides concise takeaways.","max_points":80}]}} +{"task_id":"d8061d694d7a4276f12e8f15c5d3029ab084e7d1","confirmed_task":"I’m helping the same couple plan two separate weddings and I want something I can actually review in the browser afterward. First, use Google to find official venue pages for Napa Valley wedding venues that could realistically handle about 200 guests for a fall wedding, and please open each serious option in its own tab so I can compare them side by side later. I need about 10 Napa venues, and for each one please verify from the actual venue site the venue name, where it is in the Napa Valley area, the stated maximum wedding capacity or closest guest-count language you can find, plus a short description of what kind of place it is. Once you’ve verified those, go to CryptPad Sheets and create a spreadsheet called Napa Wedding Venues with columns for venue name, location, maximum guest capacity, venue description, and link to venue page, then fill it in with the Napa venues you confirmed, making sure the rows match the tabs you’ve kept open. 
After that, switch to Seoul and use Google to find wedding venues or wedding halls specifically in Gangnam-gu, opening each official site or the most authoritative live venue page you can find in its own tab so I can visually review the listings. I’d like around 10 Gangnam options too, and for each one please confirm the Gangnam location, the wedding capacity or at least an approximate size if that’s all the page gives, and what type of venue it is, like hotel, wedding hall, banquet hall, or something similar. Keep those Gangnam tabs open as well, and then give me a short comparison report that summarizes the 10 Seoul options by name, location, approximate size or capacity, and venue type, because I’m trying to see how the Napa and Gangnam venue pools compare before we narrow anything down.","website":"https://www.google.com","level":"hard","reference_length":8,"categories":["Community and Society > Community and Society - Other","Travel and Tourism > Tourist Attractions","Lifestyle > Weddings"],"precomputed_rubric":{"items":[{"criterion":"About 10 Napa Valley wedding venues suitable for around 200 guests are identified via Google and each venue page is opened in its own browser tab.","description":"About 10 Napa Valley wedding venues suitable for around 200 guests are identified via Google and each venue page is opened in its own browser tab.\n\nHow a grader verifies this: Grader can see multiple Napa venue tabs open from search results and confirm the pages correspond to distinct venue sites relevant to Napa Valley weddings.","max_points":140},{"criterion":"Each Napa venue is verified from its official page for venue name, location, guest capacity or closest stated guest-count language, brief description, and source suitability.","description":"Each Napa venue is verified from its official page for venue name, location, guest capacity or closest stated guest-count language, brief description, and source suitability.\n\nHow a grader verifies this: Open Napa tabs 
visibly show venue details or event/wedding information that supports the extracted fields.","max_points":160},{"criterion":"A CryptPad Sheets spreadsheet titled 'Napa Wedding Venues' is created with the required columns: venue name, location, maximum guest capacity, venue description, and link to venue page.","description":"A CryptPad Sheets spreadsheet titled 'Napa Wedding Venues' is created with the required columns: venue name, location, maximum guest capacity, venue description, and link to venue page.\n\nHow a grader verifies this: Grader can see the spreadsheet title and header row in CryptPad Sheets with the exact required columns present.","max_points":100},{"criterion":"The Napa spreadsheet is populated with around 10 verified venue rows, and the entries correspond to the open Napa venue tabs.","description":"The Napa spreadsheet is populated with around 10 verified venue rows, and the entries correspond to the open Napa venue tabs.\n\nHow a grader verifies this: Spreadsheet contains about 10 filled rows and the names/links align with the open Napa venue pages.","max_points":160},{"criterion":"About 10 Gangnam wedding venues or wedding halls are identified and each official or authoritative live venue page is opened in its own browser tab.","description":"About 10 Gangnam wedding venues or wedding halls are identified and each official or authoritative live venue page is opened in its own browser tab.\n\nHow a grader verifies this: Grader can see multiple Gangnam venue tabs open and confirm they are distinct venues relevant to weddings in Gangnam-gu.","max_points":120},{"criterion":"Each Gangnam venue is verified for location, capacity or approximate size, and venue type from the opened pages.","description":"Each Gangnam venue is verified for location, capacity or approximate size, and venue type from the opened pages.\n\nHow a grader verifies this: Open Gangnam tabs visibly support the extracted location, size/capacity, and type fields, even if some 
capacities are approximate.","max_points":140},{"criterion":"The Gangnam venue set is cross-checked to ensure around 10 distinct suitable venues with sufficient comparison data, and the tabs remain open for review.","description":"The Gangnam venue set is cross-checked to ensure around 10 distinct suitable venues with sufficient comparison data, and the tabs remain open for review.\n\nHow a grader verifies this: Final open-tab set shows distinct Gangnam venues with enough visible information to compare, without obvious duplicates.","max_points":80},{"criterion":"A short comparison report is produced for the Gangnam venues covering name, location, approximate size or capacity, and venue type.","description":"A short comparison report is produced for the Gangnam venues covering name, location, approximate size or capacity, and venue type.\n\nHow a grader verifies this: Final output includes a concise Seoul/Gangnam comparison summary with the required fields for the identified venues.","max_points":100}]}} +{"task_id":"a930fe364d5950d8cdcb74fe1316ecebb7d63009","confirmed_task":"I’m trying to get a genuinely rigorous picture of the current market for AI and machine learning postdoc roles, because I want a shortlist I could actually use for applications instead of a shallow search result dump. Please start in CryptPad Sheets and create a spreadsheet named AI Postdoc Opportunities with columns for university or lab, department or institute or PI, exact position title, research area, deadline or start date if shown, posting link, and a short note saying how you verified it. Then use Google to build a coverage list of roughly the top 30 U.S. computer science schools along with major AI labs and related institutes, and work through that list carefully using official university job boards, department hiring pages, institute sites, lab pages, faculty group pages, and central hiring portals. 
As you find anything promising, open the actual posting page in its own tab and read enough of it to confirm it is really a current postdoctoral research opening in AI, machine learning, or a closely related area, not a faculty search, PhD opening, internship, or industry research scientist role. If you need a second official page like a lab hiring page or institute announcement to confirm it, open that too and leave both tabs available so I can inspect the evidence myself. For schools or labs where nothing turns up, please do a second pass across multiple official sources so the absence is at least reasonably verified. As you go, fill the spreadsheet with every verified opening and make sure each row corresponds to a posting tab you still have open. Near the end, keep the spreadsheet visible along with several of the posting tabs and any supporting hiring pages, because I want visual proof of the search trail, and then add a short summary of the main patterns you noticed and which opportunities look strongest or most relevant.","website":"https://www.google.com","level":"hard","reference_length":7,"categories":["Jobs and Career > Jobs and Employment","Science and Education > Science and Education - Other"],"precomputed_rubric":{"items":[{"criterion":"A spreadsheet titled 'AI Postdoc Opportunities' is created in CryptPad Sheets with the requested columns for capturing openings and verification details.","description":"A spreadsheet titled 'AI Postdoc Opportunities' is created in CryptPad Sheets with the requested columns for capturing openings and verification details.\n\nHow a grader verifies this: Grader can see an open spreadsheet with the exact title and visible headers for university/lab, department/institute/PI, position title, research area, deadline/start date, posting link, and verification note.","max_points":100},{"criterion":"A broad coverage checklist is assembled that includes roughly the top 30 U.S. 
computer science schools plus major AI labs and related institutes.","description":"A broad coverage checklist is assembled that includes roughly the top 30 U.S. computer science schools plus major AI labs and related institutes.\n\nHow a grader verifies this: Grader can see the compiled institution list in the working materials or spreadsheet and confirm it spans top CS universities and major AI labs/institutes rather than only a handful of examples.","max_points":140},{"criterion":"The search is carried out systematically across the checklist using official departmental, institute, lab, faculty, and university hiring sources.","description":"The search is carried out systematically across the checklist using official departmental, institute, lab, faculty, and university hiring sources.\n\nHow a grader verifies this: Browser history/tabs show repeated navigation from Google to official university or lab domains for multiple institutions, demonstrating a broad audit rather than a few isolated searches.","max_points":180},{"criterion":"Each included opportunity is verified on the actual posting page as a current postdoctoral research position relevant to AI/ML, with non-postdoc or irrelevant roles excluded.","description":"Each included opportunity is verified on the actual posting page as a current postdoctoral research position relevant to AI/ML, with non-postdoc or irrelevant roles excluded.\n\nHow a grader verifies this: Open tabs show live posting pages whose titles and text indicate postdoctoral roles; supporting tabs are open where needed to confirm relevance or status.","max_points":220},{"criterion":"Every verified opening is entered in the spreadsheet with all requested fields completed as available, including a direct link and a verification note.","description":"Every verified opening is entered in the spreadsheet with all requested fields completed as available, including a direct link and a verification note.\n\nHow a grader verifies this: Spreadsheet 
rows contain populated fields for each verified role, and the posting links correspond to the open tabs used for verification.","max_points":180},{"criterion":"Institutions with no identified openings receive an additional verification pass using multiple official sources to support the absence conclusion.","description":"Institutions with no identified openings receive an additional verification pass using multiple official sources to support the absence conclusion.\n\nHow a grader verifies this: For at least several no-opening institutions, tabs or notes show checks across more than one official source such as department, institute, lab, or central jobs pages.","max_points":100},{"criterion":"The final workspace remains visually inspectable, with the spreadsheet visible, important posting/supporting tabs left open, and a short summary of patterns and recommendations added.","description":"The final workspace remains visually inspectable, with the spreadsheet visible, important posting/supporting tabs left open, and a short summary of patterns and recommendations added.\n\nHow a grader verifies this: Grader can see the spreadsheet open, multiple source tabs still present, and a visible summary section or note capturing takeaways and strongest opportunities.","max_points":80}]}} +{"task_id":"8464610469e9466cd87449671df0c4e761fa7434","confirmed_task":"I’m daydreaming about doing a full summer KBO baseball trip through South Korea and I want to make it feel like a real, bookable plan instead of just a rough idea. Please start on koreabaseball.com and pull the current KBO schedule, then identify all 10 active KBO stadiums and choose one actual summer game at each stadium, ideally in a route that won’t make me zigzag all over the country. 
Once you have those 10 game dates and matchups, use Google Flights and Google Maps or Google Travel to figure out the cheapest practical way to move between each stop, whether that means flights, trains, buses, or driving, because I want the route to be efficient and budget-conscious. After that, go to Booking.com and find one solid place to stay near each stadium for the corresponding game night, aiming for convenient locations and reasonable prices rather than luxury. Then use Google Search to research what each city is especially known for eating, and use Yelp to turn that into a real food plan for every stop with specific restaurants, markets, or street-food areas I could actually visit around the game. As you do this, please open the actual hotel listings in their own tabs so I can compare photos and map locations, and for at least a couple of the food stops, open the real listing pages so I can visually verify they look active and worth visiting. When it all comes together, put everything into a CryptPad Document with the stadium, city, game date and matchup, travel leg, lodging, food plan, and estimated costs for each stop, and leave the finished doc open so I can review it.","website":"https://www.google.com","level":"hard","reference_length":6,"categories":["Sports > Baseball","Travel and Tourism > Travel and Tourism - Other"],"precomputed_rubric":{"items":[{"criterion":"Retrieve the current KBO schedule and identify 10 distinct KBO stadiums, selecting one real summer game date and matchup for each.","description":"Retrieve the current KBO schedule and identify 10 distinct KBO stadiums, selecting one real summer game date and matchup for each.\n\nHow a grader verifies this: Grader can confirm the chosen games correspond to visible schedule information on koreabaseball.com and that all 10 stadiums are distinct and summer-dated.","max_points":220},{"criterion":"Create a complete ordered route covering all 10 selected stadium stops with the cheapest practical 
transport choice for each leg.","description":"Create a complete ordered route covering all 10 selected stadium stops with the cheapest practical transport choice for each leg.\n\nHow a grader verifies this: Grader can confirm each travel leg connects consecutive selected cities from the itinerary and includes a transport mode with estimated cost/time derived from Google tools.","max_points":200},{"criterion":"Find one accommodation option near each stadium for the corresponding game night and visually verify the chosen listings by opening the actual hotel pages.","description":"Find one accommodation option near each stadium for the corresponding game night and visually verify the chosen listings by opening the actual hotel pages.\n\nHow a grader verifies this: Grader can confirm 10 lodging selections exist, align with the itinerary dates/cities, and that Booking.com listing pages or tabs show real hotel details, photos, and map context.","max_points":180},{"criterion":"Research city-specific Korean food specialties or notable food areas for each stadium city.","description":"Research city-specific Korean food specialties or notable food areas for each stadium city.\n\nHow a grader verifies this: Grader can confirm each city has at least one locally relevant dish, market, food street, or culinary specialty sourced from Google research rather than generic cuisine labels.","max_points":120},{"criterion":"Build a detailed food itinerary for each city with named restaurants, markets, and/or street-food stops that fit the trip schedule, including visible verification on at least some actual Yelp listing pages.","description":"Build a detailed food itinerary for each city with named restaurants, markets, and/or street-food stops that fit the trip schedule, including visible verification on at least some actual Yelp listing pages.\n\nHow a grader verifies this: Grader can confirm each city has concrete food stops tied to the itinerary and that at least a couple of Yelp 
business pages were opened and appear active.","max_points":130},{"criterion":"Compile a coherent final itinerary in CryptPad Document that integrates games, stadiums, travel, accommodations, food plans, and estimated costs into one document.","description":"Compile a coherent final itinerary in CryptPad Document that integrates games, stadiums, travel, accommodations, food plans, and estimated costs into one document.\n\nHow a grader verifies this: Grader can confirm the CryptPad Document contains all 10 stops in order with the required fields and remains open at the end for review.","max_points":150}]}} +{"task_id":"8f005e9f09101dd540f1f666063483931e8faa59","confirmed_task":"I’m helping a family get settled in Raleigh, North Carolina, and I want to line up both healthcare and childcare in one pass so they have real options to review on screen. Please start with Google and figure out three major health insurance plans that are actually relevant in Raleigh, then use the official insurer sites for Aetna, Blue Cross NC, and UnitedHealthcare to open each plan’s consumer overview page in its own tab and also open the matching provider directory or doctor search page in its own tab, because I want to be able to look at both the plan details and the network search later. Once those networks are confirmed, use the insurer directories and actual doctor profile pages to find at least 10 pediatricians in Raleigh who accept one or more of those plans; for every pediatrician you include, open the real profile or listing page in its own tab and verify both the accepted insurance and the clinic location so I can inspect the pages myself. After that, switch to childcare and use Google to find around 10 daycare centers in Raleigh, then open the actual daycare page, official site, or a reputable listing page for each one in its own tab and verify the age range served and the location, because I need to compare realistic options for a family with young kids. 
Please keep all the pediatrician and daycare tabs open as proof, and at the end give me a structured report with three sections for the insurance plans, the pediatricians who take those plans, and the daycare options, plus a short judgment on which insurance plan seems to have the biggest pediatrician network in Raleigh and which daycare centers look the most highly rated.","website":"https://www.google.com","level":"hard","reference_length":9,"categories":["Health > Health - Other","Science and Education > Education","Lifestyle > Childcare"],"precomputed_rubric":{"items":[{"criterion":"Three major insurers relevant to Raleigh are identified and official overview pages are selected for Aetna, Blue Cross NC, and UnitedHealthcare.","description":"Three major insurers relevant to Raleigh are identified and official overview pages are selected for Aetna, Blue Cross NC, and UnitedHealthcare.\n\nHow a grader verifies this: Grader can see official insurer overview tabs open for all three carriers and the report names the three plans/insurers.","max_points":120},{"criterion":"Aetna overview and provider directory are both opened and directory access for Raleigh pediatricians is verified.","description":"Aetna overview and provider directory are both opened and directory access for Raleigh pediatricians is verified.\n\nHow a grader verifies this: Aetna overview tab and Aetna provider search/directory tab are visibly open, with the directory showing a pediatrician search context for Raleigh or equivalent.","max_points":80},{"criterion":"Blue Cross NC overview and provider directory are both opened and directory access for Raleigh pediatricians is verified.","description":"Blue Cross NC overview and provider directory are both opened and directory access for Raleigh pediatricians is verified.\n\nHow a grader verifies this: Blue Cross NC overview tab and provider search/directory tab are visibly open, with pediatrician search capability shown for Raleigh or 
equivalent.","max_points":80},{"criterion":"UnitedHealthcare overview and provider directory are both opened and directory access for Raleigh pediatricians is verified.","description":"UnitedHealthcare overview and provider directory are both opened and directory access for Raleigh pediatricians is verified.\n\nHow a grader verifies this: UnitedHealthcare overview tab and provider search/directory tab are visibly open, with pediatrician search capability shown for Raleigh or equivalent.","max_points":80},{"criterion":"At least 10 pediatricians in Raleigh who accept one or more of the selected plans are identified and verified.","description":"At least 10 pediatricians in Raleigh who accept one or more of the selected plans are identified and verified.\n\nHow a grader verifies this: Final report lists at least 10 pediatricians with accepted insurance and clinic location, and the open tabs show matching doctor profile or listing pages.","max_points":240},{"criterion":"Every listed pediatrician corresponds to an open tab showing an actual doctor profile or listing page.","description":"Every listed pediatrician corresponds to an open tab showing an actual doctor profile or listing page.\n\nHow a grader verifies this: Grader can count open pediatrician tabs and match them to the doctors named in the report.","max_points":100},{"criterion":"Around 10 Raleigh daycare centers are identified with verified location and age range served.","description":"Around 10 Raleigh daycare centers are identified with verified location and age range served.\n\nHow a grader verifies this: Final report lists about 10 daycare centers, each with location and age range, supported by visible daycare listing or official tabs.","max_points":160},{"criterion":"Every listed daycare corresponds to an open tab showing the actual daycare page or reputable listing page.","description":"Every listed daycare corresponds to an open tab showing the actual daycare page or reputable listing page.\n\nHow a 
grader verifies this: Grader can match each daycare named in the report to an open tab on Care.com, Winnie, or an official site.","max_points":60},{"criterion":"A final structured report is produced with the three required sections and a brief comparative summary of pediatrician network size and daycare ratings.","description":"A final structured report is produced with the three required sections and a brief comparative summary of pediatrician network size and daycare ratings.\n\nHow a grader verifies this: Report includes sections for insurance plans, pediatricians, and daycare options, plus a concise conclusion naming the plan with the broadest apparent pediatrician network and the daycare centers that seem most highly rated.","max_points":80}]}} +{"task_id":"b421f308e18fc92b84ed676609e62a2b536b28b7","confirmed_task":"I'm flying from Pittsburgh to a wedding in Palm Springs on the 3rd of next month and I need help putting together a full trip plan in the browser so I can actually see everything. Please start on Google Flights and search for round-trip flights from Pittsburgh to LAX, picking dates so I land at least 2 days before the wedding and get back to Pittsburgh by the 5th, and prioritize non-stop options if they exist — open the best result in its own tab so I can review it. Then do the same search but for Pittsburgh to Palm Springs International Airport (PSP) instead, because flying directly into Palm Springs might save me the drive entirely, and open that result in its own tab too so I can compare the two side by side. Once you've got both flight options, pull up Google Maps and check the drive time from LAX to Palm Springs, because I only want to drive between 9am and 4pm — so if the LAX flight lands too late to make that window, either adjust the flight date or find me a hotel near LAX on Booking.com for an overnight stay before driving, and leave the hotel page open so I can see the price and location. 
If PSP ends up being the better option and skips the drive issue entirely, flag that clearly. After that, search for car rental options at whichever airport makes more sense for the dates I'd need, and open at least one rental listing so I can see the vehicle type and daily rate. Then check whether I could squeeze in a stop at either Soban or Holbox on any of the drives between the airport and Palm Springs — look up both on Google Maps, see how far each detour would add, and recommend which one is actually worth it given the 9am–4pm driving constraint. Finally, open CryptPad and create a new document where you lay out the full day-by-day itinerary covering the chosen flights, the drive or lack thereof, hotel if needed, car rental, the wedding on the 3rd, and the recommended detour stop, and leave the CryptPad doc open so I can edit it later.","website":"https://www.google.com","level":"hard","reference_length":8,"categories":["Travel and Tourism > Air Travel","Travel and Tourism > Accommodation and Hotels","Travel and Tourism > Car Rentals"],"precomputed_rubric":{"items":[{"criterion":"Search Google Flights for round-trip non-stop flights from Pittsburgh to LAX, arriving at least 2 days before the 3rd and returning by the 5th, and open the best result in its own tab.","description":"Search Google Flights for round-trip non-stop flights from Pittsburgh to LAX, arriving at least 2 days before the 3rd and returning by the 5th, and open the best result in its own tab.\n\nHow a grader verifies this: Grader can confirm a Google Flights tab is open showing Pittsburgh to LAX results with correct dates, and the selected option is visible with airline, times, and price.","max_points":150},{"criterion":"Search Google Flights for round-trip flights from Pittsburgh to Palm Springs International Airport (PSP) for the same date constraints, and open the best result in its own tab for comparison.","description":"Search Google Flights for round-trip flights from Pittsburgh to Palm 
Springs International Airport (PSP) for the same date constraints, and open the best result in its own tab for comparison.\n\nHow a grader verifies this: Grader can confirm a Google Flights tab is open showing Pittsburgh to PSP results with correct dates, and the selected option is visible with airline, times, and price.","max_points":150},{"criterion":"Look up the drive time from LAX to Palm Springs on Google Maps and assess whether the LAX flight's landing time allows driving within the 9am–4pm window. If not, either adjust the flight or find a hotel near LAX on Booking.com with the page left open.","description":"Look up the drive time from LAX to Palm Springs on Google Maps and assess whether the LAX flight's landing time allows driving within the 9am–4pm window. If not, either adjust the flight or find a hotel near LAX on Booking.com with the page left open.\n\nHow a grader verifies this: Grader can confirm a Google Maps lookup was performed, the driving window constraint is addressed, and if a hotel is needed, a Booking.com property page is open with price and dates.","max_points":120},{"criterion":"Explicitly compare the LAX vs PSP flight options and flag which airport makes more sense given price, convenience, and the driving constraint.","description":"Explicitly compare the LAX vs PSP flight options and flag which airport makes more sense given price, convenience, and the driving constraint.\n\nHow a grader verifies this: Final response clearly states the trade-offs between LAX and PSP and names the recommended airport with reasoning.","max_points":100},{"criterion":"Find car rental options at the recommended airport for the trip dates, and open at least one rental listing showing vehicle type and daily rate.","description":"Find car rental options at the recommended airport for the trip dates, and open at least one rental listing showing vehicle type and daily rate.\n\nHow a grader verifies this: Grader can confirm a car rental search was performed and 
at least one concrete option is visible with provider, vehicle type, and price.","max_points":100},{"criterion":"Look up both Soban and Holbox on Google Maps, assess detour feasibility on the drives between the airport and Palm Springs within the 9am–4pm window, and recommend one.","description":"Look up both Soban and Holbox on Google Maps, assess detour feasibility on the drives between the airport and Palm Springs within the 9am–4pm window, and recommend one.\n\nHow a grader verifies this: Grader can confirm both locations were searched, detour distances/times are reported, and a clear recommendation is made with reasoning tied to the driving constraint.","max_points":130},{"criterion":"Create a CryptPad document containing the full day-by-day itinerary covering flights, driving, hotel if needed, car rental, the wedding, and the recommended detour, and leave the document open.","description":"Create a CryptPad document containing the full day-by-day itinerary covering flights, driving, hotel if needed, car rental, the wedding, and the recommended detour, and leave the document open.\n\nHow a grader verifies this: Grader can confirm a CryptPad document is open with a structured itinerary that includes all required components.","max_points":150},{"criterion":"Provide a concise final summary naming the chosen flights, airport, car rental, hotel if applicable, and detour recommendation.","description":"Provide a concise final summary naming the chosen flights, airport, car rental, hotel if applicable, and detour recommendation.\n\nHow a grader verifies this: Grader can confirm the final response integrates all components into a coherent trip plan that respects the 2-day-early arrival, return by the 5th, and 9am–4pm driving constraints.","max_points":100}]}} +{"task_id":"1fd26abb3743ca1dfdc648af0fcab2c3a2def6e9","confirmed_task":"I’m moving from Pittsburgh to San Francisco and want to get a realistic side-by-side view of my options before I decide whether to hire 
movers, use a container, or just drive a truck myself. Please start on MovingAPT.com and get me a long-distance estimate for a 1-bedroom apartment move from Pittsburgh, PA to San Francisco, CA, and keep the quote page or results open so I can look at what assumptions they used. Then do the same on International Van Lines for the same 1-bedroom move, because I want at least two full-service mover quotes to compare. After that, check PODS for a container option that would make sense for a 1-bedroom apartment on that same route, and then check U-Pack for the equivalent portable moving setup, making note of whether they’re pricing by container count, trailer space, delivery, or monthly rental. Once those are open, go to U-Haul and price out a one-way 15-foot truck from Pittsburgh to San Francisco, then on U-Haul’s site find the MPG or fuel economy info for that truck so we can estimate the real driving cost. Use Google Maps to pull up the driving route from Pittsburgh, PA to San Francisco, CA and record the mileage, and leave the map visible so I can sanity-check the route distance on screen. I also have State Farm renters insurance, so please look on State Farm’s site to see whether my belongings are covered while they’re in transit during a move or whether I’d probably need separate moving coverage or valuation. After that, check Trustpilot for MovingAPT, International Van Lines, PODS, U-Pack, and U-Haul, and open each company’s Trustpilot page in its own tab so I can visually compare the ratings and review counts. 
In the end, pull everything together into one comparison with the estimated total cost for each option, and for U-Haul please calculate the truck rental plus estimated fuel cost using the route mileage and the truck MPG, so I can see which option is cheapest and which seems safest.","website":"https://www.google.com","level":"hard","reference_length":10,"categories":["Business and Consumer Services > Moving & Relocation"],"precomputed_rubric":{"items":[{"criterion":"A MovingAPT quote or estimate for a 1-bedroom move from Pittsburgh, PA to San Francisco, CA is obtained, with price and visible assumptions or included services captured.","description":"A MovingAPT quote or estimate for a 1-bedroom move from Pittsburgh, PA to San Francisco, CA is obtained, with price and visible assumptions or included services captured.\n\nHow a grader verifies this: Grader can confirm a MovingAPT quote/results page is open or was visited, and the final notes include a price plus assumptions/services shown on that page.","max_points":100},{"criterion":"An International Van Lines quote or estimate for a 1-bedroom move from Pittsburgh, PA to San Francisco, CA is obtained, with price and visible assumptions or included services captured.","description":"An International Van Lines quote or estimate for a 1-bedroom move from Pittsburgh, PA to San Francisco, CA is obtained, with price and visible assumptions or included services captured.\n\nHow a grader verifies this: Grader can confirm an International Van Lines quote/results page is open or was visited, and the final notes include a price plus assumptions/services shown on that page.","max_points":100},{"criterion":"PODS pricing for the route is captured with the major fee structure or assumptions visible on the pricing page.","description":"PODS pricing for the route is captured with the major fee structure or assumptions visible on the pricing page.\n\nHow a grader verifies this: Grader can confirm a PODS pricing page is open or was 
visited, and the response includes the estimated total plus details such as container size, delivery, transport, storage, or rental assumptions.","max_points":90},{"criterion":"U-Pack pricing for the route is captured with the major fee structure or assumptions visible on the quote page.","description":"U-Pack pricing for the route is captured with the major fee structure or assumptions visible on the quote page.\n\nHow a grader verifies this: Grader can confirm a U-Pack quote page is open or was visited, and the response includes the estimated total plus details such as trailer footage, cube count, transit, or related assumptions.","max_points":90},{"criterion":"A one-way U-Haul 15-foot truck rental estimate for Pittsburgh to San Francisco is found and recorded with visible pricing details.","description":"A one-way U-Haul 15-foot truck rental estimate for Pittsburgh to San Francisco is found and recorded with visible pricing details.\n\nHow a grader verifies this: Grader can confirm a U-Haul estimate page is open or was visited, and the response includes the 15-foot truck estimate with base rental and visible fees or truck details.","max_points":100},{"criterion":"The U-Haul 15-foot truck MPG or fuel economy figure used for fuel estimation is correctly captured from U-Haul’s site.","description":"The U-Haul 15-foot truck MPG or fuel economy figure used for fuel estimation is correctly captured from U-Haul’s site.\n\nHow a grader verifies this: Grader can confirm the U-Haul truck info page or specification page was visited, and the response includes the MPG/fuel economy figure tied to the 15-foot truck.","max_points":70},{"criterion":"The Pittsburgh to San Francisco driving distance is obtained from Google Maps and recorded for the fuel calculation.","description":"The Pittsburgh to San Francisco driving distance is obtained from Google Maps and recorded for the fuel calculation.\n\nHow a grader verifies this: Grader can confirm a Google Maps route is visible or 
was visited, and the response includes the route mileage used in the calculation.","max_points":70},{"criterion":"State Farm renters insurance coverage during a move is researched and summarized accurately, including whether separate moving coverage or valuation may be needed.","description":"State Farm renters insurance coverage during a move is researched and summarized accurately, including whether separate moving coverage or valuation may be needed.\n\nHow a grader verifies this: Grader can confirm State Farm pages were visited, and the response includes a coverage conclusion plus caveats or limitations about property in transit.","max_points":100},{"criterion":"Trustpilot review information is collected for MovingAPT, International Van Lines, PODS, U-Pack, and U-Haul, with pages opened in separate tabs for visual comparison.","description":"Trustpilot review information is collected for MovingAPT, International Van Lines, PODS, U-Pack, and U-Haul, with pages opened in separate tabs for visual comparison.\n\nHow a grader verifies this: Grader can confirm Trustpilot pages for all five providers were visited or left open in tabs, and the response includes each provider’s rating and review count or clear review sentiment.","max_points":120},{"criterion":"A complete final comparison is produced covering all five moving options, including estimated total costs, U-Haul total with calculated fuel estimate, Trustpilot review data, and State Farm insurance findings.","description":"A complete final comparison is produced covering all five moving options, including estimated total costs, U-Haul total with calculated fuel estimate, Trustpilot review data, and State Farm insurance findings.\n\nHow a grader verifies this: Grader can confirm the final output includes MovingAPT, International Van Lines, PODS, U-Pack, and U-Haul in one comparison, with U-Haul total derived from rental plus fuel and with review and insurance context included.","max_points":160}]}} 
+{"task_id":"e53065fe786881377e88667a80ccc2edcb321320","confirmed_task":"I’m trying to help my 16-year-old figure out a good summer pre-college option, and I want to do a pretty careful browser-based search across top U.S. universities rather than just rely on a generic list. Please start with Northeast schools first on Google and check places like Harvard, Yale, Princeton, Columbia, Penn, Cornell, Brown, Dartmouth, and MIT for official pre-college or summer programs for high school students. I only want programs that are actually in person on campus, are meant for high school students around age 16, let students take real college-level classes or courses taught by university instructors, and run for less than about 8 weeks total, because I’m trying to find something academically serious but not too long. As you find anything that looks promising, open the actual official program page in its own tab and read it closely to verify those details, then open the official application or admissions page in another tab so I can see what applying would really involve. While you do that, create a spreadsheet called Pre-College Summer Programs in CryptPad Sheets and log every verified match with the university name, program name, location, length, whether it offers college credit or clearly college-level classes, the application deadline if it’s listed, the program-page link, and the application-page link. If you check a school and it doesn’t seem to have a qualifying option, add a quick note for that too so we know it was reviewed. After you’ve covered the Northeast, expand to a few other strong top-30 schools like Stanford, UChicago, Duke, Northwestern, Rice, Vanderbilt, WashU, UCLA, or Berkeley and apply the same standards. 
Please keep the matching program tabs and their application tabs open so I can visually inspect them myself afterward, and when you’re done give me a short summary that highlights the strongest Northeast fits first, with a few especially good non-Northeast options as backups.","website":"https://www.google.com","level":"hard","reference_length":8,"categories":["Science and Education > Education","Science and Education > Universities and Colleges"],"precomputed_rubric":{"items":[{"criterion":"A spreadsheet titled 'Pre-College Summer Programs' is created and used to track both verified matches and schools checked that did not qualify.","description":"A spreadsheet titled 'Pre-College Summer Programs' is created and used to track both verified matches and schools checked that did not qualify.\n\nHow a grader verifies this: Grader can see a spreadsheet with that exact title open in CryptPad Sheets containing entries for matches and non-match notes.","max_points":100},{"criterion":"The search is systematic across top-30 universities with clear emphasis on Northeast schools before expanding to a few strong non-Northeast options.","description":"The search is systematic across top-30 universities with clear emphasis on Northeast schools before expanding to a few strong non-Northeast options.\n\nHow a grader verifies this: Browser history/tabs and spreadsheet entries show Northeast universities were searched first, followed by a smaller set of non-Northeast top universities.","max_points":120},{"criterion":"Northeast programs included as matches are verified on official university pages as in-person, intended for high school students around age 16, offering real college-level classes or courses taught by university instructors, and shorter than about 8 weeks.","description":"Northeast programs included as matches are verified on official university pages as in-person, intended for high school students around age 16, offering real college-level classes or courses taught by 
university instructors, and shorter than about 8 weeks.\n\nHow a grader verifies this: Open official program tabs and spreadsheet notes visibly support each required criterion for the Northeast matches.","max_points":200},{"criterion":"Each qualifying Northeast program has its official application or admissions page opened in a tab and documented with application link and deadline if listed.","description":"Each qualifying Northeast program has its official application or admissions page opened in a tab and documented with application link and deadline if listed.\n\nHow a grader verifies this: For each Northeast match, a corresponding application/admissions tab is open and the spreadsheet contains the application URL plus any visible deadline.","max_points":120},{"criterion":"A few strong non-Northeast top-30 universities are also searched for matching pre-college programs.","description":"A few strong non-Northeast top-30 universities are also searched for matching pre-college programs.\n\nHow a grader verifies this: Tabs and spreadsheet entries show searches and checks for several named non-Northeast universities such as Stanford, UChicago, Duke, Northwestern, Rice, Vanderbilt, WashU, UCLA, or Berkeley.","max_points":80},{"criterion":"Non-Northeast programs included as matches are verified against the same criteria and documented, with official application/admissions pages opened and linked.","description":"Non-Northeast programs included as matches are verified against the same criteria and documented, with official application/admissions pages opened and linked.\n\nHow a grader verifies this: Open official tabs and spreadsheet entries for non-Northeast matches show the same eligibility, format, academic rigor, duration, and application details.","max_points":160},{"criterion":"The spreadsheet contains complete entries for every verified program and quick notes for checked schools that did not fit, and the corresponding program and application tabs remain open 
for visual inspection.","description":"The spreadsheet contains complete entries for every verified program and quick notes for checked schools that did not fit, and the corresponding program and application tabs remain open for visual inspection.\n\nHow a grader verifies this: Spreadsheet rows include university name, program name, location, length, college credit or college-level class info, application deadline if listed, program-page link, and application-page link, while matching browser tabs are still open.","max_points":140},{"criterion":"A concise final summary highlights the strongest Northeast options first and includes the best non-Northeast backups, consistent with the spreadsheet and open tabs.","description":"A concise final summary highlights the strongest Northeast options first and includes the best non-Northeast backups, consistent with the spreadsheet and open tabs.\n\nHow a grader verifies this: Final response prioritizes Northeast fits, references only programs documented in the spreadsheet, and matches the visible open tabs.","max_points":80}]}} +{"task_id":"3bfacd06345631511177bf106f9040300e5875da","confirmed_task":"I’m helping someone look into treatment options for lung cancer in the U.S., and I want a solid browser-based shortlist I can actually review myself afterward. Please go to ClinicalTrials.gov and search for interventional lung cancer studies in the United States that are currently recruiting, then use the site filters so we’re only looking at active recruiting trials that are really relevant. As you find good candidates, open the official ClinicalTrials.gov record for each one in its own tab and keep those tabs open so I can compare them later. I need at least 15 distinct trials, and for each one please verify on the actual trial page that it’s recruiting, note the study phase, identify the treatment or intervention type, and capture the U.S. locations where it’s available. 
Once you’ve gathered the set, create a CryptPad Documents file titled “Lung Cancer Clinical Trials” and record one entry per trial with the trial name, treatment type, trial phase, recruiting status, locations, and the official ClinicalTrials.gov link. After that, add a short summary telling me which treatment approaches seem to come up most often and which cities or hospitals show up most frequently across the location lists. Before you finish, do one last pass to make sure the document has at least 15 complete entries and that each entry still matches an open official trial tab so I have visual proof to review.","website":"https://www.google.com","level":"hard","reference_length":8,"categories":["Health > Medicine","Science and Education > Science and Education - Other"],"precomputed_rubric":{"items":[{"criterion":"ClinicalTrials.gov is searched with filters that limit results to U.S.-based interventional lung cancer studies that are currently recruiting.","description":"ClinicalTrials.gov is searched with filters that limit results to U.S.-based interventional lung cancer studies that are currently recruiting.\n\nHow a grader verifies this: Grader can see the ClinicalTrials.gov results page with relevant search terms and visible recruiting/interventional/U.S. 
filtering applied.","max_points":120},{"criterion":"At least 15 distinct official ClinicalTrials.gov study records are opened in separate tabs and left open.","description":"At least 15 distinct official ClinicalTrials.gov study records are opened in separate tabs and left open.\n\nHow a grader verifies this: Browser shows 15 or more open tabs corresponding to individual ClinicalTrials.gov trial pages, not just search results.","max_points":140},{"criterion":"Each selected trial has its official trial name and recruiting status captured from the official study page, with recruiting status verified as active recruiting.","description":"Each selected trial has its official trial name and recruiting status captured from the official study page, with recruiting status verified as active recruiting.\n\nHow a grader verifies this: Document entries match visible trial titles and recruiting status on the open ClinicalTrials.gov tabs.","max_points":140},{"criterion":"Each selected trial includes the study phase and treatment or intervention type taken from the official record.","description":"Each selected trial includes the study phase and treatment or intervention type taken from the official record.\n\nHow a grader verifies this: For sampled entries, the phase and intervention details in the document match the corresponding fields on the open ClinicalTrials.gov pages.","max_points":120},{"criterion":"Each selected trial includes U.S. recruiting locations from the official record.","description":"Each selected trial includes U.S. 
recruiting locations from the official record.\n\nHow a grader verifies this: For sampled entries, the listed cities or hospitals in the document match the locations section on the corresponding ClinicalTrials.gov pages.","max_points":120},{"criterion":"The final set is validated so every included study is lung cancer related, interventional, currently recruiting, U.S.-based, and complete for all required fields, with any invalid studies replaced.","description":"The final set is validated so every included study is lung cancer related, interventional, currently recruiting, U.S.-based, and complete for all required fields, with any invalid studies replaced.\n\nHow a grader verifies this: Final document contains only qualifying studies, and any replacements correspond to open official tabs that satisfy the criteria.","max_points":140},{"criterion":"A CryptPad Document titled 'Lung Cancer Clinical Trials' is created and contains at least 15 entries with trial name, treatment type, trial phase, recruiting status, locations, and official ClinicalTrials.gov link.","description":"A CryptPad Document titled 'Lung Cancer Clinical Trials' is created and contains at least 15 entries with trial name, treatment type, trial phase, recruiting status, locations, and official ClinicalTrials.gov link.\n\nHow a grader verifies this: CryptPad Document title is visible and the body contains 15 or more complete entries with all required fields and links.","max_points":120},{"criterion":"The document includes a summary of the most common treatment approaches and the most frequent cities or hospitals, and the open official tabs remain available for visual cross-checking.","description":"The document includes a summary of the most common treatment approaches and the most frequent cities or hospitals, and the open official tabs remain available for visual cross-checking.\n\nHow a grader verifies this: A summary section is visible in the CryptPad Document, and the browser still shows the 
official ClinicalTrials.gov tabs open.","max_points":100}]}} +{"task_id":"5d157ce3b5a1d2ecbd01bc29e8b2c0a309971c33","confirmed_task":"I’m helping a friend who’s moving to Stanford for work and wants a realistic shortlist of apartments they could actually consider, so please use Apartments.com to search around Stanford University in Stanford/Palo Alto and keep it to places that look like about a 20-minute commute or less to campus. The budget is pretty specific: for a 1-bedroom, stay under $3,500 a month, and for a 2-bedroom that would work for roommates, stay under $6,000 a month. As you find matches, open the actual listing page for each apartment in its own tab so I can visually compare them later, and make sure each one really shows the rent and bedroom count on the listing itself before you keep it. For commute time, use Google Maps or the map/location details from the listing to estimate how long it would take to get to Stanford University, and only keep the ones that are still roughly within that 20-minute window. I’d like around 20 solid options if possible. Then create a CryptPad Sheets spreadsheet titled Stanford Apartment Options and log each one with the building name or street address, monthly rent, number of bedrooms, estimated commute time to Stanford University, and the direct listing link. 
Once the sheet is filled out, add a short note in the sheet about which nearby neighborhoods seem to have the most within-budget options, and leave the spreadsheet open along with the apartment tabs so I can look through the listings myself.","website":"https://www.google.com","level":"hard","reference_length":7,"categories":["Business and Consumer Services > Real Estate"],"precomputed_rubric":{"items":[{"criterion":"A relevant Apartments.com search near Stanford University/Palo Alto is performed using criteria aligned with 1-bedroom under $3,500 and 2-bedroom under $6,000.","description":"A relevant Apartments.com search near Stanford University/Palo Alto is performed using criteria aligned with 1-bedroom under $3,500 and 2-bedroom under $6,000.\n\nHow a grader verifies this: Grader can see Apartments.com search results or filters reflecting the Stanford/Palo Alto area and the stated bedroom and price constraints.","max_points":140},{"criterion":"Promising apartment listings are opened in separate browser tabs from the search results.","description":"Promising apartment listings are opened in separate browser tabs from the search results.\n\nHow a grader verifies this: Browser shows multiple open listing tabs corresponding to apartment result pages rather than only the search page.","max_points":110},{"criterion":"Each included apartment is verified directly on its listing page for building name/address, rent, and bedroom count, and only qualifying listings are retained.","description":"Each included apartment is verified directly on its listing page for building name/address, rent, and bedroom count, and only qualifying listings are retained.\n\nHow a grader verifies this: Open listing pages visibly display rent and bedroom information matching what is later recorded in the spreadsheet.","max_points":180},{"criterion":"Each included apartment has an approximate commute time to Stanford University checked and is kept only if it is about 20 minutes or 
less.","description":"Each included apartment has an approximate commute time to Stanford University checked and is kept only if it is about 20 minutes or less.\n\nHow a grader verifies this: Google Maps pages, map snippets, or recorded commute values show commute checks tied to the retained listings.","max_points":160},{"criterion":"Around 20 qualifying apartments are collected, and each retained listing still corresponds to an open live listing tab.","description":"Around 20 qualifying apartments are collected, and each retained listing still corresponds to an open live listing tab.\n\nHow a grader verifies this: There are approximately 20 entries retained and the browser still shows the associated apartment tabs open for visual confirmation.","max_points":170},{"criterion":"A CryptPad Sheets spreadsheet titled 'Stanford Apartment Options' is created and includes for each listing the building name/address, monthly rent, bedroom count, estimated commute time, and listing URL.","description":"A CryptPad Sheets spreadsheet titled 'Stanford Apartment Options' is created and includes for each listing the building name/address, monthly rent, bedroom count, estimated commute time, and listing URL.\n\nHow a grader verifies this: Open CryptPad Sheets shows the specified title and rows with all required columns populated for the collected apartments.","max_points":160},{"criterion":"The spreadsheet includes a short summary identifying which nearby neighborhoods appear to have the most within-budget listings, and the sheet remains open for review.","description":"The spreadsheet includes a short summary identifying which nearby neighborhoods appear to have the most within-budget listings, and the sheet remains open for review.\n\nHow a grader verifies this: A visible note or summary section in the sheet names neighborhoods with the strongest concentration of qualifying listings, and the sheet is left open.","max_points":80}]}} 
+{"task_id":"f23a062af7be0d5a28f1dcb1f06cc79a89dd04d6","confirmed_task":"I’m helping a professor who works in natural language processing put together a serious funding list, and I want this to be something they can actually review in the browser afterward. Please start in CryptPad Sheets and create a spreadsheet called NLP Grant Opportunities so we have a clean place to track everything. Then use Google to search for active funding opportunities on official funder sites that are relevant to artificial intelligence, machine learning, computational linguistics, or NLP, focusing on opportunities that university faculty, professors, principal investigators, or academic researchers can apply for. As you find promising results, open the official opportunity page in its own tab, read enough of the page to confirm the call is still active and that academic applicants are eligible, and then record the program name, funding organization, research area or topic, award amount if the page lists one, application deadline, and the official link in the sheet. I need at least 20 distinct verified opportunities, and every row in the sheet should match an official grant page tab that stays open so I can visually review them one by one later. Once you’ve built the list, use the collected set to add a short summary in the sheet about what kinds of funders seem to support the most AI/NLP research—like federal agencies, foundations, nonprofits, or industry-backed research programs—and include your quick take on the strongest opportunities. 
Please leave the spreadsheet open at the end with the official grant tabs still open too.","website":"https://www.google.com","level":"hard","reference_length":7,"categories":["Science and Education > Grants Scholarships and Financial Aid","Computers Electronics and Technology > Programming and Developer Software"],"precomputed_rubric":{"items":[{"criterion":"A CryptPad Sheets document titled 'NLP Grant Opportunities' is created and used as the main workspace.","description":"A CryptPad Sheets document titled 'NLP Grant Opportunities' is created and used as the main workspace.\n\nHow a grader verifies this: Grader can see an open CryptPad Sheets document with the exact title visible in the header/tab.","max_points":80},{"criterion":"The agent performs broad Google searches that target official funding sources relevant to AI, ML, computational linguistics, or NLP and academic eligibility.","description":"The agent performs broad Google searches that target official funding sources relevant to AI, ML, computational linguistics, or NLP and academic eligibility.\n\nHow a grader verifies this: Browser history or open search result pages show multiple relevant Google queries and results pointing to official funder domains.","max_points":120},{"criterion":"Each included opportunity is verified on an official opportunity page as currently active.","description":"Each included opportunity is verified on an official opportunity page as currently active.\n\nHow a grader verifies this: Open tabs show official grant pages with visible status indicators, current cycle language, open call text, or deadlines that demonstrate the opportunity is active.","max_points":180},{"criterion":"Each included opportunity is verified as open to academic researchers, professors, universities, principal investigators, or equivalent academic applicants.","description":"Each included opportunity is verified as open to academic researchers, professors, universities, principal investigators, or 
equivalent academic applicants.\n\nHow a grader verifies this: Official pages or eligibility sections in open tabs visibly mention universities, faculty, academic institutions, PIs, or similar eligible applicant categories.","max_points":160},{"criterion":"At least 20 distinct verified grant opportunities are collected, and each one corresponds to its own open official opportunity tab.","description":"At least 20 distinct verified grant opportunities are collected, and each one corresponds to its own open official opportunity tab.\n\nHow a grader verifies this: Spreadsheet contains at least 20 distinct rows and the browser shows a matching set of official grant tabs left open for review.","max_points":200},{"criterion":"For each verified grant, the spreadsheet includes program name, funding organization, research area or topic, award amount if listed, application deadline, and official opportunity link.","description":"For each verified grant, the spreadsheet includes program name, funding organization, research area or topic, award amount if listed, application deadline, and official opportunity link.\n\nHow a grader verifies this: Rows in the spreadsheet visibly contain all required fields, with links present and award cells filled when the official page lists an amount.","max_points":180},{"criterion":"The spreadsheet includes a summary identifying which types of organizations appear to fund the most AI/NLP research and gives brief recommendations on strong opportunities.","description":"The spreadsheet includes a summary identifying which types of organizations appear to fund the most AI/NLP research and gives brief recommendations on strong opportunities.\n\nHow a grader verifies this: A visible summary section in the sheet describes funder patterns such as federal agencies, foundations, nonprofits, or industry-backed programs and includes recommendation language.","max_points":80}]}} +{"task_id":"47b251d71185920165b7645139ead965cd47441a","confirmed_task":"I'm 
seriously thinking about boarding school for my child for high school, and I want a solid college-prep shortlist I can actually look through myself afterward. Please start on Google and use credible ranking or review sources to identify about 15 to 20 of the strongest U.S. boarding schools with strong academic reputations, then for each school open the actual admissions page in its own tab so I can compare them side by side. As you go, please make sure each school really does offer boarding and is clearly a college-preparatory high school, not just a day school or a specialty program. Then create a CryptPad Sheets spreadsheet called Top Boarding Schools and log each verified school with the school name, city and state, annual boarding tuition or total boarding cost, application deadline if the admissions site lists one, and the direct admissions page link. I also want browser-proof here, so please leave every admissions tab open for the schools you include, and if a tuition or deadline is buried on a separate tuition or apply page, open that page long enough to verify it before recording the number and then keep the admissions tab available. Once the sheet is filled out with around 15 to 20 strong options, add a short summary in the sheet or a companion CryptPad Document about the typical tuition range and where these schools are concentrated geographically, and finish with a brief recommendation note about the most compelling options so I have a practical starting point.","website":"https://www.google.com","level":"hard","reference_length":7,"categories":["Science and Education > Education"],"precomputed_rubric":{"items":[{"criterion":"A credible initial pool of top U.S. college-preparatory boarding schools is identified from authoritative Google search results or ranking sources.","description":"A credible initial pool of top U.S. 
college-preparatory boarding schools is identified from authoritative Google search results or ranking sources.\n\nHow a grader verifies this: Grader can confirm relevant Google results and/or opened source pages showing recognized rankings, reviews, or roundup lists that support the candidate pool.","max_points":140},{"criterion":"Admissions pages are opened in separate tabs for about 15 to 20 promising schools.","description":"Admissions pages are opened in separate tabs for about 15 to 20 promising schools.\n\nHow a grader verifies this: Browser shows roughly 15 to 20 school-domain tabs open on admissions pages, one per included school.","max_points":140},{"criterion":"Each included school is verified to offer boarding and to be a college-preparatory high school.","description":"Each included school is verified to offer boarding and to be a college-preparatory high school.\n\nHow a grader verifies this: Visible content on school admissions, residential life, academics, or about pages confirms boarding availability and college-preparatory secondary education for each included school.","max_points":180},{"criterion":"The required fields are accurately extracted for each verified school: school name, location, annual boarding tuition or total boarding cost, application deadline if listed, and admissions page link.","description":"The required fields are accurately extracted for each verified school: school name, location, annual boarding tuition or total boarding cost, application deadline if listed, and admissions page link.\n\nHow a grader verifies this: Spreadsheet entries match the visible information on the school sites, including cost and deadline values where available and direct admissions URLs.","max_points":220},{"criterion":"A CryptPad Sheets file titled Top Boarding Schools is created and populated in a clear structured format.","description":"A CryptPad Sheets file titled Top Boarding Schools is created and populated in a clear structured 
format.\n\nHow a grader verifies this: CryptPad Sheets shows a spreadsheet with the correct title and a usable table containing the collected school data.","max_points":120},{"criterion":"A final synthesis is added summarizing the typical tuition range, geographic distribution, and brief recommendations.","description":"A final synthesis is added summarizing the typical tuition range, geographic distribution, and brief recommendations.\n\nHow a grader verifies this: The sheet or companion CryptPad Document contains a concise written summary discussing tuition patterns, regional concentration, and standout schools.","max_points":120},{"criterion":"The final set includes about 15 to 20 strong schools and each documented school corresponds to an open admissions tab left available for review.","description":"The final set includes about 15 to 20 strong schools and each documented school corresponds to an open admissions tab left available for review.\n\nHow a grader verifies this: The number of spreadsheet rows aligns with the number of open admissions tabs, and the tabs remain open on the relevant school admissions pages at task end.","max_points":80}]}} +{"task_id":"940d8aaa7700347c9fd9a0508e5de2e07c23cdb5","confirmed_task":"I’m helping a friend figure out housing near MIT in Cambridge, so could you use a real browser to look for apartments that are roughly within a 20-minute commute to MIT and keep this organized for me? Start with Google to find solid rental sites that actually have Cambridge and nearby Boston-area listings, then use places like Apartments.com and any other major listing sources you find to search for either 1-bedroom apartments under $3,000 a month or 2-bedroom apartments under $5,000 a month for a roommate setup. As you find listings that seem to fit, open each actual listing page in its own tab so I can visually compare the photos, addresses, and details later, and only keep tabs open for listings that really match the bedroom and budget limits. 
For every listing you keep, verify the rent, bedroom count, and building name or address on the listing page itself, then use Google Maps to check the commute to MIT and keep only the ones that look to be about 20 minutes or less. After that, create a spreadsheet in CryptPad Sheets called MIT Apartment Options and record about 20 good options with the building or address, monthly rent, number of bedrooms, estimated commute time to MIT, and the direct listing link. Please make sure every row in the sheet corresponds to a listing tab that is still open on the actual apartment page, because I want to be able to click around and inspect them afterward. Once the sheet is filled out, add a short note summarizing which neighborhoods seem to have the most within-budget options so I can see where the best concentration of listings is.","website":"https://www.google.com","level":"hard","reference_length":7,"categories":["Business and Consumer Services > Real Estate"],"precomputed_rubric":{"items":[{"criterion":"Identify suitable apartment listing sources via Google and begin a search focused on rentals near MIT in Cambridge and nearby neighborhoods.","description":"Identify suitable apartment listing sources via Google and begin a search focused on rentals near MIT in Cambridge and nearby neighborhoods.\n\nHow a grader verifies this: Browser history or visible search results show Google used to locate apartment marketplaces relevant to Cambridge/Boston rentals near MIT.","max_points":100},{"criterion":"Open candidate apartment listings from rental sites in separate tabs using the specified bedroom and price constraints.","description":"Open candidate apartment listings from rental sites in separate tabs using the specified bedroom and price constraints.\n\nHow a grader verifies this: Multiple apartment listing tabs are visibly open from rental marketplace sites, and the listings reflect searches for 1-bedroom under $3,000 or 2-bedroom under 
$5,000.","max_points":200},{"criterion":"Verify each kept listing’s rent, bedroom count, building/address, and direct listing URL from the actual listing page, removing non-qualifying options.","description":"Verify each kept listing’s rent, bedroom count, building/address, and direct listing URL from the actual listing page, removing non-qualifying options.\n\nHow a grader verifies this: Open tabs show listing pages with visible rent and bedroom details, and only qualifying listings remain represented in the working set.","max_points":200},{"criterion":"Check commute times to MIT in Google Maps and keep only listings that are roughly within a 20-minute commute.","description":"Check commute times to MIT in Google Maps and keep only listings that are roughly within a 20-minute commute.\n\nHow a grader verifies this: Google Maps routes are used for listing addresses, and the final set reflects commute times at or around 20 minutes or less.","max_points":180},{"criterion":"Create a CryptPad Sheets spreadsheet titled 'MIT Apartment Options' with about 20 qualifying listings and the required columns.","description":"Create a CryptPad Sheets spreadsheet titled 'MIT Apartment Options' with about 20 qualifying listings and the required columns.\n\nHow a grader verifies this: A CryptPad Sheets file with the exact title is visible and contains approximately 20 rows of apartment entries with building/address, rent, bedrooms, commute time, and listing link.","max_points":180},{"criterion":"Ensure each spreadsheet entry corresponds to a currently open tab on that apartment’s actual listing page.","description":"Ensure each spreadsheet entry corresponds to a currently open tab on that apartment’s actual listing page.\n\nHow a grader verifies this: The browser shows open listing tabs matching the entries in the spreadsheet, allowing direct visual cross-checking between rows and tabs.","max_points":80},{"criterion":"Provide a brief summary of which neighborhoods appear to have 
the most qualifying within-budget listings.","description":"Provide a brief summary of which neighborhoods appear to have the most qualifying within-budget listings.\n\nHow a grader verifies this: A visible note or text summary identifies neighborhoods with the highest concentration of qualifying options based on the collected sheet entries.","max_points":60}]}} +{"task_id":"a5724e1c94ac221f0a53765c51f625b7bc3cc58e","confirmed_task":"I’m helping a university researcher who works on AI safety and alignment, and I need a solid list of current funding options they could realistically apply for. Please start on Google and search for active grant programs from official funder sites that support research in artificial intelligence, AI safety, alignment, trustworthy machine learning, or responsible AI, and focus on opportunities where academic researchers or university-based investigators are eligible. As you find promising ones, open the official opportunity page for each grant in its own tab so I can visually compare them later, and only keep it if the page itself makes it clear the opportunity is still active or open rather than archived or closed. I need at least 15 verified opportunities, and for each one please pull the program name, funding organization, research topic or focus area, award size if the page lists one, the application deadline, and the official URL. Once you’ve gathered enough, go to CryptPad Documents and create a document titled AI Safety Grant Opportunities, then put the grants into a clean table or structured list that matches the open tabs one-for-one. After that, add a short summary at the end explaining what kinds of organizations seem to fund AI safety-related research most often—like government agencies, private foundations, nonprofits, industry labs, or academic consortia—because I want to see where the strongest funding patterns are. 
Please leave the CryptPad Document open at the end, and keep the official grant tabs open too so I can spot-check the pages myself.","website":"https://www.google.com","level":"hard","reference_length":9,"categories":["Science and Education > Grants Scholarships and Financial Aid","Computers Electronics and Technology > Computers Electronics and Technology - Other"],"precomputed_rubric":{"items":[{"criterion":"Searches broadly on Google and identifies plausible AI safety, alignment, responsible AI, AI, or machine learning research funding opportunities from official sources.","description":"Searches broadly on Google and identifies plausible AI safety, alignment, responsible AI, AI, or machine learning research funding opportunities from official sources.\n\nHow a grader verifies this: Browser history or visible search results show Google queries and candidate results leading to official funding pages.","max_points":120},{"criterion":"Opens official grant or opportunity pages in separate tabs for candidate opportunities.","description":"Opens official grant or opportunity pages in separate tabs for candidate opportunities.\n\nHow a grader verifies this: Multiple browser tabs are open on official funding domains, each showing a distinct opportunity page.","max_points":100},{"criterion":"Confirms that retained opportunities are active or open rather than expired, archived, or clearly closed.","description":"Confirms that retained opportunities are active or open rather than expired, archived, or clearly closed.\n\nHow a grader verifies this: Visible page text on retained tabs indicates active status, open call language, current cycle information, or upcoming deadlines.","max_points":140},{"criterion":"Verifies that academic researchers, universities, or academic institutions are eligible for each included opportunity.","description":"Verifies that academic researchers, universities, or academic institutions are eligible for each included opportunity.\n\nHow a grader 
verifies this: Eligibility sections on retained tabs mention universities, faculty, investigators, academic institutions, or equivalent academic participation.","max_points":140},{"criterion":"Collects at least 15 verified grant opportunities relevant to AI safety, alignment, AI, machine learning, or responsible AI research.","description":"Collects at least 15 verified grant opportunities relevant to AI safety, alignment, AI, machine learning, or responsible AI research.\n\nHow a grader verifies this: The final CryptPad Document contains 15 or more distinct entries, each corresponding to a verified official grant tab.","max_points":160},{"criterion":"For each included opportunity, records program name, funding organization, research topic or focus area, award size if listed, application deadline, and official link.","description":"For each included opportunity, records program name, funding organization, research topic or focus area, award size if listed, application deadline, and official link.\n\nHow a grader verifies this: Each row or entry in the CryptPad Document includes all required fields, with award size marked only when available on the source page.","max_points":140},{"criterion":"Creates a CryptPad Document titled 'AI Safety Grant Opportunities' containing the compiled grant records.","description":"Creates a CryptPad Document titled 'AI Safety Grant Opportunities' containing the compiled grant records.\n\nHow a grader verifies this: An open CryptPad Document with the exact title is visible and includes the compiled opportunities.","max_points":80},{"criterion":"Ensures each final document entry corresponds to an open tab with the official grant page.","description":"Ensures each final document entry corresponds to an open tab with the official grant page.\n\nHow a grader verifies this: The number and identity of listed opportunities can be matched against open official tabs still visible in the browser.","max_points":60},{"criterion":"Adds a 
concluding summary identifying which types of organizations most frequently fund AI safety-related research among the collected opportunities and highlights key takeaways or recommendations.","description":"Adds a concluding summary identifying which types of organizations most frequently fund AI safety-related research among the collected opportunities and highlights key takeaways or recommendations.\n\nHow a grader verifies this: The CryptPad Document ends with a written summary discussing organization categories and observed funding patterns.","max_points":60}]}} +{"task_id":"e96aa77ab19737990cfa7a4da23533f2b0a0de92","confirmed_task":"I’m trying to put together a solid shortlist of the best hospitals in Texas for cardiac surgery for a family reference, so could you research this in a real browser and keep it grounded in actual hospital program pages and recognizable rankings? Start on Google and look for authoritative sources that would help identify strong Texas heart surgery centers, like U.S. News, Healthgrades, Leapfrog, CMS-related quality pages, or major hospital recognition pages, because I want the final list to be based on visible quality signals rather than guesswork. Then use U.S. News and those other quality indicators to narrow it down to the top 10 Texas hospitals for cardiac care or heart surgery. For each hospital you choose, open the actual cardiac surgery, heart surgery, or heart and vascular program page in its own tab and make sure the page clearly shows they offer advanced heart surgery services like CABG, valve repair or replacement, aortic surgery, or similar procedures. After that, create a CryptPad Sheets spreadsheet titled Top Texas Cardiac Hospitals and enter one row per hospital with the hospital name, city, a short description of the cardiac surgery program, whether it appears in rankings or quality indicators, and the direct link to the program page. 
Please leave all 10 hospital program tabs open so I can visually compare them later, and also keep the spreadsheet open in another tab. Once the sheet is filled out, add a short written summary in the sheet or a companion CryptPad Document explaining which Texas cities seem to have the strongest cardiac surgery centers based on how many top hospitals show up there and how prominent they are, so I can quickly see the main patterns and your top recommendations.","website":"https://www.google.com","level":"hard","reference_length":7,"categories":["Health > Medicine","Health > Health - Other"],"precomputed_rubric":{"items":[{"criterion":"Authoritative ranking or quality indicator sources relevant to Texas cardiac care hospitals are identified and used as the basis for selection.","description":"Authoritative ranking or quality indicator sources relevant to Texas cardiac care hospitals are identified and used as the basis for selection.\n\nHow a grader verifies this: Grader can confirm from browser history, open tabs, or notes that Google results and recognized sources such as U.S. 
News, Healthgrades, Leapfrog, CMS-related pages, or comparable quality sources were consulted.","max_points":140},{"criterion":"A final set of exactly 10 Texas hospitals is selected based on visible ranking presence or quality indicators for cardiac care or heart surgery.","description":"A final set of exactly 10 Texas hospitals is selected based on visible ranking presence or quality indicators for cardiac care or heart surgery.\n\nHow a grader verifies this: Grader can count exactly 10 hospitals in the final sheet and see that each one has some ranking or quality-indicator notation tied to the researched sources.","max_points":180},{"criterion":"Each selected hospital has its cardiac surgery, heart surgery, or heart and vascular program page opened in a separate browser tab.","description":"Each selected hospital has its cardiac surgery, heart surgery, or heart and vascular program page opened in a separate browser tab.\n\nHow a grader verifies this: Grader can visually confirm 10 distinct hospital program tabs are open, each corresponding to one hospital listed in the spreadsheet.","max_points":140},{"criterion":"For each selected hospital, the agent verifies that advanced heart surgery services are offered.","description":"For each selected hospital, the agent verifies that advanced heart surgery services are offered.\n\nHow a grader verifies this: Grader can inspect the open hospital pages and see explicit references to advanced cardiac surgery services such as CABG, valve procedures, aortic surgery, or equivalent surgical offerings.","max_points":160},{"criterion":"For each of the 10 hospitals, the required fields are accurately captured: hospital name, city, cardiac program description, ranking or quality appearance, and program page link.","description":"For each of the 10 hospitals, the required fields are accurately captured: hospital name, city, cardiac program description, ranking or quality appearance, and program page link.\n\nHow a grader verifies 
this: Grader can compare the spreadsheet rows against the open hospital tabs and ranking sources to confirm all five fields are present and consistent for all 10 entries.","max_points":180},{"criterion":"A CryptPad Sheets spreadsheet titled 'Top Texas Cardiac Hospitals' is created and populated with the 10 hospital records.","description":"A CryptPad Sheets spreadsheet titled 'Top Texas Cardiac Hospitals' is created and populated with the 10 hospital records.\n\nHow a grader verifies this: Grader can see an open CryptPad Sheets tab with the exact title and a structured table containing 10 rows of hospital data.","max_points":120},{"criterion":"The final output includes a concise summary identifying which Texas cities appear to have the strongest cardiac surgery centers and the spreadsheet and hospital tabs remain open for visual review.","description":"The final output includes a concise summary identifying which Texas cities appear to have the strongest cardiac surgery centers and the spreadsheet and hospital tabs remain open for visual review.\n\nHow a grader verifies this: Grader can see the summary text in the sheet or companion CryptPad Document and confirm the spreadsheet tab plus the hospital program tabs are still open.","max_points":80}]}} +{"task_id":"ca5c6ddf8b347ee0935c6044fe65cd182e4fb26c","confirmed_task":"I’m trying to piece together a pretty complicated trip and want your help doing it in the browser so I can actually see the options. On Google Flights, please start with an early-December flight from Pittsburgh to Hawaii, using a real Hawaii destination like Honolulu if that gives the best deal, because I want to break up the trip with a few days there before heading to Australia; I’d prefer a morning or late-night departure from Pittsburgh if possible, and since there are no direct flights, find me a reasonable connecting itinerary and open the best option in its own tab so I can look at the timing and price. 
Once you’ve got those Hawaii dates, go to Booking.com and find a good-value resort for 2 adults in Hawaii for about 3 nights that fits those flight dates, ideally somewhere well-reviewed near the beach with a private room and free cancellation if available, and open the actual property page with photos and map view so I can judge whether it feels worth it. After that, go back to Google Flights and look for a Hawaii-to-Sydney flight that leaves in the morning or at night, not the afternoon, using the Hawaii stay you picked to set the departure date; find a reasonable option and keep that result open in a separate tab too. Then on Booking.com, find me a 1-week stay in Sydney for 2 adults that’s close to the Sydney Opera House, ideally walkable or clearly nearby on the map, and open the listing page plus the map so I can verify the location myself. Once that’s set, use Google Flights again to find a Sydney-to-Tokyo flight after the Sydney stay, with only morning or night departures, and pick a reasonable option that keeps the trip flowing logically. 
Finally, put everything into a CryptPad Document with the flights, hotels, dates, times, airports, nightly or total lodging costs, and a full trip total, and leave the doc open along with the key tabs for the Hawaii resort and Sydney hotel so I can review the visual details.","website":"https://www.google.com","level":"hard","reference_length":6,"categories":["Travel and Tourism > Air Travel","Travel and Tourism > Accommodation and Hotels","Travel and Tourism > Car Rentals"],"precomputed_rubric":{"items":[{"criterion":"Identify at least one viable early-December Pittsburgh-to-Hawaii connecting flight itinerary, including airline(s), departure and arrival airports, dates, times, and total price, with preference given to morning or late-night departure from Pittsburgh when available.","description":"Identify at least one viable early-December Pittsburgh-to-Hawaii connecting flight itinerary, including airline(s), departure and arrival airports, dates, times, and total price, with preference given to morning or late-night departure from Pittsburgh when available.\n\nHow a grader verifies this: Grader can confirm a Google Flights results/details tab is open showing a PIT to Hawaii itinerary with connection(s), visible dates, times, airports, airline(s), and fare.","max_points":180},{"criterion":"Select a Hawaii resort for 2 adults for about 3 nights that aligns with the chosen Hawaii stopover dates, including property name, occupancy/room details, nightly or total cost, and location/value characteristics.","description":"Select a Hawaii resort for 2 adults for about 3 nights that aligns with the chosen Hawaii stopover dates, including property name, occupancy/room details, nightly or total cost, and location/value characteristics.\n\nHow a grader verifies this: Grader can confirm a Booking.com property page is open with matching dates, 2-adult occupancy, resort details, visible price, and map or photo evidence.","max_points":160},{"criterion":"Find at least one 
Hawaii-to-Sydney flight option that departs in the morning or at night and avoids afternoon departure, with departure airport, date, departure time, arrival time, airline(s), route, and total price.","description":"Find at least one Hawaii-to-Sydney flight option that departs in the morning or at night and avoids afternoon departure, with departure airport, date, departure time, arrival time, airline(s), route, and total price.\n\nHow a grader verifies this: Grader can confirm a Google Flights tab is open for Hawaii to Sydney showing the selected itinerary and visible departure time outside the afternoon window.","max_points":180},{"criterion":"Select accommodation in Sydney for 2 adults for one week that is close to the Sydney Opera House, including property name, room/occupancy details, dates, total cost, and clear proximity information.","description":"Select accommodation in Sydney for 2 adults for one week that is close to the Sydney Opera House, including property name, room/occupancy details, dates, total cost, and clear proximity information.\n\nHow a grader verifies this: Grader can confirm a Booking.com listing and map view are open showing the property location relative to the Sydney Opera House, along with dates and pricing.","max_points":160},{"criterion":"Find at least one reasonable Sydney-to-Tokyo flight option after the Sydney stay with morning or night departure only, including date, departure time, arrival time, airline(s), route, and total price.","description":"Find at least one reasonable Sydney-to-Tokyo flight option after the Sydney stay with morning or night departure only, including date, departure time, arrival time, airline(s), route, and total price.\n\nHow a grader verifies this: Grader can confirm a Google Flights result is shown for Sydney to Tokyo with a visible departure time that is morning or night, plus fare and route details.","max_points":140},{"criterion":"Compile a complete multi-city itinerary in CryptPad Document covering 
Pittsburgh to Hawaii, Hawaii stay, Hawaii to Sydney, Sydney stay, and Sydney to Tokyo, with dates in chronological order, itemized costs, and an overall total.","description":"Compile a complete multi-city itinerary in CryptPad Document covering Pittsburgh to Hawaii, Hawaii stay, Hawaii to Sydney, Sydney stay, and Sydney to Tokyo, with dates in chronological order, itemized costs, and an overall total.\n\nHow a grader verifies this: Grader can confirm a CryptPad Document is open containing all five trip components with dates, times, airports/properties, prices, and a summed total.","max_points":180}]}} +{"task_id":"9ad01a4a4bda2e8df7489c9831931b044c646a20","confirmed_task":"I’m trying to get a realistic shortlist of shoulder surgeons in Chicago because I may need surgery for a rotator cuff or labrum issue, and I want something more trustworthy than random review sites. Please start on Google and search for Chicago orthopedic surgeons who clearly specialize in shoulder surgery, especially rotator cuff repair or labrum repair, and use official hospital or orthopedic practice profile pages as the main sources. As you find strong candidates, open each surgeon’s official profile in its own tab so I can visually compare them later, and only keep people whose actual profile page clearly mentions shoulder surgery, shoulder conditions, rotator cuff repair, labrum repair, sports medicine with shoulder focus, or similar shoulder-specific procedures. While you work, create a CryptPad Sheets spreadsheet called Top Shoulder Surgeons Chicago and track the finalists there with columns for surgeon name, hospital or medical center affiliation, specialty focus, the exact confirmation that shoulder surgery is listed, and the link to the profile page. 
From the verified candidates, narrow it to the top 10 Chicago surgeons who seem especially strong for shoulder surgery based on what you can see on their official profiles, like shoulder specialization, fellowship training, leadership roles, sports medicine focus, or detailed shoulder procedure listings. Please leave all 10 profile tabs open so I can inspect the pages myself, then finish the sheet with a short summary of which hospitals or orthopedic centers show up most often among the 10 specialists.","website":"https://www.google.com","level":"hard","reference_length":6,"categories":["Health > Medicine","Health > Health - Other"],"precomputed_rubric":{"items":[{"criterion":"A CryptPad Sheets spreadsheet titled 'Top Shoulder Surgeons Chicago' is created and used as the workspace.","description":"A CryptPad Sheets spreadsheet titled 'Top Shoulder Surgeons Chicago' is created and used as the workspace.\n\nHow a grader verifies this: Grader can see a spreadsheet with the exact title open in CryptPad Sheets.","max_points":100},{"criterion":"Research is conducted on Google using official hospital or orthopedic practice sources to build a relevant Chicago candidate pool for shoulder surgery, rotator cuff repair, or labrum repair.","description":"Research is conducted on Google using official hospital or orthopedic practice sources to build a relevant Chicago candidate pool for shoulder surgery, rotator cuff repair, or labrum repair.\n\nHow a grader verifies this: Browser history/tabs show Google searches and resulting official physician or hospital profile pages relevant to Chicago shoulder specialists.","max_points":150},{"criterion":"Official profile pages are opened in separate tabs and each selected surgeon is verified from the page itself as performing shoulder surgery or treating shoulder-specific conditions/procedures.","description":"Official profile pages are opened in separate tabs and each selected surgeon is verified from the page itself as performing 
shoulder surgery or treating shoulder-specific conditions/procedures.\n\nHow a grader verifies this: Open tabs display official surgeon profile pages, and visible page text confirms shoulder surgery, shoulder conditions, rotator cuff repair, labrum repair, or equivalent shoulder-focused treatment.","max_points":250},{"criterion":"Exactly 10 Chicago surgeons are selected as the top specialists based on evidence visible on their official profiles.","description":"Exactly 10 Chicago surgeons are selected as the top specialists based on evidence visible on their official profiles.\n\nHow a grader verifies this: Spreadsheet contains exactly 10 surgeon entries, each corresponding to a Chicago-based surgeon supported by an official profile tab.","max_points":200},{"criterion":"Each of the 10 spreadsheet entries includes surgeon name, hospital or medical center affiliation, specialty focus, explicit confirmation that shoulder surgery is listed, and the profile link.","description":"Each of the 10 spreadsheet entries includes surgeon name, hospital or medical center affiliation, specialty focus, explicit confirmation that shoulder surgery is listed, and the profile link.\n\nHow a grader verifies this: Each row in the spreadsheet has all required fields populated with usable links and shoulder-specific confirmation text.","max_points":200},{"criterion":"Every surgeon listed in the spreadsheet has a corresponding official profile tab left open, and the sheet includes a brief summary of which hospitals or orthopedic centers appear most frequently among the top 10.","description":"Every surgeon listed in the spreadsheet has a corresponding official profile tab left open, and the sheet includes a brief summary of which hospitals or orthopedic centers appear most frequently among the top 10.\n\nHow a grader verifies this: There are 10 matching open profile tabs for the 10 listed surgeons, and the spreadsheet contains a written frequency summary of recurring hospitals or 
orthopedic centers.","max_points":100}]}} +{"task_id":"e1b99d1777a0aa911745b7ca02ba94ef10d7d45b","confirmed_task":"I’m helping a high school student who’s pretty serious about engineering and wants an on-campus summer program that actually feels academic, not just a general camp, so could you use Google to find university-hosted engineering summer programs for high school students that are in person, take place on the university campus, run for less than 6 weeks, and involve college-level or clearly advanced coursework. As you find promising ones, open each actual university program page in its own tab and verify from the page itself that it really is for high school students, that it’s in person, and that the length fits; I want at least 12 that genuinely match. Then create a CryptPad Sheets spreadsheet called Engineering Summer Programs and record, for each one, the university, program name, program length, subject focus, application deadline if it’s listed, and the direct link to the program page. Please keep the tabs for all qualifying programs open so I can visually compare the pages afterward, and if a page has photos or campus details visible, open the actual listing rather than a summary page so I can see that it’s a real campus-based program. Once the sheet is filled in, add a short note at the bottom about which U.S. 
regions seem to have the most engineering summer programs based on the set you found, just so I have a quick sense of where the strongest concentration is.","website":"https://www.google.com","level":"hard","reference_length":5,"categories":["Science and Education > Education","Science and Education > Universities and Colleges","Heavy Industry and Engineering > Heavy Industry and Engineering - Other"],"precomputed_rubric":{"items":[{"criterion":"A broad but relevant candidate pool of university-hosted engineering summer programs for high school students is gathered through Google Search.","description":"A broad but relevant candidate pool of university-hosted engineering summer programs for high school students is gathered through Google Search.\n\nHow a grader verifies this: Grader can confirm Google was used to surface multiple university program candidates relevant to the stated constraints before verification on university sites.","max_points":140},{"criterion":"At least 12 programs are verified on actual university program pages as meeting all required constraints: high school audience, engineering-focused, in person on a university campus, less than 6 weeks long, and involving college-level or clearly advanced coursework.","description":"At least 12 programs are verified on actual university program pages as meeting all required constraints: high school audience, engineering-focused, in person on a university campus, less than 6 weeks long, and involving college-level or clearly advanced coursework.\n\nHow a grader verifies this: Grader can confirm at least 12 open university tabs or corresponding entries tied to live program pages showing the qualifying details on-page, including evidence of college-level or advanced academic content.","max_points":340},{"criterion":"For each qualifying program, the university, program name, program length, subject focus, application deadline if listed, and direct program link are accurately extracted.","description":"For 
each qualifying program, the university, program name, program length, subject focus, application deadline if listed, and direct program link are accurately extracted.\n\nHow a grader verifies this: Grader can compare spreadsheet entries against the open program tabs and confirm the required fields are present and consistent with the source pages.","max_points":240},{"criterion":"A CryptPad Sheets spreadsheet titled 'Engineering Summer Programs' is created and populated with at least 12 qualifying program entries in a clear structured format.","description":"A CryptPad Sheets spreadsheet titled 'Engineering Summer Programs' is created and populated with at least 12 qualifying program entries in a clear structured format.\n\nHow a grader verifies this: Grader can see a CryptPad Sheet with the exact title and at least 12 rows of program data organized into usable columns.","max_points":180},{"criterion":"The spreadsheet includes a brief summary stating which U.S. regions appear to have the most engineering summer programs based on the identified set.","description":"The spreadsheet includes a brief summary stating which U.S. regions appear to have the most engineering summer programs based on the identified set.\n\nHow a grader verifies this: Grader can see a written summary note in the sheet that references regional distribution derived from the collected programs.","max_points":100}]}} +{"task_id":"bacbe73cdb06541360047d8c90677f7d569172bd","confirmed_task":"I want to do market research on the most popular cafes in Singapore. Analyse the menus of the top 10 cafes in singapore (by Google reviews/ratings), and make sure we include at least 1 from the North/South/East/West/Central regions of Singapore. Keep the relevant pages of each cafe open, and summarise their pricing, menu offerings, unique selling points, making sure to reference which tab is opened for each cafe. 
For each cafe, also help me figure out how long it would take to get to it from Tampines MRT, and include this in your final summary.","website":"https://www.google.com","level":"hard","reference_length":5,"categories":["Food and Drink > Restaurants and Delivery","Food and Drink > Beverages"],"precomputed_rubric":{"items":[{"criterion":"The top 10 most popular cafes in Singapore are identified using Google reviews/ratings, with evidence of their popularity (e.g. star ratings, review counts).","description":"The top 10 most popular cafes in Singapore are identified using Google reviews/ratings, with evidence of their popularity (e.g. star ratings, review counts).\n\nHow a grader verifies this: Grader can confirm that 10 cafes are listed with Google review ratings or review counts as evidence of ranking.","max_points":120},{"criterion":"At least 1 cafe from each of the 5 Singapore regions (North, South, East, West, Central) is included in the selection.","description":"At least 1 cafe from each of the 5 Singapore regions (North, South, East, West, Central) is included in the selection.\n\nHow a grader verifies this: Grader can confirm at least 5 distinct regions are represented with at least 1 cafe each, and the region assignment is geographically accurate.","max_points":140},{"criterion":"The relevant menu or information page for each of the 10 cafes is kept open in a separate tab, with each tab clearly referenced in the summary.","description":"The relevant menu or information page for each of the 10 cafes is kept open in a separate tab, with each tab clearly referenced in the summary.\n\nHow a grader verifies this: Grader can see 10 open tabs corresponding to the 10 cafes, and the summary text references which tab belongs to which cafe.","max_points":160},{"criterion":"Pricing information is summarised for each cafe, including specific menu item prices or price ranges.","description":"Pricing information is summarised for each cafe, including specific menu item 
prices or price ranges.\n\nHow a grader verifies this: Grader can confirm each cafe entry includes concrete pricing data (not just vague descriptors) sourced from the open menu pages.","max_points":160},{"criterion":"Menu offerings and unique selling points are summarised for each cafe.","description":"Menu offerings and unique selling points are summarised for each cafe.\n\nHow a grader verifies this: Grader can confirm each cafe entry includes a description of menu highlights and at least one unique selling point or differentiator.","max_points":160},{"criterion":"Travel time from Tampines MRT to each of the 10 cafes is calculated and included in the final summary.","description":"Travel time from Tampines MRT to each of the 10 cafes is calculated and included in the final summary.\n\nHow a grader verifies this: Grader can confirm each cafe entry includes an estimated travel time from Tampines MRT with the transport mode indicated (e.g. public transit, driving).","max_points":140},{"criterion":"A final structured summary combines all information (cafe name, region, pricing, menu highlights, USPs, tab reference, travel time) in a clear format.","description":"A final structured summary combines all information (cafe name, region, pricing, menu highlights, USPs, tab reference, travel time) in a clear format.\n\nHow a grader verifies this: Grader can see a complete summary or table that consolidates all required fields for all 10 cafes in an organised, readable format.","max_points":120}]}} +{"task_id":"543918a53f9196e0f77783e1dc4a9db90ebc6eb9","confirmed_task":"I want to develop the best banana bread recipe. Look up the top 10 recipes online (by engagement, popularity, reviews) and compare the recipes (e.g. composition of ingredients, additions, cooking method), identifying and highlighting similarities and unique points that make each recipe good. 
Keep the most unique or highly reviewed 3 recipes in open tabs so I can reference them, and make sure at least one has a YouTube video (also keep this video open and start playing it). Then, from these three, create the best recipe you can combining aspects of these and provide me with step by step instructions.","website":"https://www.google.com","level":"hard","reference_length":5,"categories":["Food and Drink > Cooking and Recipes"],"precomputed_rubric":{"items":[{"criterion":"The top 10 banana bread recipes online are identified and ranked by engagement, popularity, or reviews, with evidence of their ranking.","description":"The top 10 banana bread recipes online are identified and ranked by engagement, popularity, or reviews, with evidence of their ranking.\n\nHow a grader verifies this: Grader can confirm 10 recipes are listed with source URLs and evidence of popularity such as review counts, star ratings, or engagement metrics.","max_points":120},{"criterion":"Ingredient compositions, additions, and cooking methods are extracted and compared across all 10 recipes, identifying similarities and unique differentiators.","description":"Ingredient compositions, additions, and cooking methods are extracted and compared across all 10 recipes, identifying similarities and unique differentiators.\n\nHow a grader verifies this: Grader can confirm a comparison analysis is provided that highlights common base ingredients across recipes and calls out unique additions or techniques for each.","max_points":200},{"criterion":"The 3 most unique or highly reviewed recipes are selected and kept open in separate tabs for reference.","description":"The 3 most unique or highly reviewed recipes are selected and kept open in separate tabs for reference.\n\nHow a grader verifies this: Grader can see 3 recipe tabs open and the selection rationale (uniqueness or review quality) is explained.","max_points":160},{"criterion":"At least one of the 3 selected recipes has an associated 
YouTube video that is opened in a tab and started playing.","description":"At least one of the 3 selected recipes has an associated YouTube video that is opened in a tab and started playing.\n\nHow a grader verifies this: Grader can confirm a YouTube video tab is open and playing for at least one of the selected recipes.","max_points":140},{"criterion":"A combined best banana bread recipe is created drawing from the strengths of the top 3 selected recipes, with a complete ingredient list and step-by-step instructions.","description":"A combined best banana bread recipe is created drawing from the strengths of the top 3 selected recipes, with a complete ingredient list and step-by-step instructions.\n\nHow a grader verifies this: Grader can see a full recipe with numbered steps and a complete ingredient list, with clear attribution of which elements were drawn from which source recipes.","max_points":220},{"criterion":"The combined recipe explains why specific elements were chosen from each source recipe.","description":"The combined recipe explains why specific elements were chosen from each source recipe.\n\nHow a grader verifies this: Grader can confirm the final recipe includes reasoning for ingredient or method choices tied back to the comparison analysis.","max_points":160}]}} +{"task_id":"a89f1d8d1b3d2b74d784c82e09c2511e7e1587fd","confirmed_task":"I’m trying to buy a genuinely good budget suitcase for a future trip, and I don’t want a fake bargain that only looks good because one eBay listing has stars. Start on eBay with suitcase listings under $100, but then broaden into a real shopping decision: find 10 total live suitcase options under $100 from public product or listing pages across eBay and other major public retailers or marketplaces if needed, with at least 4 of the options coming from eBay. 
Keep this focused on actual suitcases rather than backpacks or duffels, and prefer recognizable luggage brands when they fit the budget, but include lesser-known brands too if the value looks strong. For each option, compare the visible price, estimated shipping if shown, size category, dimensions, weight, hard-side versus soft-side, wheel configuration, visible customer rating and review count, and return policy or seller/retailer protection if that is shown; if any field is missing, just mark it as not shown. I also want you to sanity-check quality instead of trusting one marketplace blindly, so for the most promising 5 options, look for the same model or a closely matching product page on another public site to verify specs or to see whether the price and review story still look good. As you work, keep the strongest 3 candidate product pages open, plus at least 2 comparison or evidence tabs that helped you rule things in or out, so I can inspect the reasoning visually. At the end, recommend exactly 1 best overall suitcase under $100, exactly 2 runner-up picks for different priorities like lowest cost or best reviews, and briefly explain the tradeoffs that made you choose them.","website":"https://www.ebay.com","level":"hard","reference_length":7,"precomputed_rubric":{"items":[{"criterion":"The browsing session identifies 10 total live suitcase options under $100 from public product or listing pages, with at least 4 of those options coming from eBay.","description":"The browsing session identifies 10 total live suitcase options under $100 from public product or listing pages, with at least 4 of those options coming from eBay.\n\nHow a grader verifies this: Count the recorded options and verify that all are suitcases priced under $100 on public pages, and that at least 4 are sourced from eBay pages.","max_points":180},{"criterion":"Each of the 10 options includes the requested comparison fields: visible price, estimated shipping if shown, size category, 
dimensions, weight, hard-side versus soft-side, wheel configuration, visible customer rating and review count, and return policy or seller/retailer protection, with missing fields marked as not shown.","description":"Each of the 10 options includes the requested comparison fields: visible price, estimated shipping if shown, size category, dimensions, weight, hard-side versus soft-side, wheel configuration, visible customer rating and review count, and return policy or seller/retailer protection, with missing fields marked as not shown.\n\nHow a grader verifies this: Inspect the final comparison output and confirm that every option has all requested fields populated or explicitly marked not shown where unavailable.","max_points":220},{"criterion":"The set stays focused on actual suitcases rather than backpacks or duffels, and it prefers recognizable luggage brands when they fit the budget while still allowing lesser-known brands when the value appears strong.","description":"The set stays focused on actual suitcases rather than backpacks or duffels, and it prefers recognizable luggage brands when they fit the budget while still allowing lesser-known brands when the value appears strong.\n\nHow a grader verifies this: Review the 10 selected products and confirm they are suitcases; check that the mix includes recognizable luggage brands where available within budget rather than only generic items.","max_points":100},{"criterion":"For the most promising 5 options, the session finds the same model or a closely matching product page on another public site to verify specs or compare whether the price and review story still hold up.","description":"For the most promising 5 options, the session finds the same model or a closely matching product page on another public site to verify specs or compare whether the price and review story still hold up.\n\nHow a grader verifies this: Confirm that 5 shortlisted options each have an additional public cross-site verification page or 
closely matching page used for spec, price, or review sanity-checking.","max_points":180},{"criterion":"The browser is left with the strongest 3 candidate product pages open, plus at least 2 comparison or evidence tabs that were used to support or reject options.","description":"The browser is left with the strongest 3 candidate product pages open, plus at least 2 comparison or evidence tabs that were used to support or reject options.\n\nHow a grader verifies this: Check the open tabs at the end for 3 finalist product pages and at least 2 additional evidence or comparison pages relevant to the decision.","max_points":120},{"criterion":"The final recommendation names exactly 1 best overall suitcase under $100 and exactly 2 runner-up picks for different priorities such as lowest cost or best reviews.","description":"The final recommendation names exactly 1 best overall suitcase under $100 and exactly 2 runner-up picks for different priorities such as lowest cost or best reviews.\n\nHow a grader verifies this: Inspect the final summary and confirm there is one clearly labeled best overall pick and two clearly labeled runner-up picks with distinct priorities.","max_points":100},{"criterion":"The final summary briefly explains the tradeoffs behind the best overall pick and the two runner-up picks using the compared evidence.","description":"The final summary briefly explains the tradeoffs behind the best overall pick and the two runner-up picks using the compared evidence.\n\nHow a grader verifies this: Check that the concluding recommendation includes concise reasoning tied to factors like total cost, features, review strength, protection/returns, or spec verification.","max_points":100}]}} +{"task_id":"48a6cb73b8d54934ae1ad3d50abdf17a4dcd6b42","confirmed_task":"I want to file a Virginia Department of Education public-records request, but before we submit anything I want you to help me figure out what request is actually worth filing and how to phrase it so it is 
specific, lower-cost, and more likely to succeed. Start by checking the VDOE FOIA page and the VDOE NextRequest portal so we understand the real submission options, response timeline, fee rules, and the main exemptions or privacy limits that could matter. Then search the public VDOE site to see whether likely records are already posted, because I do not want to waste a FOIA request on material that is already public. After that, identify 4 plausible VDOE record-request angles that an ordinary requester could credibly pursue from public clues on the site, and for each one note what page suggests the records exist, what office or records custodian seems most relevant, what narrowing choices could reduce fees, and any obvious exemption or FERPA/privacy problem. From those 4, choose the 2 strongest candidates and draft a complete FOIA request for each with a clear date range, reasonably specific record description, preferred electronic delivery, and a request for a cost estimate before any chargeable work proceeds. Keep the most useful evidence tabs open, including the VDOE FOIA page, the NextRequest new-request page, and the key public pages that support the 2 finalist request ideas. Finally, carry the better of the 2 drafts through the VDOE NextRequest workflow to the final submission or confirmation step if that can be done on public pages, and leave that page open. End with a concise comparison memo explaining which draft you chose, why it looks like the best balance of usefulness versus fee/exemption risk, what response timeline VDOE states, and what follow-up I should expect next. 
Put each of the drafts as separate documents in CryptoPad Documents for me to review.","website":"https://vaedu.nextrequest.com","level":"hard","reference_length":9,"precomputed_rubric":{"items":[{"criterion":"The session verifies the live VDOE FOIA rules and VDOE NextRequest workflow, including submission options, stated response timeline, fee rules, and relevant exemption or privacy limits.","description":"The session verifies the live VDOE FOIA rules and VDOE NextRequest workflow, including submission options, stated response timeline, fee rules, and relevant exemption or privacy limits.\n\nHow a grader verifies this: Browser evidence shows the VDOE FOIA page and the VDOE NextRequest portal were opened, and the final memo accurately summarizes those live rules and workflow details.","max_points":170},{"criterion":"The session checks the public VDOE site for whether likely records are already available before filing, rather than assuming FOIA is necessary.","description":"The session checks the public VDOE site for whether likely records are already available before filing, rather than assuming FOIA is necessary.\n\nHow a grader verifies this: Opened public VDOE pages are used as evidence of record categories or already-published materials, and the memo distinguishes between what appears public versus what still seems FOIA-worthy.","max_points":140},{"criterion":"Exactly 4 plausible VDOE record-request angles are identified, and for each one the work notes the public clue that the records exist, the most relevant office or custodian, narrowing choices that could reduce fees, and any obvious exemption or FERPA/privacy concern.","description":"Exactly 4 plausible VDOE record-request angles are identified, and for each one the work notes the public clue that the records exist, the most relevant office or custodian, narrowing choices that could reduce fees, and any obvious exemption or FERPA/privacy concern.\n\nHow a grader verifies this: The final output lists 4 
distinct request angles with all four requested elements for each, supported by the public pages visited.","max_points":200},{"criterion":"From those 4 angles, exactly 2 strongest candidates are selected and each is turned into a complete draft FOIA request with a clear date range, specific record description, preferred electronic delivery, and a request for a cost estimate before charges proceed, and each of the 2 draft requests is placed in its own CryptoPad Document for review.","description":"From those 4 angles, exactly 2 strongest candidates are selected and each is turned into a complete draft FOIA request with a clear date range, specific record description, preferred electronic delivery, and a request for a cost estimate before charges proceed, and each of the 2 draft requests is placed in its own CryptoPad Document for review.\n\nHow a grader verifies this: The final output contains 2 full draft requests and each includes all four requested drafting elements; the final deliverables include 2 separate CryptoPad Documents containing those full draft requests.","max_points":190},{"criterion":"The most useful evidence tabs are kept open, including the VDOE FOIA page, the NextRequest new-request page, and the key public pages supporting the 2 finalist request ideas.","description":"The most useful evidence tabs are kept open, including the VDOE FOIA page, the NextRequest new-request page, and the key public pages supporting the 2 finalist request ideas.\n\nHow a grader verifies this: The browser state at the end includes those required pages open and visibly available for review.","max_points":120},{"criterion":"The better of the 2 drafts is carried through the VDOE NextRequest workflow to the final submission or confirmation step if reachable on public pages, and the task ends with a concise comparison memo explaining the chosen draft, the usefulness-versus-fee/exemption tradeoff, VDOE’s stated timeline, and expected follow-up.","description":"The better of 
the 2 drafts is carried through the VDOE NextRequest workflow to the final submission or confirmation step if reachable on public pages, and the task ends with a concise comparison memo explaining the chosen draft, the usefulness-versus-fee/exemption tradeoff, VDOE’s stated timeline, and expected follow-up.\n\nHow a grader verifies this: A final portal step or confirmation page is left open when reachable, and the closing memo covers all four requested comparison points.","max_points":180}]}} +{"task_id":"a8a95699fe40ca1439fb714dcba8e01b022b3e6a","confirmed_task":"I’m trying to decide whether Silver Cross is actually a strong baby brand to buy from for a future first-child stroller setup, not just whether the marketing sounds nice, so please do a serious browser-based comparison that would help me make the call. Start by identifying one current full-size stroller or travel-system option from Silver Cross that seems representative of the brand, then compare it against at least 5 competing premium or upper-midrange brands that people realistically cross-shop, using public product pages plus independent reviews and owner feedback. For Silver Cross specifically, I want you to gather at least 6 concrete reputation or quality claims from independent sources, with at least 3 positives and at least 2 meaningful drawbacks, and make sure they are specific things like build quality, ride quality, fold, weight, fabric quality, durability, customer service, or long-term value rather than vague praise. Then build a side-by-side comparison across the 6 total brands for the factors a real buyer would care about: approximate price, stroller weight, fold or portability, newborn compatibility, car-seat or travel-system ecosystem, warranty or after-sales support, and any clearly stated safety or recall context from public pages; if a field is not shown, record it as not shown. 
As you work, keep the most useful evidence tabs open, including the Silver Cross product page, at least 3 independent review pages discussing Silver Cross, at least 2 competitor product pages, and at least 1 public safety or recall-check page so I can verify the reasoning afterward. After that, synthesize everything into one clear recommendation that answers three questions: whether Silver Cross seems like a genuinely good brand overall, what kind of buyer it fits best versus who should skip it, and whether I’d be better off buying Silver Cross or one of the compared alternatives for a practical everyday stroller setup.","website":"https://www.silvercrossbaby.com","level":"hard","reference_length":6,"precomputed_rubric":{"items":[{"criterion":"The browsing session identifies 1 current representative Silver Cross full-size stroller or travel-system option and compares it against at least 5 competing premium or upper-midrange brands, for 6 total brands.","description":"The browsing session identifies 1 current representative Silver Cross full-size stroller or travel-system option and compares it against at least 5 competing premium or upper-midrange brands, for 6 total brands.\n\nHow a grader verifies this: Final comparison includes the Silver Cross model plus at least 5 named competing brands/models drawn from public product pages.","max_points":180},{"criterion":"For Silver Cross, at least 6 concrete reputation or quality claims are gathered from independent sources, including at least 3 positives and at least 2 meaningful drawbacks.","description":"For Silver Cross, at least 6 concrete reputation or quality claims are gathered from independent sources, including at least 3 positives and at least 2 meaningful drawbacks.\n\nHow a grader verifies this: Final synthesis cites independent review or owner-feedback pages for the Silver Cross claims, and the claims are specific attributes such as build quality, ride quality, fold, weight, fabric quality, durability, 
customer service, or value.","max_points":220},{"criterion":"A side-by-side comparison is produced across the 6 total brands for approximate price, stroller weight, fold or portability, newborn compatibility, car-seat or travel-system ecosystem, warranty or after-sales support, and any clearly stated safety or recall context from public pages, using 'not shown' where needed.","description":"A side-by-side comparison is produced across the 6 total brands for approximate price, stroller weight, fold or portability, newborn compatibility, car-seat or travel-system ecosystem, warranty or after-sales support, and any clearly stated safety or recall context from public pages, using 'not shown' where needed.\n\nHow a grader verifies this: The final comparison covers all requested factors for all 6 brands and uses 'not shown' for missing fields rather than omitting them.","max_points":220},{"criterion":"Useful browser evidence is kept open, including the Silver Cross product page, at least 3 independent review pages discussing Silver Cross, at least 2 competitor product pages, and at least 1 public safety or recall-check page.","description":"Useful browser evidence is kept open, including the Silver Cross product page, at least 3 independent review pages discussing Silver Cross, at least 2 competitor product pages, and at least 1 public safety or recall-check page.\n\nHow a grader verifies this: Open tabs at the end visibly include the requested categories and counts of evidence pages.","max_points":140},{"criterion":"The final recommendation explicitly answers whether Silver Cross seems like a genuinely good brand overall, what kind of buyer it fits best versus who should skip it, and whether Silver Cross or one of the compared alternatives is the better choice for a practical everyday stroller setup.","description":"The final recommendation explicitly answers whether Silver Cross seems like a genuinely good brand overall, what kind of buyer it fits best versus who 
should skip it, and whether Silver Cross or one of the compared alternatives is the better choice for a practical everyday stroller setup.\n\nHow a grader verifies this: Closing synthesis addresses all three decision questions directly and ties the recommendation to the gathered comparison evidence.","max_points":160},{"criterion":"The work relies on public product pages plus independent reviews and owner feedback rather than brand marketing alone.","description":"The work relies on public product pages plus independent reviews and owner feedback rather than brand marketing alone.\n\nHow a grader verifies this: The reasoning incorporates both official product information and non-brand public sources, and the Silver Cross brand assessment is not based solely on Silver Cross pages.","max_points":80}]}} +{"task_id":"a24232e8f6e7e654f09be4e219e692af4fec62a5","confirmed_task":"I’m considering booking easyJet for a future Europe trip, but before I do I want a serious browser-based risk check rather than just a couple of random anecdotes. Please research easyJet complaints across public sources and build me a practical picture of what goes wrong most often and whether the airline’s published support and passenger-rights information actually covers those situations. Start by finding at least 18 distinct recent customer complaints from at least 4 different public sources, and group them into the biggest recurring themes like delays, cancellations, refunds, baggage, check-in, customer service, or anything else that clearly shows up. For each complaint you keep, note the topic, where it was posted, and a short plain-English summary of what happened. Then open easyJet’s own public help pages for the main complaint themes you found and compare what the airline says customers should do in those situations, including contact/help options and any refund, disruption, or baggage guidance that is publicly available. 
After that, check at least 3 independent public passenger-rights or consumer-guidance sources so I can see what travelers may actually be entitled to when flights are disrupted or problems occur. Keep the most useful evidence tabs open from both the complaint sources and the official/help pages so I can inspect them myself. Finally, give me one organized decision memo that covers: the top recurring complaint themes, how often each theme appeared in your 18-plus examples, what easyJet’s own pages say about those issues, where outside guidance seems to support or contradict the practical customer experience, and your bottom-line judgment on whether easyJet looks acceptable for a budget-conscious traveler, risky unless the fare savings are large, or worth avoiding for certain trip types.","website":"https://www.easyjet.com","level":"hard","reference_length":3,"precomputed_rubric":{"items":[{"criterion":"The final memo uses at least 18 distinct easyJet customer complaints drawn from at least 4 different public sources.","description":"The final memo uses at least 18 distinct easyJet customer complaints drawn from at least 4 different public sources.\n\nHow a grader verifies this: Count the complaint examples and confirm the cited/publicly named sources total 4 or more distinct sources.","max_points":180},{"criterion":"Each retained complaint includes the complaint topic, where it was posted, and a short plain-English summary of what happened.","description":"Each retained complaint includes the complaint topic, where it was posted, and a short plain-English summary of what happened.\n\nHow a grader verifies this: Review the complaint list and confirm every example has all three requested fields.","max_points":160},{"criterion":"The complaints are grouped into recurring themes, and the final memo reports the top recurring complaint themes with how often each theme appeared in the 18-plus examples.","description":"The complaints are grouped into recurring themes, and the 
final memo reports the top recurring complaint themes with how often each theme appeared in the 18-plus examples.\n\nHow a grader verifies this: Check that the memo contains named complaint categories and a frequency count for each major theme based on the collected examples.","max_points":180},{"criterion":"EasyJet’s own public help pages are opened and used for the main complaint themes found, including public contact/help information and any publicly available refund, disruption, or baggage guidance relevant to those themes.","description":"EasyJet’s own public help pages are opened and used for the main complaint themes found, including public contact/help information and any publicly available refund, disruption, or baggage guidance relevant to those themes.\n\nHow a grader verifies this: Confirm the final memo references easyJet help content for the major themes and that relevant official pages were opened during the session.","max_points":160},{"criterion":"At least 3 independent public passenger-rights or consumer-guidance sources are checked and incorporated into the analysis of what travelers may be entitled to when problems occur.","description":"At least 3 independent public passenger-rights or consumer-guidance sources are checked and incorporated into the analysis of what travelers may be entitled to when problems occur.\n\nHow a grader verifies this: Count the independent non-easyJet guidance sources cited or summarized in the memo and confirm there are at least 3.","max_points":120},{"criterion":"The most useful evidence tabs are kept open from both complaint sources and official/help pages so the user can inspect the evidence directly.","description":"The most useful evidence tabs are kept open from both complaint sources and official/help pages so the user can inspect the evidence directly.\n\nHow a grader verifies this: Inspect the final browser state and confirm that representative complaint-source tabs and easyJet official/help tabs remain 
open.","max_points":80},{"criterion":"The final organized decision memo gives a bottom-line judgment on whether easyJet looks acceptable for a budget-conscious traveler, risky unless fare savings are large, or worth avoiding for certain trip types, and it explains that judgment using the complaint patterns and policy/guidance comparison.","description":"The final organized decision memo gives a bottom-line judgment on whether easyJet looks acceptable for a budget-conscious traveler, risky unless fare savings are large, or worth avoiding for certain trip types, and it explains that judgment using the complaint patterns and policy/guidance comparison.\n\nHow a grader verifies this: Check that the memo ends with one of the requested decision outcomes and that the reasoning explicitly ties back to complaint themes, easyJet’s published pages, and outside guidance.","max_points":120}]}} +{"task_id":"4523f5c4b7d3a82e73209a447340dd7d46c53907","confirmed_task":"I’m planning a future downtown Pittsburgh stay and I’ll be driving, so I don’t just want to know whether the Fairmont Pittsburgh has parking — I want to know whether it’s actually a smart hotel choice once parking and total convenience are factored in. Start with Fairmont Pittsburgh and then compare it against 7 other well-reviewed upscale or upper-upscale hotels in downtown Pittsburgh or the immediately adjacent core, so I end up with 8 total hotels. For each hotel, use the official property site first and record whether parking is on-site, whether it’s valet or self-parking, whether the parking is free or paid, the stated nightly fee if shown, and any publicly stated details like in/out privileges, oversized vehicle limits, EV charging, or nearby garage arrangements; if an official page leaves something unclear, cross-check one public booking or listing page and mark anything still unavailable as \"not shown\" instead of guessing. 
Then use maps and hotel photo pages to sanity-check the arrival setup for each place — things like whether the entrance and garage situation look straightforward, whether it seems attached or off-site, and how walkable the hotel is to at least 3 downtown anchors: Market Square, PPG Paints Arena, and PNC Park. After that, compare the 8 hotels on driver convenience and likely all-in nightly cost, using one sample future 2-night stay price from a public booking flow or listing page for each hotel when available, and make me a ranked top-5 shortlist for a driver who wants a nice stay without getting killed on parking or ending up with awkward car logistics. Keep the Fairmont parking/location page open, keep open the exact parking or hotel details pages for the 3 strongest alternatives, and finish with a concise decision memo that tells me whether Fairmont Pittsburgh looks competitive, overpriced once parking is added, or worth it for the location and overall experience.","website":"https://www.fairmont.com","level":"hard","reference_length":5,"precomputed_rubric":{"items":[{"criterion":"The work compares exactly 8 total hotels: Fairmont Pittsburgh plus 7 other well-reviewed upscale or upper-upscale hotels in downtown Pittsburgh or the immediately adjacent core.","description":"The work compares exactly 8 total hotels: Fairmont Pittsburgh plus 7 other well-reviewed upscale or upper-upscale hotels in downtown Pittsburgh or the immediately adjacent core.\n\nHow a grader verifies this: The final comparison clearly lists 8 hotels by name, with Fairmont Pittsburgh included, and each hotel is within the stated geographic and quality scope.","max_points":170},{"criterion":"For each of the 8 hotels, the result records whether parking is on-site, whether it is valet or self-parking, whether it is free or paid, the stated nightly fee if shown, and any publicly stated details such as in/out privileges, oversized vehicle limits, EV charging, or nearby garage arrangements, using the 
official property site first and marking unresolved items as \"not shown\" if needed.","description":"For each of the 8 hotels, the result records whether parking is on-site, whether it is valet or self-parking, whether it is free or paid, the stated nightly fee if shown, and any publicly stated details such as in/out privileges, oversized vehicle limits, EV charging, or nearby garage arrangements, using the official property site first and marking unresolved items as \"not shown\" if needed.\n\nHow a grader verifies this: Each hotel entry contains the requested parking fields, shows that the official property site was used first, and uses \"not shown\" rather than unsupported guesses for missing details.","max_points":230},{"criterion":"When official pages are unclear, one public booking or listing page is used to cross-check the missing parking or stay details for the affected hotels.","description":"When official pages are unclear, one public booking or listing page is used to cross-check the missing parking or stay details for the affected hotels.\n\nHow a grader verifies this: At least the hotels with incomplete official information show a secondary public source check, and the final notes distinguish between official details and cross-checked listing details.","max_points":120},{"criterion":"Each of the 8 hotels is sanity-checked with maps and hotel photo pages for arrival setup and walkability, including how straightforward the entrance/garage situation appears and how walkable the hotel is to Market Square, PPG Paints Arena, and PNC Park.","description":"Each of the 8 hotels is sanity-checked with maps and hotel photo pages for arrival setup and walkability, including how straightforward the entrance/garage situation appears and how walkable the hotel is to Market Square, PPG Paints Arena, and PNC Park.\n\nHow a grader verifies this: Every hotel entry includes arrival/logistics observations plus walkability notes covering all 3 named downtown 
anchors.","max_points":160},{"criterion":"The comparison includes likely all-in nightly cost for each of the 8 hotels, using one sample future 2-night stay price from a public booking flow or listing page when available, so parking can be evaluated alongside room cost.","description":"The comparison includes likely all-in nightly cost for each of the 8 hotels, using one sample future 2-night stay price from a public booking flow or listing page when available, so parking can be evaluated alongside room cost.\n\nHow a grader verifies this: Each hotel has a sample stay price or a clearly marked \"not shown\" if unavailable, and the final comparison uses those figures alongside parking terms rather than treating parking in isolation.","max_points":140},{"criterion":"A ranked top-5 shortlist is produced for a driver who wants a nice stay without excessive parking cost or awkward car logistics, and it explicitly states whether Fairmont Pittsburgh looks competitive, overpriced once parking is added, or worth it for the location and experience.","description":"A ranked top-5 shortlist is produced for a driver who wants a nice stay without excessive parking cost or awkward car logistics, and it explicitly states whether Fairmont Pittsburgh looks competitive, overpriced once parking is added, or worth it for the location and experience.\n\nHow a grader verifies this: The final output contains a ranked top 5 and an explicit Fairmont conclusion in one of the requested forms, supported by the comparison findings.","max_points":120},{"criterion":"The browser is left with the Fairmont parking/location page open and the exact parking or hotel details pages for the 3 strongest alternative hotels also open.","description":"The browser is left with the Fairmont parking/location page open and the exact parking or hotel details pages for the 3 strongest alternative hotels also open.\n\nHow a grader verifies this: Open tabs at the end visibly include the Fairmont evidence page plus 3 
alternative evidence pages that correspond to the final recommendation.","max_points":60}]}} +{"task_id":"86cc69e23296a471c4e9e3da30d63ff54f31665f","confirmed_task":"I’m trying to turn a dialogue-heavy script into a polished text-to-speech demo with distinct voices for each identifiable character, and before I start paying for anything I want a serious browser-based casting and platform check. Start with ElevenLabs’ public site and use only public pages to figure out whether it looks practical for a multi-character script workflow: review the relevant docs or help pages, pricing or plan pages, and any public voice-library or sample pages that help show how character voice selection would work. Then build me a casting shortlist of exactly 12 candidate ElevenLabs voices total, spread across 6 character slots of your choice that would make sense for a typical script cast—for example narrator, lead male, lead female, older character, younger character, and one wildcard or villain role—with 2 candidate voices per slot. For each of the 12, capture the voice name, apparent gender or style if shown, accent or language if shown, the reason it fits that slot, and mark anything missing as not shown. Keep the most useful public voice pages open so I can compare them later.\n\nAfter that, compare ElevenLabs against exactly 3 other public text-to-speech platforms that look relevant for multi-speaker character work. On public pages only, check whether each one appears to support the kinds of things I’d care about for this project: multiple voices, dialogue or long-form narration suitability, emotional or style control if shown, script or project organization features if shown, and pricing or usage limits if shown. I do not need private trials or sign-ins; if something is unclear, just say not shown. 
Keep the key pricing or feature pages open for the strongest alternatives.\n\nFinally, put everything into one organized decision memo or document I could actually use before opening an account: a short recommendation on whether ElevenLabs is the best choice for this kind of script, the 6 role slots with the 12-voice shortlist, a side-by-side comparison of the 4 total platforms, the biggest risks or unknowns from public information, and a simple first-pass voice assignment recommendation for the 6 roles. Leave the finished memo open at the end, and also leave open the most useful ElevenLabs voice pages, the most relevant ElevenLabs pricing/docs pages, and the best comparison pages from the alternative platforms.","website":"https://elevenlabs.io","level":"hard","reference_length":6,"precomputed_rubric":{"items":[{"criterion":"The final memo concludes whether ElevenLabs appears practical for a multi-character script workflow using public evidence from its docs/help pages, pricing/plan pages, and relevant public voice or sample pages.","description":"The final memo concludes whether ElevenLabs appears practical for a multi-character script workflow using public evidence from its docs/help pages, pricing/plan pages, and relevant public voice or sample pages.\n\nHow a grader verifies this: Check that the memo includes a clear recommendation about ElevenLabs and cites or is visibly grounded in opened public ElevenLabs documentation/help, pricing, and voice/sample pages.","max_points":180},{"criterion":"The memo includes exactly 6 character slots and exactly 12 total ElevenLabs voice candidates, with 2 candidate voices per slot.","description":"The memo includes exactly 6 character slots and exactly 12 total ElevenLabs voice candidates, with 2 candidate voices per slot.\n\nHow a grader verifies this: Count the role slots and voice entries in the memo and confirm the 6-by-2 structure is followed exactly.","max_points":200},{"criterion":"Each of the 12 ElevenLabs voice 
candidates includes the voice name, apparent gender or style if shown, accent or language if shown, a fit rationale for the slot, and 'not shown' where information is missing.","description":"Each of the 12 ElevenLabs voice candidates includes the voice name, apparent gender or style if shown, accent or language if shown, a fit rationale for the slot, and 'not shown' where information is missing.\n\nHow a grader verifies this: Review each voice entry in the memo and confirm all requested fields are present, using 'not shown' where needed rather than inventing details.","max_points":160},{"criterion":"The most useful public ElevenLabs voice pages are kept open so the shortlisted voices can be compared later.","description":"The most useful public ElevenLabs voice pages are kept open so the shortlisted voices can be compared later.\n\nHow a grader verifies this: Inspect the remaining browser tabs and confirm relevant public ElevenLabs voice pages for shortlisted candidates are still open.","max_points":120},{"criterion":"ElevenLabs is compared against exactly 3 other public text-to-speech platforms, for 4 total platforms, on the requested dimensions: multiple voices, dialogue or long-form narration suitability, emotional or style control if shown, script or project organization features if shown, and pricing or usage limits if shown.","description":"ElevenLabs is compared against exactly 3 other public text-to-speech platforms, for 4 total platforms, on the requested dimensions: multiple voices, dialogue or long-form narration suitability, emotional or style control if shown, script or project organization features if shown, and pricing or usage limits if shown.\n\nHow a grader verifies this: Check the comparison section of the memo and confirm there are exactly 4 platforms total and that each requested comparison dimension is addressed or marked not shown.","max_points":180},{"criterion":"The key pricing or feature pages for the strongest alternative platforms, 
along with the most relevant ElevenLabs pricing/docs pages, are left open as browser evidence.","description":"The key pricing or feature pages for the strongest alternative platforms, along with the most relevant ElevenLabs pricing/docs pages, are left open as browser evidence.\n\nHow a grader verifies this: Inspect the open tabs and confirm that the memo’s recommended or strongest comparison pages remain open for ElevenLabs and the alternative platforms.","max_points":80},{"criterion":"The finished memo is left open and includes a short overall recommendation, the 6-role casting shortlist, the 4-platform comparison, the biggest risks or unknowns from public information, and a simple first-pass voice assignment recommendation for the 6 roles.","description":"The finished memo is left open and includes a short overall recommendation, the 6-role casting shortlist, the 4-platform comparison, the biggest risks or unknowns from public information, and a simple first-pass voice assignment recommendation for the 6 roles.\n\nHow a grader verifies this: Open the final memo and confirm all five requested sections are present and populated.","max_points":80}]}} +{"task_id":"45408d6942b6c26bd37b4675e183993602b940bb","confirmed_task":"I want help turning a cheap-flights idea into a real decision for a future 7-night holiday in late November from the UK. Please start on Skyscanner and do a broad search from the UK, using a UK-wide departure search if the site supports it or else checking 4 major departure airports: London, Manchester, Birmingham, and Edinburgh. Scan exactly 16 destination candidates for a 7-night round-trip economy trip in late November, then narrow them to exactly 4 destinations that look like genuinely good holiday options, not just random cheap fares. 
For each of the 4 finalists, keep the Skyscanner results page open and record the destination city and airport, the departure airport used, the lowest displayed round-trip price, and any obvious routing drawback shown on the results page. Please treat a fare of £250 or less as strong value, and if none of the best options for a finalist are under £250, note that clearly instead of forcing it. Then sanity-check each finalist on public pages by opening 2 lodging options per destination and using map/photos to judge whether a full week there looks realistic on a moderate budget, with a target of no more than about £140 per night if possible; if that is not available, say so. Also open 1 reliable weather or climate page per finalist so I can see what late November is likely to feel like. After that, compare the 4 finalists side by side on flight cost, lodging realism, likely weather, and the kind of trip each suits best—beach, city break, nature, or mixed sightseeing. Finish by recommending exactly 1 best overall choice, 1 best budget pick, and 1 best warm-weather option, and leave the 4 Skyscanner tabs plus the most useful hotel and weather tabs open so I can review the evidence.","website":"https://www.skyscanner.net","level":"hard","reference_length":7,"precomputed_rubric":{"items":[{"criterion":"The session performs a broad Skyscanner discovery sweep covering exactly 16 destination candidates for a 7-night late-November round-trip economy trip from the UK, using a UK-wide departure search if supported or else the 4 specified departure airports: London, Manchester, Birmingham, and Edinburgh.","description":"The session performs a broad Skyscanner discovery sweep covering exactly 16 destination candidates for a 7-night late-November round-trip economy trip from the UK, using a UK-wide departure search if supported or else the 4 specified departure airports: London, Manchester, Birmingham, and Edinburgh.\n\nHow a grader verifies this: The final output states that 
exactly 16 candidates were scanned and indicates whether the search used a UK-wide departure option or the 4 named airports, with browser evidence showing a broad discovery process rather than only four destinations.","max_points":180},{"criterion":"Exactly 4 finalist destinations are selected, and for each finalist the result includes the destination city and airport, the departure airport used, the lowest displayed round-trip price, and any obvious routing drawback shown on the Skyscanner results page.","description":"Exactly 4 finalist destinations are selected, and for each finalist the result includes the destination city and airport, the departure airport used, the lowest displayed round-trip price, and any obvious routing drawback shown on the Skyscanner results page.\n\nHow a grader verifies this: The final comparison lists exactly four finalists with all requested flight fields, and the corresponding four Skyscanner results pages remain open as evidence.","max_points":240},{"criterion":"The flight assessment explicitly applies the stated value threshold by identifying whether each finalist has a fare of £250 or less, or clearly noting when the best option for that finalist is above £250.","description":"The flight assessment explicitly applies the stated value threshold by identifying whether each finalist has a fare of £250 or less, or clearly noting when the best option for that finalist is above £250.\n\nHow a grader verifies this: Each of the four finalist entries includes a clear value note tied to the £250 threshold, with no missing assessment.","max_points":120},{"criterion":"Each of the 4 finalists is sanity-checked with exactly 2 public lodging options that provide enough map and/or photo context to judge whether a full week there looks realistic on a moderate budget, using the target of about £140 per night if possible and stating when that target is not achievable.","description":"Each of the 4 finalists is sanity-checked with exactly 2 public 
lodging options that provide enough map and/or photo context to judge whether a full week there looks realistic on a moderate budget, using the target of about £140 per night if possible and stating when that target is not achievable.\n\nHow a grader verifies this: The final output includes two lodging checks for each finalist with nightly-price context and a clear note on whether the roughly £140 per night target appears feasible, supported by relevant hotel or lodging tabs.","max_points":170},{"criterion":"Each of the 4 finalists is also checked against exactly 1 reliable public weather or climate page for late November.","description":"Each of the 4 finalists is also checked against exactly 1 reliable public weather or climate page for late November.\n\nHow a grader verifies this: The final comparison includes late-November weather or climate context for all four finalists, with one supporting weather or climate source referenced for each and useful tabs left open.","max_points":110},{"criterion":"The 4 finalists are compared side by side on the four requested decision factors: flight cost, lodging realism, likely weather, and whether the destination is best suited to beach, city break, nature, or mixed sightseeing.","description":"The 4 finalists are compared side by side on the four requested decision factors: flight cost, lodging realism, likely weather, and whether the destination is best suited to beach, city break, nature, or mixed sightseeing.\n\nHow a grader verifies this: The final synthesis presents a structured four-way comparison that covers all four factors for every finalist, not just separate destination notes.","max_points":100},{"criterion":"The session ends with exactly 1 best overall choice, 1 best budget pick, and 1 best warm-weather option, and leaves open the 4 Skyscanner finalist tabs plus the most useful hotel and weather tabs for review.","description":"The session ends with exactly 1 best overall choice, 1 best budget pick, and 1 best 
warm-weather option, and leaves open the 4 Skyscanner finalist tabs plus the most useful hotel and weather tabs for review.\n\nHow a grader verifies this: The final recommendation names all three requested picks, and the browser state shows the four finalist flight tabs and selected lodging/weather evidence tabs still open.","max_points":80}]}} +{"task_id":"985d8f98bc40053112f164b8d6dd010db1dcfd10","confirmed_task":"I need a serious browser-based weekly briefing on Brazil’s finance and macroeconomic news, not just a quick skim of one portal. Please use public pages to find and compare at least 18 finance-related items published within the past 7 days from a mix of major Brazilian news outlets and official sources such as the central bank, finance ministry, statistics agency, stock exchange, or major banks’ research pages whenever relevant. For each item, capture the headline, publication date, source, link, and a short English summary or translation of the key point. Deduplicate overlapping coverage, then narrow the set to the 10 most important developments and group them into the major themes driving the week, such as inflation, rates, fiscal policy, currency, trade, markets, commodities, regulation, or corporate finance. For each of those 10, explain why it matters inside Brazil and whether it could plausibly affect the United States through trade, commodities, capital flows, inflation, supply chains, multinational earnings, or broader market sentiment; if there is no clear U.S. angle, say so. Keep the most useful evidence tabs open, including at least 6 source pages that show the strongest or most representative stories and at least 2 official-source pages used for verification. 
Then produce one organized briefing document in CryptoPad Documents with three parts: a source log of the 18+ items, a ranked top-10 summary in English, and a final section that highlights the 3 to 5 developments most likely to matter to a U.S.-based investor, policymaker, or business reader. ","website":"https://www.globo.com","level":"hard","reference_length":6,"precomputed_rubric":{"items":[{"criterion":"A briefing document is created and organized into the three requested parts: a source log of at least 18 finance-related items from the past 7 days, a ranked top-10 summary in English, and a final section highlighting the 3 to 5 developments most likely to matter to a U.S.-based investor, policymaker, or business reader, and the briefing is created in CryptoPad Documents and left open for review.","description":"A briefing document is created and organized into the three requested parts: a source log of at least 18 finance-related items from the past 7 days, a ranked top-10 summary in English, and a final section highlighting the 3 to 5 developments most likely to matter to a U.S.-based investor, policymaker, or business reader, and the briefing is created in CryptoPad Documents and left open for review.\n\nHow a grader verifies this: Check that the final document exists, is left accessible, and contains all three sections with the requested counts and structure; the open deliverable is a CryptoPad Document containing the requested three-part briefing structure.","max_points":200},{"criterion":"The source log includes at least 18 qualifying items from the past 7 days drawn from a mix of major Brazilian news outlets and official sources, and each item records the headline, publication date, source, link, and a short English summary or translation.","description":"The source log includes at least 18 qualifying items from the past 7 days drawn from a mix of major Brazilian news outlets and official sources, and each item records the headline, publication date, 
source, link, and a short English summary or translation.\n\nHow a grader verifies this: Review the source log entries for count, date window, source diversity, and the presence of all required fields for each item.","max_points":200},{"criterion":"Overlapping coverage is deduplicated and the research is narrowed to exactly 10 most important developments, grouped into the major themes requested such as inflation, rates, fiscal policy, currency, trade, markets, commodities, regulation, or corporate finance.","description":"Overlapping coverage is deduplicated and the research is narrowed to exactly 10 most important developments, grouped into the major themes requested such as inflation, rates, fiscal policy, currency, trade, markets, commodities, regulation, or corporate finance.\n\nHow a grader verifies this: Inspect the ranked top-10 section to confirm there are exactly 10 developments, that duplicates are consolidated, and that the items are grouped into coherent major themes.","max_points":160},{"criterion":"Each of the 10 most important developments includes an English explanation of why it matters inside Brazil and whether it could plausibly affect the United States through trade, commodities, capital flows, inflation, supply chains, multinational earnings, or broader market sentiment, with 'no clear U.S. angle' stated when appropriate.","description":"Each of the 10 most important developments includes an English explanation of why it matters inside Brazil and whether it could plausibly affect the United States through trade, commodities, capital flows, inflation, supply chains, multinational earnings, or broader market sentiment, with 'no clear U.S. angle' stated when appropriate.\n\nHow a grader verifies this: Check each top-10 entry for both a Brazil significance note and a U.S.-impact assessment aligned to the requested channels or an explicit statement that no clear U.S. 
angle exists.","max_points":180},{"criterion":"The most useful evidence tabs are kept open, including at least 6 source pages showing the strongest or most representative stories and at least 2 official-source pages used for verification.","description":"The most useful evidence tabs are kept open, including at least 6 source pages showing the strongest or most representative stories and at least 2 official-source pages used for verification.\n\nHow a grader verifies this: Confirm that the browser still has open tabs meeting the requested minimums for representative source pages and official verification pages.","max_points":120},{"criterion":"The final section clearly identifies 3 to 5 developments most likely to matter to a U.S.-based investor, policymaker, or business reader and explains why those were prioritized over the other stories.","description":"The final section clearly identifies 3 to 5 developments most likely to matter to a U.S.-based investor, policymaker, or business reader and explains why those were prioritized over the other stories.\n\nHow a grader verifies this: Review the final section for a shortlist of 3 to 5 items with explicit prioritization reasoning tied to U.S.-based readers.","max_points":140}]}} +{"task_id":"a5a6aae50363919e5500634aa0f97211c3d06feb","confirmed_task":"I’m trying to figure out the best 5-day camper-van road trip in Iceland for a future trip, and I don’t want a generic blog itinerary. Please use public sources to compare the most realistic 5-day options first, then build one final route that focuses on the famous highlights but is still reasonable to drive in a camper van. Start by checking a campground source like Tjalda plus maps and a few reputable Iceland travel pages so you can decide whether this should be a South Coast-heavy loop, a Golden Circle plus South Coast plan, or another clearly better 5-day route. 
Once you’ve chosen the best route, make me a day-by-day itinerary for exactly 5 days with the driving sequence, estimated driving time for each day, and the main stops and attractions in order. Include at least 12 total sightseeing stops across the trip, but keep the plan realistic rather than cramming everything in. For each night, verify one legal campground option on a public page and note the location plus any obvious camper-relevant details like showers, power, season, or check-in rules; if a detail is missing, say not shown. Also flag any day that looks especially weather- or road-sensitive and give 2 backup substitutions for the whole trip in case conditions are poor. Please keep the key evidence visible in the browser by leaving open the final map view for the chosen route, the 5 campground pages you relied on, and 3 especially useful attraction or planning pages with photos or visitor info so I can sanity-check the plan afterward. End with a concise recommendation explaining why this is the best 5-day camper-van version of Iceland rather than trying to overreach into an unrealistic Ring Road sprint.","website":"https://tjalda.is","level":"hard","reference_length":21,"precomputed_rubric":{"items":[{"criterion":"The final plan chooses one clear 5-day Iceland camper-van route after comparing the most realistic route shapes and explains why that route was selected.","description":"The final plan chooses one clear 5-day Iceland camper-van route after comparing the most realistic route shapes and explains why that route was selected.\n\nHow a grader verifies this: Check that the final answer explicitly compares plausible 5-day route options and then names one chosen route with a short justification tied to camper-van practicality and famous highlights.","max_points":180},{"criterion":"The itinerary covers exactly 5 days and gives the driving sequence, estimated driving time for each day, and the main stops and attractions in order.","description":"The itinerary 
covers exactly 5 days and gives the driving sequence, estimated driving time for each day, and the main stops and attractions in order.\n\nHow a grader verifies this: Check that there are exactly 5 day sections and that each includes route order plus an estimated driving time and ordered stops.","max_points":200},{"criterion":"The itinerary includes at least 12 total sightseeing stops across the trip while keeping the plan realistic.","description":"The itinerary includes at least 12 total sightseeing stops across the trip while keeping the plan realistic.\n\nHow a grader verifies this: Count the named sightseeing stops across all 5 days and confirm there are 12 or more, with no sign that the route contradicts the stated driving feasibility.","max_points":140},{"criterion":"Each of the 5 nights includes one verified legal campground option from a public page, with location and any obvious camper-relevant details such as showers, power, season, or check-in rules; missing details are marked not shown.","description":"Each of the 5 nights includes one verified legal campground option from a public page, with location and any obvious camper-relevant details such as showers, power, season, or check-in rules; missing details are marked not shown.\n\nHow a grader verifies this: Check that there are 5 overnight campground entries, each tied to a public campground page and containing the requested details or an explicit not shown note.","max_points":200},{"criterion":"The plan flags any especially weather- or road-sensitive day and provides 2 backup substitutions for the trip in case conditions are poor.","description":"The plan flags any especially weather- or road-sensitive day and provides 2 backup substitutions for the trip in case conditions are poor.\n\nHow a grader verifies this: Check that at least one sensitive day is identified when applicable and that exactly 2 backup substitutions are included and clearly connected to the 
itinerary.","max_points":120},{"criterion":"Key browser evidence is left open: the final map view for the chosen route, the 5 campground pages used, and 3 useful attraction or planning pages with photos or visitor information.","description":"Key browser evidence is left open: the final map view for the chosen route, the 5 campground pages used, and 3 useful attraction or planning pages with photos or visitor information.\n\nHow a grader verifies this: Check open tabs or end-state evidence for 1 route map, 5 campground pages, and 3 attraction/planning pages that match the itinerary.","max_points":160}]}} +{"task_id":"148dc4d3bebc57698033c4189935c3e5be4f38c1","confirmed_task":"I’m seriously thinking about buying a robot vacuum soon, and I don’t just want three random YouTube picks — I want a browser-based shortlist I could actually use to choose the right one for 2026. Please start on YouTube and find 6 strong video reviews or roundups about the best robot vacuums for 2026, with at least 3 from channels that appear to do hands-on testing rather than generic affiliate slideshows. From those videos, pull out the models that come up most often and build a candidate set of exactly 9 robot vacuums across 3 price tiers: 3 budget, 3 mid-range, and 3 premium. Then verify each candidate on its official product page and on at least one major retailer listing so I can see that it’s a real, current product with live pricing or 'not shown' if the price isn’t visible. For each of the 9, compare the things I’d actually care about for a normal home with mixed floors and some hair pickup: vacuuming performance, mopping approach if it has one, obstacle avoidance, auto-empty or dock features, battery/runtime claims, and whether replacement parts or consumables are easy to find on public pages. After that, narrow the 9 down to a final shortlist of 4 models: best overall, best value, best for pet hair, and best premium splurge. 
Keep the most useful evidence visible in the browser by leaving open the 3 strongest YouTube review tabs, the 4 final product pages, and at least 2 retailer pages for the finalists so I can sanity-check prices and photos myself. Can you start by playing the Youtube video for the best rated one?End with a concise recommendation that tells me which single model you’d buy if you were optimizing for value, and which one you’d buy if money were less important.","website":"https://www.youtube.com","level":"hard","reference_length":3,"precomputed_rubric":{"items":[{"criterion":"The browsing session identifies exactly 6 YouTube video reviews or roundups about the best robot vacuums for 2026, with at least 3 coming from channels that appear to do hands-on testing rather than generic affiliate-style compilations.","description":"The browsing session identifies exactly 6 YouTube video reviews or roundups about the best robot vacuums for 2026, with at least 3 coming from channels that appear to do hands-on testing rather than generic affiliate-style compilations.\n\nHow a grader verifies this: Check that 6 distinct YouTube videos were selected and that the notes or final synthesis explicitly indicate which 3 or more were judged to be hands-on-testing sources.","max_points":160},{"criterion":"A candidate set of exactly 9 robot vacuums is built from the video findings, split into 3 budget, 3 mid-range, and 3 premium models.","description":"A candidate set of exactly 9 robot vacuums is built from the video findings, split into 3 budget, 3 mid-range, and 3 premium models.\n\nHow a grader verifies this: Check that the final comparison includes exactly 9 models and that each is assigned to one of the three requested price tiers with 3 models per tier.","max_points":170},{"criterion":"Each of the 9 candidates is verified on its official product page and on at least one major retailer listing, with live pricing captured where visible or marked 'not shown' when 
missing.","description":"Each of the 9 candidates is verified on its official product page and on at least one major retailer listing, with live pricing captured where visible or marked 'not shown' when missing.\n\nHow a grader verifies this: Check that every one of the 9 models has both an official-page reference and a retailer-page reference, and that the pricing field is filled with a visible price or the literal fallback 'not shown'.","max_points":190},{"criterion":"For each of the 9 robot vacuums, the comparison covers vacuuming performance, mopping approach if present, obstacle avoidance, auto-empty or dock features, battery/runtime claims, and replacement-parts or consumables availability on public pages.","description":"For each of the 9 robot vacuums, the comparison covers vacuuming performance, mopping approach if present, obstacle avoidance, auto-empty or dock features, battery/runtime claims, and replacement-parts or consumables availability on public pages.\n\nHow a grader verifies this: Check that all 9 entries include all six requested comparison dimensions, allowing 'not shown' only where a public page does not provide the information.","max_points":190},{"criterion":"The 9-model comparison is narrowed to a final shortlist of exactly 4 models labeled best overall, best value, best for pet hair, and best premium splurge.","description":"The 9-model comparison is narrowed to a final shortlist of exactly 4 models labeled best overall, best value, best for pet hair, and best premium splurge.\n\nHow a grader verifies this: Check that the final output contains exactly 4 finalists and that each is explicitly assigned to one of the four requested recommendation categories.","max_points":140},{"criterion":"Useful browser evidence is left open at the end: the 3 strongest YouTube review tabs, the 4 final product pages, and at least 2 retailer pages for the finalists, and the best-rated selected YouTube review video is started playing at the beginning of the 
task.","description":"Useful browser evidence is left open at the end: the 3 strongest YouTube review tabs, the 4 final product pages, and at least 2 retailer pages for the finalists, and the best-rated selected YouTube review video is started playing at the beginning of the task.\n\nHow a grader verifies this: Confirm that the specified tabs remain open and correspond to the chosen 3 review videos, 4 finalist official product pages, and 2 or more retailer listings for those finalists; one of the chosen YouTube evidence tabs is visibly the selected best-rated review and is playing.","max_points":150}]}} +{"task_id":"407381583456934981fbca0f1b91e4fa8a0883b2","confirmed_task":"I want to stop treating chess like a vague someday hobby and build myself a real beginner on-ramp that I could actually follow over the next month. Please use Lichess as the core hands-on tool, but don’t stay trapped on one site if a broader public-web comparison would help me learn more intelligently. Start by finding and comparing at least 8 beginner-friendly chess learning resources across public pages, with at least 3 of them coming from Lichess features or study pages and the rest coming from other reputable public resources like beginner lesson hubs, video-based instruction, or structured practice pages. For each one, figure out what it actually teaches a true novice, how interactive it is, and whether it seems more useful for rules, tactics, opening principles, endgames, or guided practice. Then on Lichess, actually complete the most foundational beginner material needed to understand legal moves, check and checkmate, basic tactics, and simple game play, and keep the key lesson or practice tabs open so I can see exactly what was used. 
After that, play exactly 3 practice games on public chess tools: 2 against the computer at beginner-friendly strength and 1 full game against a human if a public no-signup route is available, otherwise use a third computer game and note that the human option was not shown. While doing that, pay attention to the kinds of mistakes a novice is likely to make and use that to decide what I should practice first. Finally, give me one organized beginner study plan for my first 4 weeks that includes exactly 12 concrete practice items: 4 interactive lesson modules, 3 puzzle or tactics activities, 2 annotated game-watching resources, 2 practice-game habits to use every time I play, and 1 very simple opening principle guide. Recommend the best 5 tabs for me to keep open as my starter kit, leave those useful pages open, and explain why those 5 made the cut.","website":"https://lichess.org","level":"hard","reference_length":8,"precomputed_rubric":{"items":[{"criterion":"At least 8 beginner-friendly chess learning resources are compared on public pages, with at least 3 from Lichess and the rest from other reputable public resources, and each resource is evaluated for what it teaches and how useful it is for a true novice.","description":"At least 8 beginner-friendly chess learning resources are compared on public pages, with at least 3 from Lichess and the rest from other reputable public resources, and each resource is evaluated for what it teaches and how useful it is for a true novice.\n\nHow a grader verifies this: The final response lists 8 or more named resources, identifies which are from Lichess, and records for each the learning focus such as rules, tactics, opening principles, endgames, or guided practice, plus a brief usefulness judgment.","max_points":180},{"criterion":"The browsing session actually uses Lichess as the core hands-on tool by completing the foundational beginner material needed for legal moves, check and checkmate, basic tactics, and simple game play, 
with the key lesson or practice tabs kept open.","description":"The browsing session actually uses Lichess as the core hands-on tool by completing the foundational beginner material needed for legal moves, check and checkmate, basic tactics, and simple game play, with the key lesson or practice tabs kept open.\n\nHow a grader verifies this: The final response names the Lichess lessons or practice sections used for those four beginner topics and indicates that the key Lichess lesson/practice pages remain open.","max_points":200},{"criterion":"Exactly 3 practice games are played on public chess tools: 2 against the computer at beginner-friendly strength and 1 full human game if a public no-signup option is available, otherwise a third computer game with the limitation noted.","description":"Exactly 3 practice games are played on public chess tools: 2 against the computer at beginner-friendly strength and 1 full human game if a public no-signup option is available, otherwise a third computer game with the limitation noted.\n\nHow a grader verifies this: The final response reports exactly 3 practice games, specifies opponent type and beginner-friendly computer use, and clearly notes whether the human game was completed or replaced because a public no-signup route was not shown.","max_points":170},{"criterion":"The practice-game phase is used diagnostically by identifying the kinds of novice mistakes that showed up and using those observations to decide what should be practiced first.","description":"The practice-game phase is used diagnostically by identifying the kinds of novice mistakes that showed up and using those observations to decide what should be practiced first.\n\nHow a grader verifies this: The final response includes specific mistake patterns observed from the games and ties them to prioritized next-step practice recommendations.","max_points":140},{"criterion":"One organized 4-week beginner study plan is produced with exactly 12 concrete practice items: 
4 interactive lesson modules, 3 puzzle or tactics activities, 2 annotated game-watching resources, 2 practice-game habits, and 1 simple opening principle guide.","description":"One organized 4-week beginner study plan is produced with exactly 12 concrete practice items: 4 interactive lesson modules, 3 puzzle or tactics activities, 2 annotated game-watching resources, 2 practice-game habits, and 1 simple opening principle guide.\n\nHow a grader verifies this: The final plan is structured over 4 weeks and contains exactly 12 items in the requested category counts with no missing or extra items.","max_points":190},{"criterion":"The session ends with the best 5 tabs kept open as a starter kit, and the final response explains why those 5 pages were chosen.","description":"The session ends with the best 5 tabs kept open as a starter kit, and the final response explains why those 5 pages were chosen.\n\nHow a grader verifies this: The final response identifies exactly 5 kept-open tabs and gives a reason for each that matches its role in the beginner learning path.","max_points":120}]}} +{"task_id":"5bca23145f1a3e0eaf5d207c4fe3eb4275707ecc","confirmed_task":"I want to buy a starter tool kit soon, but I don’t want to get fooled by inflated piece counts or end up with a case full of filler. Please do a serious browser-based comparison of 12 all-in-one tool kits from at least 4 major brands across Lowe’s and other big public retailers, with a target budget of about $75 to $200, aimed at someone who needs one kit that can realistically handle apartment setup, basic home fixes, furniture assembly, and light car or bike work. For each kit, use the actual product pages and any available manuals or contents lists to note the real included tools, not just the headline piece count, and call out obvious gaps like missing adjustable wrench, locking pliers, socket depth variety, precision screwdrivers, hex keys, tape measure, or utility knife. 
Also compare warranty terms, case organization, whether replacement tools are easy to find in that brand ecosystem, current listed price, and whether the page suggests shipping or store pickup. Keep the strongest 4 product tabs open, plus at least 2 warranty or official brand-support pages that helped verify the choice. Then recommend exactly 3 finalists: best overall, best budget, and best compact kit, with a short explanation of who each is for. After that, go to Lowe’s and find the best Lowe’s-listed option that matches your top recommendation as closely as possible, add it to the cart, and leave the Lowe’s cart open along with the chosen product page and the key comparison tabs so I can review everything before deciding whether to check out.","website":"https://www.lowes.com","level":"hard","reference_length":5,"precomputed_rubric":{"items":[{"criterion":"The session compares exactly 12 all-in-one tool kits from at least 4 major brands using Lowe’s and other major public retailer product pages, within the stated target budget range of about $75 to $200.","description":"The session compares exactly 12 all-in-one tool kits from at least 4 major brands using Lowe’s and other major public retailer product pages, within the stated target budget range of about $75 to $200.\n\nHow a grader verifies this: Evidence in open product tabs and the final synthesis shows 12 kits total, at least 4 brands, and recorded listed prices within or near the requested budget frame.","max_points":180},{"criterion":"For each of the 12 kits, the comparison records the real included tools from product pages and/or manuals or contents lists, and explicitly calls out obvious gaps such as missing adjustable wrench, locking pliers, socket depth variety, precision screwdrivers, hex keys, tape measure, or utility knife.","description":"For each of the 12 kits, the comparison records the real included tools from product pages and/or manuals or contents lists, and explicitly calls out obvious 
gaps such as missing adjustable wrench, locking pliers, socket depth variety, precision screwdrivers, hex keys, tape measure, or utility knife.\n\nHow a grader verifies this: The final comparison contains per-kit contents notes and gap callouts grounded in official listings or manuals/contents pages.","max_points":220},{"criterion":"The comparison also covers warranty terms, case organization, replacement-tool ecosystem, current listed price, and whether shipping or store pickup appears available for the kits reviewed.","description":"The comparison also covers warranty terms, case organization, replacement-tool ecosystem, current listed price, and whether shipping or store pickup appears available for the kits reviewed.\n\nHow a grader verifies this: The final synthesis includes those fields for the compared kits, with supporting retailer and brand-support pages open where used.","max_points":170},{"criterion":"Exactly 3 finalists are recommended: best overall, best budget, and best compact kit, each with a short explanation of who it is for.","description":"Exactly 3 finalists are recommended: best overall, best budget, and best compact kit, each with a short explanation of who it is for.\n\nHow a grader verifies this: The final recommendation section contains exactly three labeled finalists matching the requested categories and explanations.","max_points":140},{"criterion":"The strongest 4 product tabs remain open, along with at least 2 warranty or official brand-support pages that were used to verify the choice.","description":"The strongest 4 product tabs remain open, along with at least 2 warranty or official brand-support pages that were used to verify the choice.\n\nHow a grader verifies this: Open browser tabs include four finalist product pages and two relevant warranty/support pages tied to the comparison.","max_points":140},{"criterion":"A Lowe’s-listed option that matches the top recommendation as closely as possible is identified, added to the Lowe’s 
cart, and the Lowe’s cart and chosen Lowe’s product page are left open at the end.","description":"A Lowe’s-listed option that matches the top recommendation as closely as possible is identified, added to the Lowe’s cart, and the Lowe’s cart and chosen Lowe’s product page are left open at the end.\n\nHow a grader verifies this: The final browser state shows the selected Lowe’s product page and a Lowe’s cart containing the chosen kit.","max_points":150}]}} +{"task_id":"1d20e51c4aa23eb68d37f1eaea56431140ae53c9","confirmed_task":"I’m thinking seriously about applying to Indiana University for graduate school, but I don’t want a one-program skim or a dead-end application portal. Please do a thorough public-web sweep across Indiana University’s official graduate program and department pages and help me narrow this down into a real shortlist I could use. I want you to identify 12 distinct IU graduate programs that are still publicly described on official pages, spread across at least 4 different schools or departments and, if possible, more than one IU campus. For each of the 12 programs, capture the degree type, the school or department, the campus, the main focus or specialization area, the published admissions requirements, the application deadline or note 'not shown' if it isn’t clearly posted, and any obvious funding or assistantship information if it is publicly listed. Then go one level deeper on the 6 strongest options by opening their program pages plus the most relevant admissions or deadline pages in separate tabs, and also check faculty, research, or curriculum pages so I can see what makes each program meaningfully different. After that, recommend the best 3 programs for a broadly motivated applicant who wants a strong academic fit, clear admissions information, and a realistic path to applying in the next cycle. 
In your final summary, explain why those 3 rose to the top, note any missing or ambiguous information that would require a direct inquiry, and leave the most useful official evidence tabs open for the final 6 programs so I can review them myself.","website":"https://iugraduate2025.cas.myliaison.com","level":"hard","reference_length":8,"precomputed_rubric":{"items":[{"criterion":"Exactly 12 distinct Indiana University graduate programs are identified from official public IU pages, covering at least 4 different schools or departments and, where publicly available, more than one IU campus.","description":"Exactly 12 distinct Indiana University graduate programs are identified from official public IU pages, covering at least 4 different schools or departments and, where publicly available, more than one IU campus.\n\nHow a grader verifies this: Check the final comparison to confirm there are 12 unique programs, each tied to an official IU source, and that the set spans at least 4 schools or departments; campus labels are recorded where shown.","max_points":180},{"criterion":"For each of the 12 programs, the summary includes the degree type, school or department, campus, main focus or specialization area, published admissions requirements, application deadline or 'not shown,' and any publicly listed funding or assistantship information.","description":"For each of the 12 programs, the summary includes the degree type, school or department, campus, main focus or specialization area, published admissions requirements, application deadline or 'not shown,' and any publicly listed funding or assistantship information.\n\nHow a grader verifies this: Review the final program-by-program summary and confirm that each of the requested fields is filled in or marked 'not shown' when absent on the public pages.","max_points":220},{"criterion":"The 6 strongest options are investigated more deeply using separate official program pages plus the most relevant admissions or deadline 
pages, with faculty, research, or curriculum pages also consulted for each of those 6 programs.","description":"The 6 strongest options are investigated more deeply using separate official program pages plus the most relevant admissions or deadline pages, with faculty, research, or curriculum pages also consulted for each of those 6 programs.\n\nHow a grader verifies this: Confirm that browser evidence exists for all 6 deeper-review programs and that each one has both core program/admissions evidence and at least one faculty, research, or curriculum page informing the comparison.","max_points":170},{"criterion":"The final comparison clearly explains what meaningfully differentiates the 6 deeper-review programs using faculty, research, or curriculum evidence rather than only repeating generic catalog descriptions.","description":"The final comparison clearly explains what meaningfully differentiates the 6 deeper-review programs using faculty, research, or curriculum evidence rather than only repeating generic catalog descriptions.\n\nHow a grader verifies this: Inspect the write-up for the 6 deeper-review programs and confirm that each has at least one concrete differentiator grounded in the consulted official pages.","max_points":150},{"criterion":"A final recommendation names exactly 3 programs as the best options for a broadly motivated applicant seeking strong academic fit, clear admissions information, and a realistic path to applying in the next cycle.","description":"A final recommendation names exactly 3 programs as the best options for a broadly motivated applicant seeking strong academic fit, clear admissions information, and a realistic path to applying in the next cycle.\n\nHow a grader verifies this: Check that exactly 3 programs are recommended and that the reasoning for each explicitly addresses academic fit, clarity of admissions information, and application realism.","max_points":140},{"criterion":"The final summary explicitly notes any missing, 
unclear, or ambiguous information that would require direct follow-up with the program or admissions office.","description":"The final summary explicitly notes any missing, unclear, or ambiguous information that would require direct follow-up with the program or admissions office.\n\nHow a grader verifies this: Confirm that the final summary contains a dedicated note on missing or ambiguous items, tied to the relevant programs rather than omitted silently.","max_points":70},{"criterion":"The most useful official evidence tabs for the final 6 programs are left open at the end so the user can review the program and admissions/deadline pages directly.","description":"The most useful official evidence tabs for the final 6 programs are left open at the end so the user can review the program and admissions/deadline pages directly.\n\nHow a grader verifies this: Check the final browser state for open official IU tabs corresponding to the 6 deeper-review programs, including their key program and admissions/deadline evidence pages.","max_points":70}]}} +{"task_id":"591e86ed962faf5eddbee560c99b020b1c835aaf","confirmed_task":"I’m organizing a 16-player golf day and I don’t just want a quick guess at the teams — I want you to use the browser to help me build a fair setup I could actually use. Please research reputable public golf sources to compare at least 6 different pages covering 4-person competition formats, handicap-allocation methods, and common scoring or tie-break rules for casual tournaments or outings. Focus on formats that could work cleanly for exactly four teams of four, and compare things like whether the format rewards low handicaps too heavily, how much handicap adjustment is usually recommended, and whether the scoring is simple enough for one-day recreational play. Then choose the single best format for this group and explain why it fits better than the alternatives. 
After that, use this exact player list to create exactly 4 teams of 4: Cody: 1, C: 3, Greg: 4, Johnny: 4, Dave: 4, Sal: 4, Lane: 5, Steve: 10, Christian: 11, Carl: 11, Rawley: 11, Gary: 13, Albert: 14, Eric: 16, Doric: 17, Marcus: 18. For the final team recommendation, show each team’s players, total handicap, average handicap, and handicap spread, and make the teams as balanced as practical for the chosen format. Also recommend one clear scoring method, one handicap allowance rule, and one tie-break procedure based on the public sources you found. Keep the most useful rules and format-comparison pages open in separate tabs — including the chosen format source and at least 2 credible alternative-format pages — so I can review the evidence afterward. After this, can you help me find a public golf course near Cleveland where this would be possible? Find 5 possible candidates, then open them up in separate tabs. When this is done, write up a CrpytoPad document with the teams and the possible golf course candidates, ranked by feasibility and quality.","website":"https://ww3.unipark.de","level":"hard","reference_length":15,"precomputed_rubric":{"items":[{"criterion":"At least 6 reputable public golf pages are researched and compared for 4-person competition formats, handicap-allocation methods, and common scoring or tie-break rules relevant to a 16-player outing.","description":"At least 6 reputable public golf pages are researched and compared for 4-person competition formats, handicap-allocation methods, and common scoring or tie-break rules relevant to a 16-player outing.\n\nHow a grader verifies this: The final output cites or clearly uses findings from 6 or more public golf rules, format, or tournament-guidance pages.","max_points":120},{"criterion":"The format comparison focuses on options that work cleanly for exactly four teams of four and discusses fairness across handicap levels, handicap-adjustment approach, and scoring simplicity for one-day recreational 
play.","description":"The format comparison focuses on options that work cleanly for exactly four teams of four and discusses fairness across handicap levels, handicap-adjustment approach, and scoring simplicity for one-day recreational play.\n\nHow a grader verifies this: The comparison explicitly addresses format fit for four teams of four and covers the requested fairness, handicap, and simplicity tradeoffs.","max_points":140},{"criterion":"One single best format is chosen for this exact group and is explained as a better fit than the alternatives reviewed.","description":"One single best format is chosen for this exact group and is explained as a better fit than the alternatives reviewed.\n\nHow a grader verifies this: The final recommendation names one format and gives a specific why-this-one explanation tied to the compared alternatives.","max_points":120},{"criterion":"Using the exact provided player list, the final recommendation creates exactly 4 teams of 4 and shows each team’s players, total handicap, average handicap, and handicap spread, with teams balanced as practically as possible for the chosen format.","description":"Using the exact provided player list, the final recommendation creates exactly 4 teams of 4 and shows each team’s players, total handicap, average handicap, and handicap spread, with teams balanced as practically as possible for the chosen format.\n\nHow a grader verifies this: All 16 named players appear exactly once across 4 teams, and each team entry includes the required handicap metrics.","max_points":180},{"criterion":"The final recommendation includes one clear scoring method, one handicap allowance rule, and one tie-break procedure based on the public-source research, and the key format-comparison pages remain open, including the chosen format source and at least 2 credible alternatives.","description":"The final recommendation includes one clear scoring method, one handicap allowance rule, and one tie-break procedure based on 
the public-source research, and the key format-comparison pages remain open, including the chosen format source and at least 2 credible alternatives.\n\nHow a grader verifies this: The closing recommendation specifies the requested rules and the browser keeps open the chosen-format page plus at least 2 alternative-format evidence pages.","max_points":140},{"criterion":"The browsing session identifies exactly 5 public golf course candidates near Cleveland where the outing looks plausible, opens those candidate pages in separate tabs, and compares them for feasibility and quality.","description":"The browsing session identifies exactly 5 public golf course candidates near Cleveland where the outing looks plausible, opens those candidate pages in separate tabs, and compares them for feasibility and quality.\n\nHow a grader verifies this: There are 5 distinct public golf course candidates near Cleveland in the final output, and their pages remain open in separate tabs with comparative feasibility/quality notes.","max_points":140},{"criterion":"A CryptoPad Document is created that includes the final team setup plus the 5 golf-course candidates ranked by feasibility and quality.","description":"A CryptoPad Document is created that includes the final team setup plus the 5 golf-course candidates ranked by feasibility and quality.\n\nHow a grader verifies this: The open CryptoPad Document contains both the team recommendations and a ranked course-candidate section using the requested feasibility/quality framing.","max_points":160}]}} +{"task_id":"6365c47591545e1a214a0eae70d5bb7421b21ed8","confirmed_task":"I’m trying to choose a consumer DNA test primarily for health predisposition insights, not just ancestry, and I want you to do a serious browser-based comparison so I can make one confident purchase instead of guessing from Amazon listings. 
Start by identifying 6 to 8 consumer DNA kits that publicly advertise health-related genetic reporting or health predisposition information, using official company product pages first and expanding beyond Amazon if that gives a clearer picture. For each kit, check the official product page and the company’s help, FAQ, or report-description pages to record exactly what kind of health information is included, such as predisposition reports, carrier status, wellness traits, pharmacogenomics, or anything else relevant; if something is not shown clearly, mark it as not shown rather than assuming. Then verify the medical framing on public authoritative pages where useful, like FDA or major health-system guidance pages, so the comparison distinguishes between true consumer risk reports and broader wellness-style claims. I also want the practical decision details that matter before buying: list price, whether there is a subscription or paid add-on, sample type, stated turnaround time, whether raw data download is offered, whether data deletion is described, and whether the company says customer data may be used for research or shared with partners. Open and keep the most important evidence tabs available as you work: at least 4 official kit pages, at least 3 privacy or data-control pages from different companies, and 2 authoritative public guidance pages that help interpret what these tests can and cannot tell me. After comparing everything, narrow it to the best 3 options for someone whose top priority is meaningful health predisposition reporting with reasonable privacy controls, and explain the tradeoffs between them. End by recommending exactly 1 kit as the best overall choice, 1 runner-up for stronger privacy sensitivity, and 1 budget-conscious alternative, citing which health-predisposition-related features each one actually offers and clearly noting any important limitations or not-shown details. 
Leave the final recommendation pages and the most useful comparison evidence tabs open. Create a presentation for me to read your analysis in on CrpytoPad Presentations.","website":"https://www.amazon.com","level":"hard","reference_length":9,"precomputed_rubric":{"items":[{"criterion":"The browsing session identifies 6 to 8 consumer DNA kits that publicly advertise health-related genetic reporting or health predisposition information, using official company product pages as the core evidence.","description":"The browsing session identifies 6 to 8 consumer DNA kits that publicly advertise health-related genetic reporting or health predisposition information, using official company product pages as the core evidence.\n\nHow a grader verifies this: Check that the final output names 6 to 8 kits and that each is supported by an official public product page or equivalent official kit page visited during the session.","max_points":180},{"criterion":"For each of the 6 to 8 kits, the comparison records what health-related information is included, such as predisposition reports, carrier status, wellness traits, pharmacogenomics, or other relevant report categories, with 'not shown' used where the public pages do not make this clear.","description":"For each of the 6 to 8 kits, the comparison records what health-related information is included, such as predisposition reports, carrier status, wellness traits, pharmacogenomics, or other relevant report categories, with 'not shown' used where the public pages do not make this clear.\n\nHow a grader verifies this: Review the final comparison and confirm every kit has report-scope details filled in from official product/help/report-description pages, with no unsupported assumptions where details are missing.","max_points":200},{"criterion":"The final comparison includes the practical pre-purchase details explicitly requested for each kit: list price, subscription or paid add-on status, sample type, stated turnaround time, raw data 
download availability, whether data deletion is described, and whether customer data may be used for research or shared with partners.","description":"The final comparison includes the practical pre-purchase details explicitly requested for each kit: list price, subscription or paid add-on status, sample type, stated turnaround time, raw data download availability, whether data deletion is described, and whether customer data may be used for research or shared with partners.\n\nHow a grader verifies this: Check that each kit entry contains all requested decision fields or 'not shown' where needed, based on public company pages such as product, FAQ, terms, or privacy pages.","max_points":200},{"criterion":"The session uses authoritative public guidance where useful to distinguish more meaningful health predisposition reporting from broader wellness-style claims, including 2 authoritative public guidance pages kept as evidence.","description":"The session uses authoritative public guidance where useful to distinguish more meaningful health predisposition reporting from broader wellness-style claims, including 2 authoritative public guidance pages kept as evidence.\n\nHow a grader verifies this: Confirm that 2 authoritative public guidance pages were opened and that the final write-up uses them to explain limits or interpretation differences relevant to consumer genetic health testing.","max_points":120},{"criterion":"Key browser evidence remains visible: at least 4 official kit pages, at least 3 privacy or data-control pages from different companies, and 2 authoritative public guidance pages are opened and kept available.","description":"Key browser evidence remains visible: at least 4 official kit pages, at least 3 privacy or data-control pages from different companies, and 2 authoritative public guidance pages are opened and kept available.\n\nHow a grader verifies this: Inspect the open tabs or recorded evidence set and confirm the required counts and source types 
are present and correspond to the products discussed.","max_points":120},{"criterion":"The work narrows the field to the best 3 options for someone prioritizing meaningful health predisposition reporting with reasonable privacy controls, and explains the tradeoffs between those 3 options.","description":"The work narrows the field to the best 3 options for someone prioritizing meaningful health predisposition reporting with reasonable privacy controls, and explains the tradeoffs between those 3 options.\n\nHow a grader verifies this: Check that exactly 3 finalists are presented and that each finalist includes a clear tradeoff explanation tied to report scope, privacy controls, and other compared factors.","max_points":100},{"criterion":"The final recommendation names exactly 1 best overall kit, 1 runner-up for stronger privacy sensitivity, and 1 budget-conscious alternative, and cites which health-predisposition-related features each one actually offers while noting important limitations or not-shown details, and a CryptoPad Presentation is also created so the analysis can be read through as a presentation.","description":"The final recommendation names exactly 1 best overall kit, 1 runner-up for stronger privacy sensitivity, and 1 budget-conscious alternative, and cites which health-predisposition-related features each one actually offers while noting important limitations or not-shown details, and a CryptoPad Presentation is also created so the analysis can be read through as a presentation.\n\nHow a grader verifies this: Confirm the conclusion contains exactly these 3 recommendation roles and that each recommendation is justified with feature-specific evidence plus any relevant caveats; an open CryptoPad Presentation summarizing the comparison and recommendation is available for review.","max_points":80}]}} +{"task_id":"7e31dc76193fea7e052767ce6ccd65ecb9f2628d","confirmed_task":"I’m trying to buy a Nvidia GeForce RTX 3060 Ti in the UK without getting burned on a 
sketchy used listing, and I don’t want just the single cheapest result. Please do a proper browser-based buying sweep and help me figure out the best-value option under a hard cap of £500. Start on eBay UK and find 12 to 15 live RTX 3060 Ti listings that look plausibly fully functional, then narrow that set by checking the title, condition notes, photos, seller feedback, return policy, shipping cost, and whether the listing clearly appears to be for an actual Nvidia GeForce RTX 3060 Ti rather than a different card, broken card, parts-only listing, or ambiguous rebadge. Keep the strongest 4 eBay listing tabs open so I can review them later. After that, check at least 4 public UK alternatives such as refurbished or second-hand GPU pages from major retailers or marketplaces like CeX, Overclockers UK, Scan, Facebook Marketplace listings if publicly viewable, or other public UK computer parts sites, so we can see whether eBay is genuinely the best value or just the cheapest-looking option. Compare all viable options on total price, condition, seller trust, return protection, and any obvious risk flags from the photos or description. Then give me a final ranked shortlist of exactly 5 options, with one clear best pick and one safer-but-not-cheapest pick, and explain why. Leave the final shortlist pages open, including the best 3 eBay options and the 2 most relevant non-eBay comparison pages. 
Fill out a CrpytoPad Spreadsheet with all the options in an easily parsible manner.","website":"https://apply-worker-ad097d28ee25.herokuapp.com","level":"hard","reference_length":27,"precomputed_rubric":{"items":[{"criterion":"The browsing session identifies 12 to 15 live eBay UK RTX 3060 Ti listings under the user’s £500 cap and evaluates whether each is plausibly a fully functional Nvidia GeForce RTX 3060 Ti rather than a broken, parts-only, wrong-model, or ambiguous listing.","description":"The browsing session identifies 12 to 15 live eBay UK RTX 3060 Ti listings under the user’s £500 cap and evaluates whether each is plausibly a fully functional Nvidia GeForce RTX 3060 Ti rather than a broken, parts-only, wrong-model, or ambiguous listing.\n\nHow a grader verifies this: Check that the final response references 12 to 15 distinct eBay UK listings and records screening judgments tied to listing title, condition, or description details showing they were actively evaluated against the stated functional and model constraints.","max_points":200},{"criterion":"For the eBay sweep, the task checks the title, condition notes, photos, seller feedback, return policy, shipping cost, and listing clarity for the candidate listings.","description":"For the eBay sweep, the task checks the title, condition notes, photos, seller feedback, return policy, shipping cost, and listing clarity for the candidate listings.\n\nHow a grader verifies this: Confirm the comparison notes explicitly cover those requested fields for the viable eBay candidates, with no major requested factor omitted from the assessment.","max_points":160},{"criterion":"The agent keeps the strongest 4 eBay listing tabs open for later review.","description":"The agent keeps the strongest 4 eBay listing tabs open for later review.\n\nHow a grader verifies this: Verify that 4 eBay listing pages remain open at the end and that they correspond to listings described as the strongest or most viable eBay 
options.","max_points":120},{"criterion":"The session checks at least 4 public UK non-eBay alternatives, such as refurbished or second-hand GPU pages from major retailers or marketplaces, to test whether eBay is actually the best value.","description":"The session checks at least 4 public UK non-eBay alternatives, such as refurbished or second-hand GPU pages from major retailers or marketplaces, to test whether eBay is actually the best value.\n\nHow a grader verifies this: Confirm that at least 4 distinct public UK comparison pages outside eBay were opened and used in the analysis, and that they are relevant alternatives for buying an RTX 3060 Ti or closely comparable option within the same buying decision.","max_points":160},{"criterion":"All viable options are compared on total price, condition, seller trust, return protection, and any obvious risk flags from the photos or description.","description":"All viable options are compared on total price, condition, seller trust, return protection, and any obvious risk flags from the photos or description.\n\nHow a grader verifies this: Check that the final synthesis compares viable options using each of those requested dimensions, including explicit mention of risk flags where present or a clear note when none are obvious.","max_points":160},{"criterion":"The final output gives a ranked shortlist of exactly 5 options, including one clear best pick and one safer-but-not-cheapest pick, with explanations.","description":"The final output gives a ranked shortlist of exactly 5 options, including one clear best pick and one safer-but-not-cheapest pick, with explanations.\n\nHow a grader verifies this: Verify that exactly 5 options are ranked in the final answer and that the response explicitly identifies both the overall best pick and the safer-but-not-cheapest pick with reasons tied to the comparison criteria.","max_points":120},{"criterion":"The final shortlist pages are left open, including the best 3 eBay options and 
the 2 most relevant non-eBay comparison pages, and all compared options are also recorded in a CryptoPad Spreadsheet in an easily parsable format.","description":"The final shortlist pages are left open, including the best 3 eBay options and the 2 most relevant non-eBay comparison pages, and all compared options are also recorded in a CryptoPad Spreadsheet in an easily parsable format.\n\nHow a grader verifies this: Confirm that 5 final evidence tabs remain open at the end: 3 eBay shortlist pages and 2 non-eBay comparison pages that were actually used to support the recommendation; an open CryptoPad Spreadsheet captures the compared options in a parsable comparison format.","max_points":80}]}} +{"task_id":"9dd06f1da59b2d84a9852960561309e02b343476","confirmed_task":"I need help doing a serious browser-based gift search for a USC Trojans fan, not just grabbing the first sale item I see. Start on Fanatics and use its USC Trojans sale section as the anchor, but then widen naturally to other public retailers that carry licensed USC gear if that gives us better options. I want a final shortlist of exactly 12 gift options total, spread across 4 recipient types: 3 gifts for a student, 3 for an alum, 3 for a parent, and 3 for a general fan. Keep the budget practical by making sure each recipient group includes one option under $25, one from $25 to $60, and one over $60, using “not shown” if a size or variant price is unclear. For every gift, capture the product name, current listed price, whether it is marked as a sale item, the retailer, product type, any notable quality signal from the page such as brand or material, estimated shipping timing if publicly shown, and the return-policy summary from that retailer’s public pages. While you work, keep key product tabs open for at least the best 2 options in each recipient group, plus the return-policy or shipping page for each retailer you actually use, so I can review the evidence. 
At the end, recommend exactly 4 winners: best budget gift overall, best apparel gift, best home or office gift, and best premium gift, with a short explanation of why each one beat the alternatives.","website":"https://www.fanatics.com","level":"hard","reference_length":3,"precomputed_rubric":{"items":[{"criterion":"The final shortlist contains exactly 12 USC Trojans gift options total, with 3 gifts each for a student, an alum, a parent, and a general fan.","description":"The final shortlist contains exactly 12 USC Trojans gift options total, with 3 gifts each for a student, an alum, a parent, and a general fan.\n\nHow a grader verifies this: Check the final organized results and confirm there are exactly 12 entries and that each of the 4 recipient types has exactly 3 gifts assigned.","max_points":180},{"criterion":"Within each recipient group, the 3 gifts follow the requested budget spread: one under $25, one from $25 to $60, and one over $60.","description":"Within each recipient group, the 3 gifts follow the requested budget spread: one under $25, one from $25 to $60, and one over $60.\n\nHow a grader verifies this: Review the recorded prices for all 12 gifts and confirm that each recipient group matches the exact three-tier budget structure.","max_points":180},{"criterion":"Each gift entry includes the requested shopping details: product name, current listed price, whether it is marked as a sale item, retailer, product type, notable quality signal, estimated shipping timing if publicly shown, and the retailer’s return-policy summary, using 'not shown' where needed.","description":"Each gift entry includes the requested shopping details: product name, current listed price, whether it is marked as a sale item, retailer, product type, notable quality signal, estimated shipping timing if publicly shown, and the retailer’s return-policy summary, using 'not shown' where needed.\n\nHow a grader verifies this: Inspect each shortlist entry and confirm all required 
fields are present, with 'not shown' used only when the public page does not provide the field.","max_points":200},{"criterion":"The browsing session starts from Fanatics’ USC Trojans sale context and then broadens to other public retailers only as needed to build a stronger USC gift shortlist.","description":"The browsing session starts from Fanatics’ USC Trojans sale context and then broadens to other public retailers only as needed to build a stronger USC gift shortlist.\n\nHow a grader verifies this: Confirm that Fanatics USC sale listings were used as an anchor source and that any additional retailers included are public USC merchandise pages relevant to the gift comparison.","max_points":120},{"criterion":"Key browser evidence is left open: at least the best 2 product tabs for each of the 4 recipient groups, plus the return-policy or shipping page for each retailer actually used.","description":"Key browser evidence is left open: at least the best 2 product tabs for each of the 4 recipient groups, plus the return-policy or shipping page for each retailer actually used.\n\nHow a grader verifies this: Count the open tabs at the end and confirm there are product pages left open for 8 shortlisted gifts total across the recipient groups, along with policy or shipping pages for each participating retailer.","max_points":140},{"criterion":"The final recommendation identifies exactly 4 winners: best budget gift overall, best apparel gift, best home or office gift, and best premium gift, each with a short reason it beat the alternatives.","description":"The final recommendation identifies exactly 4 winners: best budget gift overall, best apparel gift, best home or office gift, and best premium gift, each with a short reason it beat the alternatives.\n\nHow a grader verifies this: Check the closing synthesis and confirm there are exactly 4 named winners matching the requested categories, each with a comparative justification.","max_points":180}]}} 
+{"task_id":"8956c9188720c23c45bbb0a5028ce5d61ddb6648","confirmed_task":"I’m trying to book a future trip to Manila and I don’t want to be fooled by one headline fare that turns ugly once the details matter. Please use Google Flights to do a serious Bay Area-to-Manila comparison for an evening departure, keeping the original spirit of max 1 stop, but broaden it into a real booking decision. Check 3 departure airports — SFO, OAK, and SJC — and compare 4 evening departure dates in the same future travel window, so I end up with 12 total candidate searches. For each search, identify the cheapest itinerary Google Flights shows that still respects the evening-departure preference and the maximum-1-stop rule, and note the price, airline, departure time, arrival time, layover airport, and total trip duration. Then narrow those 12 down to the 5 strongest options overall, not just the 5 absolute cheapest, by weighing price against total travel time and whether the layover looks reasonable. For those 5 finalists, open the airline’s own public booking or fare-details pages when possible and verify the practical catch points: whether the fare appears bookable there, the basic carry-on and first checked-bag situation if shown, and any obvious change/refund restrictions or fare-class limitations shown on public pages; if something is not shown, say 'not shown.' Keep the most useful Google Flights result tabs open for the final 5 options, plus at least 2 airline fare or policy pages that help explain the tradeoffs. 
At the end, give me one clear recommendation for the best value option, one backup that is the absolute cheapest, and one backup that is the least painful in total travel time, with a short explanation of why each earned that slot.","website":"https://www.google.com","level":"hard","reference_length":7,"precomputed_rubric":{"items":[{"criterion":"The session compares exactly 12 total candidate searches covering 3 departure airports — SFO, OAK, and SJC — across 4 evening departure dates in the same future travel window, all with a maximum of 1 stop.","description":"The session compares exactly 12 total candidate searches covering 3 departure airports — SFO, OAK, and SJC — across 4 evening departure dates in the same future travel window, all with a maximum of 1 stop.\n\nHow a grader verifies this: Final output lists 12 candidate search results and each one is traceable to a Google Flights search matching one of the 3 airports and one of the 4 dates, with the max-1-stop and evening-departure framing applied.","max_points":180},{"criterion":"For each of the 12 candidate searches, the cheapest qualifying Google Flights itinerary is recorded with price, airline, departure time, arrival time, layover airport, and total trip duration.","description":"For each of the 12 candidate searches, the cheapest qualifying Google Flights itinerary is recorded with price, airline, departure time, arrival time, layover airport, and total trip duration.\n\nHow a grader verifies this: All 12 entries include those six fields, using 'not shown' only where a field is genuinely unavailable on the public page.","max_points":180},{"criterion":"The 12 candidates are narrowed to exactly 5 strongest options overall using a stated comparison of price, total travel time, and layover reasonableness rather than price alone.","description":"The 12 candidates are narrowed to exactly 5 strongest options overall using a stated comparison of price, total travel time, and layover reasonableness rather 
than price alone.\n\nHow a grader verifies this: Final synthesis explicitly identifies 5 finalists and explains the comparison logic in terms of the three requested tradeoff factors.","max_points":180},{"criterion":"For each of the 5 finalists, public airline booking or fare-details pages are checked when possible to verify whether the fare appears bookable there and to capture the basic carry-on situation, first checked-bag situation, and any obvious change/refund restrictions or fare-class limitations shown; missing details may be marked 'not shown.'","description":"For each of the 5 finalists, public airline booking or fare-details pages are checked when possible to verify whether the fare appears bookable there and to capture the basic carry-on situation, first checked-bag situation, and any obvious change/refund restrictions or fare-class limitations shown; missing details may be marked 'not shown.'\n\nHow a grader verifies this: Each of the 5 finalists includes those verification notes, and the notes are grounded in airline public pages when available rather than only in Google Flights.","max_points":180},{"criterion":"The browser evidence is left in a useful state: the most useful Google Flights result tabs remain open for the final 5 options, plus at least 2 airline fare or policy pages that explain the tradeoffs.","description":"The browser evidence is left in a useful state: the most useful Google Flights result tabs remain open for the final 5 options, plus at least 2 airline fare or policy pages that explain the tradeoffs.\n\nHow a grader verifies this: Open tabs at the end visibly include Google Flights pages for the 5 finalists and at least 2 relevant airline fare or policy pages.","max_points":140},{"criterion":"The final recommendation gives exactly 3 named outcomes: one best value option, one backup that is the absolute cheapest, and one backup that is the least painful in total travel time, each with a short explanation.","description":"The final 
recommendation gives exactly 3 named outcomes: one best value option, one backup that is the absolute cheapest, and one backup that is the least painful in total travel time, each with a short explanation.\n\nHow a grader verifies this: Final response clearly labels all 3 outcomes and explains why each earned its slot based on the comparison work.","max_points":140}]}} +{"task_id":"2ab5a542b36b238ca0b84b812d7846e38e99aa13","confirmed_task":"I’m getting a poetry packet ready and I don’t just want one isolated guideline check — I want a real submission plan built around Margie / The American Journal of Poetry as one of my targets. Please start by finding the official submission page for Margie / The American Journal of Poetry, confirm the submission method, any fee, and whether there’s an active reading period or any stated window, and keep that official page open. Then build me a serious shortlist of 10 poetry journals total, including Margie / The American Journal of Poetry plus 9 comparable U.S. literary journals that are publicly open enough to evaluate from their websites. For each of the 10 journals, use the official journal site or official submissions page to capture the submission method, fee or no-fee status, reading period or 'not shown', simultaneous-submission policy if stated, and any obvious packet limits like number of poems or page count if stated. After that, open publicly viewable recent-poem, archive, or current-issue pages for at least 6 of the 10 journals so you can compare aesthetic fit rather than just logistics, and keep the strongest evidence tabs open. I want you to synthesize all of this into one browser-based final recommendation that ranks the 10 journals into three buckets for this packet: top 4 best first-wave submissions, next 3 worth sending if the first wave doesn’t land, and bottom 3 lower-priority or situational targets. 
In the final synthesis, explain the ranking using both logistics and editorial fit, call out which journals seem best for a more literary/traditional voice versus a more experimental voice when that can be inferred from public samples, and clearly mark anything as 'not shown' when the site doesn’t say. Leave the key official guidelines pages and the most useful sample-poem pages open at the end so I can review the evidence myself.","website":"https://www.google.com","level":"hard","reference_length":3,"precomputed_rubric":{"items":[{"criterion":"The browsing session finds and uses the official submission page for Margie / The American Journal of Poetry, confirms the submission method, any fee, and the active reading period or stated submission window if shown, and leaves that official page open.","description":"The browsing session finds and uses the official submission page for Margie / The American Journal of Poetry, confirms the submission method, any fee, and the active reading period or stated submission window if shown, and leaves that official page open.\n\nHow a grader verifies this: Check that the final result explicitly records those three fields for Margie / The American Journal of Poetry (AJP) from an official page and that an AJP official guidelines tab remains open.","max_points":180},{"criterion":"A total of exactly 10 poetry journals are evaluated, consisting of Margie / The American Journal of Poetry plus 9 comparable U.S. literary journals that can be assessed from public pages.","description":"A total of exactly 10 poetry journals are evaluated, consisting of Margie / The American Journal of Poetry plus 9 comparable U.S. 
literary journals that can be assessed from public pages.\n\nHow a grader verifies this: Count the journals in the final comparison and confirm the set includes Margie / The American Journal of Poetry plus 9 others, with no duplicates.","max_points":160},{"criterion":"For each of the 10 journals, the final comparison records the submission method, fee or no-fee status, reading period or 'not shown', simultaneous-submission policy if stated, and any obvious packet limits such as number of poems or page count if stated, using official journal or official submissions pages.","description":"For each of the 10 journals, the final comparison records the submission method, fee or no-fee status, reading period or 'not shown', simultaneous-submission policy if stated, and any obvious packet limits such as number of poems or page count if stated, using official journal or official submissions pages.\n\nHow a grader verifies this: Inspect the completed comparison and confirm all requested fields are present for all 10 journals, with 'not shown' used where needed and the information sourced from official pages.","max_points":220},{"criterion":"Publicly viewable recent-poem, archive, or current-issue pages are opened for at least 6 of the 10 journals so aesthetic fit can be compared, and the strongest evidence tabs are kept open.","description":"Publicly viewable recent-poem, archive, or current-issue pages are opened for at least 6 of the 10 journals so aesthetic fit can be compared, and the strongest evidence tabs are kept open.\n\nHow a grader verifies this: Check that at least 6 journals have public sample-work evidence consulted and that multiple relevant sample-poem or archive tabs remain open at the end.","max_points":140},{"criterion":"The final synthesis ranks all 10 journals into exactly three buckets: top 4 best first-wave submissions, next 3 worth sending if the first wave does not land, and bottom 3 lower-priority or situational targets.","description":"The final synthesis ranks all 10 journals into 
exactly three buckets: top 4 best first-wave submissions, next 3 worth sending if the first wave does not land, and bottom 3 lower-priority or situational targets.\n\nHow a grader verifies this: Confirm the final recommendation contains all 10 journals assigned once each into the specified 4/3/3 bucket structure.","max_points":160},{"criterion":"The ranking explanation explicitly uses both logistics and editorial fit, and it calls out which journals seem better suited to a more literary/traditional voice versus a more experimental voice whenever that can be inferred from public samples.","description":"The ranking explanation explicitly uses both logistics and editorial fit, and it calls out which journals seem better suited to a more literary/traditional voice versus a more experimental voice whenever that can be inferred from public samples.\n\nHow a grader verifies this: Review the written rationale for the rankings and confirm it discusses both operational submission factors and sample-based fit, including the traditional-versus-experimental distinction where inferable.","max_points":140}]}} +{"task_id":"341d7a70562b121c9702aa8845d51e0843575170","confirmed_task":"I’m seriously thinking about getting a dog, but I want a thorough browser-based reality check on whether dog ownership is likely to make me healthier, sicker, or a mix of both depending on the situation. Please treat this as a real decision, not just a quick fact check. Start by finding current public guidance from authoritative sources like the CDC, NIH or NLM/PubMed, major allergy organizations, and major medical systems, and build a balanced evidence base across at least 12 public pages total. 
I want this broken into five explicit questions: zoonotic or household infection risk, allergies and asthma risk, whether childhood exposure to dogs seems protective or harmful, claimed adult health benefits such as mental-health or cardiovascular effects, and what practical steps actually reduce risk if someone does own a dog. For each question, compare what the stronger sources say, note where the evidence looks consistent versus mixed, and distinguish clearly between evidence for healthy adults, young children, older adults, pregnant people, and immunocompromised households whenever a source addresses those groups. Also check at least 3 public discussion threads or Q&A-style pages, including Reddit if useful, to capture what people commonly believe or worry about, and explicitly compare those claims against the stronger medical or research sources. Keep the most useful evidence tabs open, including at least 2 official guidance pages, at least 2 research-review or PubMed-style pages, and 2 discussion pages that show common public concerns. 
Finish with one organized decision memo on-page, not a spreadsheet, that gives me: a bottom-line verdict on whether owning a dog increases, decreases, or mixes illness risk overall; a short section on who should be most cautious; a short section on who is probably overestimating the risk; and a practical checklist of at least 8 steps that would make dog ownership safer if I decide to go ahead.","website":"https://www.reddit.com","level":"hard","reference_length":6,"precomputed_rubric":{"items":[{"criterion":"The browsing session uses at least 12 public pages total and includes current authoritative sources from categories explicitly requested in the prompt: official guidance, research-review or PubMed-style material, and public discussion or Q&A-style pages.","description":"The browsing session uses at least 12 public pages total and includes current authoritative sources from categories explicitly requested in the prompt: official guidance, research-review or PubMed-style material, and public discussion or Q&A-style pages.\n\nHow a grader verifies this: Count the pages used and confirm the source mix includes authoritative guidance pages, research-review/PubMed-style pages, and at least 3 discussion or Q&A-style pages.","max_points":160},{"criterion":"The final memo is organized around the 5 explicit questions requested: zoonotic or household infection risk, allergies and asthma risk, childhood exposure effects, claimed adult health benefits, and practical risk-reduction steps.","description":"The final memo is organized around the 5 explicit questions requested: zoonotic or household infection risk, allergies and asthma risk, childhood exposure effects, claimed adult health benefits, and practical risk-reduction steps.\n\nHow a grader verifies this: Inspect the final memo and confirm that all 5 named sections are present and substantively completed.","max_points":160},{"criterion":"For each of the 5 questions, the memo compares what the stronger sources say and 
clearly identifies where the evidence appears consistent versus mixed.","description":"For each of the 5 questions, the memo compares what the stronger sources say and clearly identifies where the evidence appears consistent versus mixed.\n\nHow a grader verifies this: Check each of the 5 sections for comparative synthesis across multiple sources and an explicit statement of whether the evidence is consistent, mixed, or uncertain.","max_points":180},{"criterion":"The memo explicitly distinguishes evidence for the household groups named in the prompt whenever sources address them: healthy adults, young children, older adults, pregnant people, and immunocompromised households.","description":"The memo explicitly distinguishes evidence for the household groups named in the prompt whenever sources address them: healthy adults, young children, older adults, pregnant people, and immunocompromised households.\n\nHow a grader verifies this: Review the memo for group-specific treatment and confirm that these five groups are addressed where supported by the gathered sources rather than collapsed into one generic conclusion.","max_points":160},{"criterion":"At least 3 public discussion threads or Q&A-style pages are checked, and their common claims or worries are explicitly compared against stronger medical or research sources.","description":"At least 3 public discussion threads or Q&A-style pages are checked, and their common claims or worries are explicitly compared against stronger medical or research sources.\n\nHow a grader verifies this: Confirm that at least 3 discussion/Q&A pages were consulted and that the final memo directly contrasts those public beliefs with higher-authority evidence.","max_points":120},{"criterion":"Useful browser evidence is left visible at the end, including at least 2 official guidance pages, at least 2 research-review or PubMed-style pages, and 2 discussion pages showing common concerns.","description":"Useful browser evidence is left 
visible at the end, including at least 2 official guidance pages, at least 2 research-review or PubMed-style pages, and 2 discussion pages showing common concerns.\n\nHow a grader verifies this: Inspect the remaining open tabs and confirm the required counts and categories match the prompt.","max_points":100},{"criterion":"The final on-page decision memo includes all requested deliverables: a bottom-line verdict on whether owning a dog increases, decreases, or mixes illness risk overall; a section on who should be most cautious; a section on who is probably overestimating the risk; and a practical checklist of at least 8 steps to make dog ownership safer.","description":"The final on-page decision memo includes all requested deliverables: a bottom-line verdict on whether owning a dog increases, decreases, or mixes illness risk overall; a section on who should be most cautious; a section on who is probably overestimating the risk; and a practical checklist of at least 8 steps to make dog ownership safer.\n\nHow a grader verifies this: Review the final memo for the requested verdict, the two audience-specific sections, and a checklist with at least 8 concrete safety steps.","max_points":120}]}} +{"task_id":"73f7a6bce89de66f106a669c85c7908331d3d1b7","confirmed_task":"I’m seriously considering a future backpacking trip through China and want a version of the research that I could actually use, not just a generic average. Please build me a realistic backpacker budget across exactly 8 major tourist destinations in China, and make it structured enough that I could use it to decide whether the trip is affordable and which route makes the most sense. Start by identifying 8 widely recognized stops that would make a coherent backpacking route for a first-time visitor, then compare each one using public pages for hostels or budget hotels, food options, and major paid activities or sights. 
For every destination, record a typical nightly budget stay price, a low-cost daily food estimate, and 2 to 3 representative activity costs, using 'not shown' if a field truly is not available. Then figure out the most practical budget-minded way to travel between each stop, with an estimated fare and rough travel time, comparing trains, buses, or flights when that actually matters. Please open and keep useful evidence tabs along the way, including a few lodging pages with photos, a few attraction or ticket pages, and map pages for representative cities so I can visually sanity-check the route. Put the final result into one organized planning document that lists all 8 destinations in route order, the per-day and per-stop cost assumptions, the intercity transport costs, and a realistic total for the full trip. At the end, leave the finished document open along with the most useful lodging, transport, and map tabs so I can review the evidence.","website":"https://www.novo-monde.com","level":"hard","reference_length":6,"precomputed_rubric":{"items":[{"criterion":"The final plan identifies exactly 8 major tourist destinations in China and presents them in one coherent backpacking route for a first-time visitor.","description":"The final plan identifies exactly 8 major tourist destinations in China and presents them in one coherent backpacking route for a first-time visitor.\n\nHow a grader verifies this: Check the finished document for exactly 8 destination entries in route order, with no extra or missing stops.","max_points":180},{"criterion":"Each of the 8 destinations includes a typical nightly budget stay price, a low-cost daily food estimate, and 2 to 3 representative paid activity costs, using 'not shown' where necessary.","description":"Each of the 8 destinations includes a typical nightly budget stay price, a low-cost daily food estimate, and 2 to 3 representative paid activity costs, using 'not shown' where necessary.\n\nHow a grader verifies this: Review each 
destination entry in the document and confirm all requested cost fields are present for all 8 stops.","max_points":220},{"criterion":"The plan includes the most practical budget-minded transport choice between every consecutive stop, with an estimated fare and rough travel time, comparing trains, buses, or flights when that matters.","description":"The plan includes the most practical budget-minded transport choice between every consecutive stop, with an estimated fare and rough travel time, comparing trains, buses, or flights when that matters.\n\nHow a grader verifies this: Count the intercity legs in the document and confirm each leg has a chosen mode, estimated cost, and approximate duration.","max_points":180},{"criterion":"Useful browser evidence is kept open, including representative lodging pages with photos, attraction or ticket pages, and map pages for representative cities.","description":"Useful browser evidence is kept open, including representative lodging pages with photos, attraction or ticket pages, and map pages for representative cities.\n\nHow a grader verifies this: Inspect the open tabs at the end and confirm that lodging, attraction/ticket, and map evidence tabs are still available for review.","max_points":140},{"criterion":"The final planning document lists all 8 destinations in route order and includes the per-day and per-stop cost assumptions, plus the intercity transport costs.","description":"The final planning document lists all 8 destinations in route order and includes the per-day and per-stop cost assumptions, plus the intercity transport costs.\n\nHow a grader verifies this: Open the final document and confirm that destination order, daily assumptions, stop-level costs, and transport costs are all clearly organized.","max_points":160},{"criterion":"The finished result provides a realistic total estimated cost for the full trip and leaves the completed document open along with the most useful lodging, transport, and map 
tabs.","description":"The finished result provides a realistic total estimated cost for the full trip and leaves the completed document open along with the most useful lodging, transport, and map tabs.\n\nHow a grader verifies this: Confirm the document contains a full-trip total and that the final browser state still has the document and key evidence tabs open.","max_points":120}]}} +{"task_id":"6984e84f7f80fd923126e82006bbfaeeeec5b5d2","confirmed_task":"I’m trying to figure out the best realistic 'beer money' setup for someone in the UK, because I already know about Prolific and CloudResearch Connect but I don’t want to keep wasting time on sites that look good until you dig into them. Please do a serious public-web comparison of 12 to 15 UK-accessible platforms across the most relevant categories: survey/research panels, user-testing or interview platforms, cashback or receipt-reward apps, and general microtask sites. Start by checking the official public pages for each platform and confirm whether UK participants are accepted, what the platform mainly pays for, the payout method, the cash-out threshold, and any obvious waitlist, invite-only, or identity-check friction; if a field is not shown publicly, record it as 'not shown' instead of guessing. Then cross-check each platform with at least one independent public reputation source such as Trustpilot, Reddit discussions, app-store listings, or similar public review pages so I can tell the difference between 'legit but slow' and 'avoid this.' I want the final result in one comparison sheet with one row per platform and columns for platform name, category, UK eligibility status, task type, payout method, minimum cash-out, notable restrictions, public reputation signal, likely best use case, and your verdict. After that, rank the best 8 platforms for me in signup order: first the ones most worth applying to immediately, then the ones that are only worth using as filler. 
Keep the most useful evidence tabs open, including at least 6 official platform pages and at least 4 independent reputation or review pages, and end with the final comparison sheet open plus a short recommendation on whether I should mostly stick with Prolific and Connect or build a broader UK stack.","website":"https://connect.cloudresearch.com","level":"hard","reference_length":3,"precomputed_rubric":{"items":[{"criterion":"A comparison sheet exists and is left open at the end, with one row per platform for 12 to 15 UK-accessible platforms and columns for platform name, category, UK eligibility status, task type, payout method, minimum cash-out, notable restrictions, public reputation signal, likely best use case, and verdict.","description":"A comparison sheet exists and is left open at the end, with one row per platform for 12 to 15 UK-accessible platforms and columns for platform name, category, UK eligibility status, task type, payout method, minimum cash-out, notable restrictions, public reputation signal, likely best use case, and verdict.\n\nHow a grader verifies this: Check that the open sheet contains 12 to 15 platform rows and all explicitly requested columns, using 'not shown' where public information was unavailable.","max_points":200},{"criterion":"The platform set covers the requested categories: survey/research panels, user-testing or interview platforms, cashback or receipt-reward apps, and general microtask sites.","description":"The platform set covers the requested categories: survey/research panels, user-testing or interview platforms, cashback or receipt-reward apps, and general microtask sites.\n\nHow a grader verifies this: Inspect the sheet rows and category labels to confirm all four categories are represented by at least one included platform.","max_points":140},{"criterion":"For each included platform, the official public pages are used to confirm whether UK participants are accepted, what the platform mainly pays for, the payout 
method, the cash-out threshold, and any obvious waitlist, invite-only, or identity-check friction.","description":"For each included platform, the official public pages are used to confirm whether UK participants are accepted, what the platform mainly pays for, the payout method, the cash-out threshold, and any obvious waitlist, invite-only, or identity-check friction.\n\nHow a grader verifies this: Spot-check platform rows against the kept-open official pages and confirm that unsupported fields are marked 'not shown' rather than guessed.","max_points":200},{"criterion":"Each platform is cross-checked with at least one independent public reputation source such as Trustpilot, Reddit, app-store listings, or similar public review pages so the comparison distinguishes trustworthy-but-slow options from poor choices.","description":"Each platform is cross-checked with at least one independent public reputation source such as Trustpilot, Reddit, app-store listings, or similar public review pages so the comparison distinguishes trustworthy-but-slow options from poor choices.\n\nHow a grader verifies this: Review the evidence and sheet entries to confirm every platform has at least one recorded independent reputation signal from a public source.","max_points":160},{"criterion":"The final output includes a ranked list of the best 8 platforms in signup order, separating the ones most worth applying to immediately from the ones only worth using as filler.","description":"The final output includes a ranked list of the best 8 platforms in signup order, separating the ones most worth applying to immediately from the ones only worth using as filler.\n\nHow a grader verifies this: Check the final recommendation section for exactly 8 ranked platforms and clear grouping into immediate-priority versus filler options.","max_points":140},{"criterion":"Useful browser evidence is left open, including at least 6 official platform pages and at least 4 independent reputation or review pages, 
along with the final comparison sheet.","description":"Useful browser evidence is left open, including at least 6 official platform pages and at least 4 independent reputation or review pages, along with the final comparison sheet.\n\nHow a grader verifies this: Inspect open tabs at the end to confirm the minimum counts and that both official and independent evidence pages remain available.","max_points":100},{"criterion":"The session ends with a short recommendation on whether the user should mostly stick with Prolific and CloudResearch Connect or build a broader UK stack.","description":"The session ends with a short recommendation on whether the user should mostly stick with Prolific and CloudResearch Connect or build a broader UK stack.\n\nHow a grader verifies this: Check the final written recommendation for an explicit conclusion addressing Prolific and Connect versus a broader stack.","max_points":60}]}} +{"task_id":"74da207c4a2c3ee82f3452d0d3c587af3f1a20ff","confirmed_task":"I’m trying to figure out what kind of community event I could realistically host in NYC, and I don’t just want generic brainstorming. Please use public event pages to study what already seems to work: start with Luma, then expand to a couple of other public event platforms if needed, and build me a comparison set of exactly 15 NYC community-oriented events that feel relevant for inspiration. I want a real mix across at least 4 neighborhoods and at least 5 event formats, like meetups, panels, workshops, socials, walks, volunteer events, coworking sessions, or founder/creator gatherings. For each of the 15 events, note the visible title, neighborhood or venue area, host or organizer if shown, format, price if shown, audience signal, and any community-planning details that are visible on the page such as description style, location clarity, agenda or run-of-show, RSVP/ticket framing, and whether there are clear participation expectations or community norms. 
Keep the strongest evidence tabs open for at least 6 representative events, including several Luma pages, so I can quickly inspect the examples myself. After comparing them, recommend exactly 8 event concepts I could plausibly host in NYC, with one sentence on why each concept fits the patterns you found. Then pick the best 3 concepts overall and, for each one, give me a draft event-page outline with a suggested title style, description structure, venue/location approach, agenda sections, and any community-related details I should make sure to include. End with a short recommendation on which single concept I should host first and why, and leave the most useful event example pages open. Write this up in a Crpytpad Document titled 'NYC Event Proposals'","website":"https://luma.com","level":"hard","reference_length":3,"precomputed_rubric":{"items":[{"criterion":"A comparison set of exactly 15 NYC community-oriented public event pages is assembled, drawn from Luma first and expanded to other public event platforms if needed.","description":"A comparison set of exactly 15 NYC community-oriented public event pages is assembled, drawn from Luma first and expanded to other public event platforms if needed.\n\nHow a grader verifies this: Check that 15 distinct public event pages were reviewed and that the set includes Luma examples plus any additional public-platform examples used to reach the total.","max_points":200},{"criterion":"The 15-event set covers at least 4 neighborhoods and at least 5 event formats such as meetups, panels, workshops, socials, walks, volunteer events, coworking sessions, or founder/creator gatherings.","description":"The 15-event set covers at least 4 neighborhoods and at least 5 event formats such as meetups, panels, workshops, socials, walks, volunteer events, coworking sessions, or founder/creator gatherings.\n\nHow a grader verifies this: Check the recorded comparison notes for neighborhood coverage and format labels, confirming the minimum 
diversity counts are met.","max_points":180},{"criterion":"For each of the 15 events, the visible title, neighborhood or venue area, host or organizer if shown, format, price if shown, audience signal, and visible community-planning details are captured, including description style, location clarity, agenda or run-of-show, RSVP or ticket framing, and participation expectations or community norms where shown.","description":"For each of the 15 events, the visible title, neighborhood or venue area, host or organizer if shown, format, price if shown, audience signal, and visible community-planning details are captured, including description style, location clarity, agenda or run-of-show, RSVP or ticket framing, and participation expectations or community norms where shown.\n\nHow a grader verifies this: Review the final comparison output and confirm those fields are recorded for each of the 15 events, using 'not shown' where a public page does not display a field.","max_points":170},{"criterion":"At least 6 representative event pages are kept open as browser evidence, including several Luma pages, so the user can inspect the strongest examples directly.","description":"At least 6 representative event pages are kept open as browser evidence, including several Luma pages, so the user can inspect the strongest examples directly.\n\nHow a grader verifies this: Confirm that 6 or more relevant event tabs remain open at the end and that multiple open tabs are Luma event pages.","max_points":150},{"criterion":"Exactly 8 NYC event concepts are recommended, each with a one-sentence explanation of why it fits the patterns found in the comparison set.","description":"Exactly 8 NYC event concepts are recommended, each with a one-sentence explanation of why it fits the patterns found in the comparison set.\n\nHow a grader verifies this: Check that there are exactly 8 recommended concepts and that each includes a clear one-sentence rationale tied to the observed event 
patterns.","max_points":150},{"criterion":"The best 3 concepts are selected and each includes a draft event-page outline with a suggested title style, description structure, venue or location approach, agenda sections, and community-related details to include, followed by a final recommendation of which single concept to host first and why, and the write-up is captured in a CryptoPad Document titled 'NYC Event Proposals'.","description":"The best 3 concepts are selected and each includes a draft event-page outline with a suggested title style, description structure, venue or location approach, agenda sections, and community-related details to include, followed by a final recommendation of which single concept to host first and why, and the write-up is captured in a CryptoPad Document titled 'NYC Event Proposals'.\n\nHow a grader verifies this: Check that exactly 3 shortlisted concepts have complete draft page outlines covering all requested elements, and that the response ends with one recommended first event concept plus a reason; the final deliverable includes an open CryptoPad Document titled 'NYC Event Proposals'.","max_points":150}]}} +{"task_id":"228dda0bbfce65f778b3f421cb1f575c8c8977f1","confirmed_task":"I’m thinking about applying for teaching jobs around Athens, Georgia, and I don’t want a one-site skim — I want a real shortlist of the best districts or public-school employers to target for a future hiring cycle. Please start with Clarke County’s SchoolSpring employer page so we anchor on the local district correctly, then build out a comparison of exactly 8 school districts or public-school employers that are in Athens or within about a 45-minute drive of downtown Athens. 
For each one, use public pages only to find the employer name, the main jobs or careers page, whether certified teacher openings are currently visible, one representative teaching opening if shown, the current salary schedule for a new certified teacher if posted, key benefits or perks if posted, the school calendar or work-year clues if posted, and a rough drive time from downtown Athens. Also pull one public school-quality signal for each district, like the state report card, district profile, or another credible public source, and write \"not shown\" for anything you can’t verify publicly. Put the 8 employers into one clean comparison sheet or document so I can scan them side by side, then rank the best 5 districts for me to apply to with short reasons that balance pay, likely openings, commute, and overall district fit. Keep the most useful evidence visible in the browser by leaving open the career page and salary page for each of your top 5 picks, plus at least 2 representative job-posting tabs from the strongest options.","website":"https://clarke.schoolspring.com","level":"hard","reference_length":14,"precomputed_rubric":{"items":[{"criterion":"The browsing session starts from Clarke County’s SchoolSpring employer page and correctly identifies the local employer anchor before expanding to exactly 8 school districts or public-school employers in Athens or within about a 45-minute drive of downtown Athens.","description":"The browsing session starts from Clarke County’s SchoolSpring employer page and correctly identifies the local employer anchor before expanding to exactly 8 school districts or public-school employers in Athens or within about a 45-minute drive of downtown Athens.\n\nHow a grader verifies this: Check that Clarke County is explicitly used as the starting anchor and that the final comparison includes exactly 8 qualifying employers, each plausibly within the stated geography.","max_points":180},{"criterion":"A single comparison sheet or document 
is produced with one entry for each of the 8 employers and includes the requested public-page fields: employer name, main jobs/careers page, whether certified teacher openings are visible, one representative teaching opening if shown, current salary schedule for a new certified teacher if posted, key benefits or perks if posted, school calendar or work-year clues if posted, rough drive time from downtown Athens, and one public school-quality signal.","description":"A single comparison sheet or document is produced with one entry for each of the 8 employers and includes the requested public-page fields: employer name, main jobs/careers page, whether certified teacher openings are visible, one representative teaching opening if shown, current salary schedule for a new certified teacher if posted, key benefits or perks if posted, school calendar or work-year clues if posted, rough drive time from downtown Athens, and one public school-quality signal.\n\nHow a grader verifies this: Inspect the final sheet or document and confirm that all 8 entries contain the specified fields, with missing items recorded as \"not shown\" rather than invented.","max_points":200},{"criterion":"The comparison relies only on public pages and records \"not shown\" wherever a requested field cannot be verified publicly.","description":"The comparison relies only on public pages and records \"not shown\" wherever a requested field cannot be verified publicly.\n\nHow a grader verifies this: Spot-check multiple entries against visible public sources and confirm there are no login-only claims or uncited filled-in gaps where the prompt required \"not shown.\"","max_points":140},{"criterion":"The final output ranks the best 5 districts or public-school employers to apply to and gives short reasons for each ranking that balance pay, likely openings, commute, and overall district fit.","description":"The final output ranks the best 5 districts or public-school employers to apply to and gives short 
reasons for each ranking that balance pay, likely openings, commute, and overall district fit.\n\nHow a grader verifies this: Check that there are exactly 5 ranked recommendations and that each one includes concise reasoning touching the decision factors named in the prompt.","max_points":180},{"criterion":"For each of the top 5 ranked picks, the career page and salary page are left open in the browser as evidence.","description":"For each of the top 5 ranked picks, the career page and salary page are left open in the browser as evidence.\n\nHow a grader verifies this: Confirm that 10 evidence tabs remain open for the top 5 picks: one career/jobs page and one salary-related page for each ranked employer.","max_points":150},{"criterion":"At least 2 representative teaching job-posting tabs from the strongest options are also left open in the browser.","description":"At least 2 representative teaching job-posting tabs from the strongest options are also left open in the browser.\n\nHow a grader verifies this: Confirm that at least 2 live or recent representative teaching-opening pages remain open and correspond to employers discussed as strong options in the final ranking.","max_points":150}]}} +{"task_id":"4d13c31d463a1277ca8b7ef95947ee4117b0f922","confirmed_task":"I’m trying to figure out the smartest way for 3 adults to fly from the Washington, DC area to Orlando for a future trip built around the original Jan 30 travel window, and I don’t want just one American Airlines screenshot-price. Please start by checking American’s public booking flow for the original one-way search from DCA to MCO for 3 adults around that date so we have the baseline, then broaden into a real comparison that I could actually use: compare flights from all 3 Washington-area airports (DCA, IAD, and BWI) to Orlando-area options if publicly shown, using a tight date window centered on that trip timing and keeping the search bounded to 3 outbound dates total. 
For each airport/date combination, compare American with at least 2 other major airline options visible on public booking or metasearch pages, and focus on the cheapest practical itinerary for 3 adults rather than just the lowest teaser fare. As you compare, note whether each best option is nonstop or connecting, the departure and arrival times, the fare type, and any obvious restrictions that would materially affect the real price for normal travelers, especially carry-on limits, checked-bag assumptions if clearly shown, and seat-selection limitations if clearly shown; if something is not shown, record it as not shown. I want a final side-by-side comparison covering exactly 9 search combinations total (3 origin airports x 3 outbound dates), with one best option per combination and a clear overall recommendation for the best value, the best convenience, and the best American-only choice. Keep the most useful evidence tabs open at the end: the original American baseline search, the two strongest non-American comparison pages, and the final winner page so I can review them.","website":"https://www.aa.com","level":"hard","reference_length":11,"precomputed_rubric":{"items":[{"criterion":"The session starts with the original American Airlines baseline: a public booking search for 3 adults from DCA to MCO around the original Jan 30 trip timing, and that baseline is used as the reference point for the rest of the comparison.","description":"The session starts with the original American Airlines baseline: a public booking search for 3 adults from DCA to MCO around the original Jan 30 trip timing, and that baseline is used as the reference point for the rest of the comparison.\n\nHow a grader verifies this: An open American Airlines results/search page is present for the baseline route and traveler count, and the final comparison explicitly identifies it as the baseline.","max_points":160},{"criterion":"The final comparison covers exactly 9 search combinations total, formed 
by 3 Washington-area origin airports (DCA, IAD, and BWI) crossed with 3 outbound dates in the bounded trip window.","description":"The final comparison covers exactly 9 search combinations total, formed by 3 Washington-area origin airports (DCA, IAD, and BWI) crossed with 3 outbound dates in the bounded trip window.\n\nHow a grader verifies this: The delivered comparison contains one entry for each of the 9 airport/date combinations and does not omit or add combinations.","max_points":200},{"criterion":"For each of the 9 search combinations, American is compared with at least 2 other major airline options visible on public booking or metasearch pages, and one best option is chosen for that combination.","description":"For each of the 9 search combinations, American is compared with at least 2 other major airline options visible on public booking or metasearch pages, and one best option is chosen for that combination.\n\nHow a grader verifies this: Each combination entry shows a 3-way-or-more airline comparison including American plus at least 2 other major airline options, with one selected best option recorded.","max_points":180},{"criterion":"Each chosen best option records the practical trip details explicitly requested: whether it is nonstop or connecting, departure and arrival times, fare type, and any obvious restrictions that materially affect real price, especially carry-on limits, checked-bag assumptions if clearly shown, and seat-selection limitations if clearly shown, using 'not shown' where needed.","description":"Each chosen best option records the practical trip details explicitly requested: whether it is nonstop or connecting, departure and arrival times, fare type, and any obvious restrictions that materially affect real price, especially carry-on limits, checked-bag assumptions if clearly shown, and seat-selection limitations if clearly shown, using 'not shown' where needed.\n\nHow a grader verifies this: All 9 selected options include those 
fields, and any unavailable details are marked 'not shown' rather than invented.","max_points":180},{"criterion":"The final synthesis provides a clear overall recommendation naming the best value option, the best convenience option, and the best American-only option.","description":"The final synthesis provides a clear overall recommendation naming the best value option, the best convenience option, and the best American-only option.\n\nHow a grader verifies this: The conclusion explicitly labels all three recommendation categories and ties each to one of the compared options.","max_points":140},{"criterion":"Useful browser evidence is left open at the end: the original American baseline search, the 2 strongest non-American comparison pages, and the final winner page.","description":"Useful browser evidence is left open at the end: the original American baseline search, the 2 strongest non-American comparison pages, and the final winner page.\n\nHow a grader verifies this: Those 4 pages remain open and correspond to the exact evidence tabs requested in the prompt.","max_points":140}]}} +{"task_id":"20cea7be868a6dbaca0c1ff8a04562101f3fbd91","confirmed_task":"I’m trying to figure out which electric-vehicle websites are actually worth following for the next few months, not just find one random EV site. Please do a serious browser-based comparison of 10 EV-focused public websites that cover the space in meaningfully different ways—news, car reviews, buying guides, charging and ownership advice, future model coverage, or broader EV analysis. Start from one clearly EV-focused site and expand naturally to other credible EV-focused sites you find on the public web. For each of the 10 sites, check the homepage plus at least one deeper article or section page so you can verify what it really specializes in, how current it seems, and whether it is more useful for shoppers, enthusiasts, or industry-followers. 
Compare them on concrete things I’d care about: what kind of EV coverage they emphasize, whether they seem actively updated, whether they cover both current models and upcoming EVs, whether they have practical charging or ownership information, and whether the site feels easy to use for ongoing research. Then recommend exactly 5 sites to keep in my regular EV-reading rotation, with a short reason for each, and identify the single best site for each of these three use cases: keeping up with EV news, researching a future EV purchase, and learning about charging/ownership. Keep the 5 recommended sites open in tabs at the end, and for at least 3 of those, also leave one representative deeper page open that shows why the site made the cut. If any field is unclear from public pages, say not shown rather than guessing.","website":"https://insideevs.com","level":"hard","reference_length":3,"precomputed_rubric":{"items":[{"criterion":"Exactly 10 EV-focused public websites are examined, and each is checked using both its homepage and at least one deeper article or section page.","description":"Exactly 10 EV-focused public websites are examined, and each is checked using both its homepage and at least one deeper article or section page.\n\nHow a grader verifies this: The final comparison names 10 distinct EV-focused sites and includes evidence drawn from both the homepage and one deeper page for each site.","max_points":200},{"criterion":"Each of the 10 sites is compared on the explicitly requested factors: coverage emphasis, apparent update activity, current-versus-upcoming EV coverage, charging or ownership information, and overall usability for ongoing research.","description":"Each of the 10 sites is compared on the explicitly requested factors: coverage emphasis, apparent update activity, current-versus-upcoming EV coverage, charging or ownership information, and overall usability for ongoing research.\n\nHow a grader verifies this: The final output contains side-by-side 
notes for every site covering all requested comparison dimensions, using 'not shown' where needed instead of guessing.","max_points":200},{"criterion":"The work distinguishes what each site is most useful for by assessing whether it is better suited to shoppers, enthusiasts, or industry-followers.","description":"The work distinguishes what each site is most useful for by assessing whether it is better suited to shoppers, enthusiasts, or industry-followers.\n\nHow a grader verifies this: Each site entry includes a clear audience-fit judgment tied to evidence from the public pages visited.","max_points":120},{"criterion":"Exactly 5 sites are recommended for the user's regular EV-reading rotation, each with a short reason for inclusion.","description":"Exactly 5 sites are recommended for the user's regular EV-reading rotation, each with a short reason for inclusion.\n\nHow a grader verifies this: The final recommendation section contains 5 and only 5 selected sites, each paired with a concise rationale grounded in the comparison.","max_points":160},{"criterion":"The final synthesis identifies the single best site for each of the three requested use cases: EV news, future EV purchase research, and charging/ownership learning.","description":"The final synthesis identifies the single best site for each of the three requested use cases: EV news, future EV purchase research, and charging/ownership learning.\n\nHow a grader verifies this: The final answer explicitly names one best site for each of the 3 use cases and explains why each won that category.","max_points":140},{"criterion":"Browser evidence is preserved by leaving the 5 recommended sites open, plus one representative deeper page open for at least 3 of those recommended sites.","description":"Browser evidence is preserved by leaving the 5 recommended sites open, plus one representative deeper page open for at least 3 of those recommended sites.\n\nHow a grader verifies this: The ending browser state shows the 5 
chosen site tabs still open and at least 3 additional deeper pages open that visibly support the recommendations.","max_points":180}]}} +{"task_id":"5d3f0704adb650f6dfdea9e8bf1c070e3f4d02c7","confirmed_task":"I’m trying to decide whether I should keep using a browser-based password saver like Google Password Manager or switch to a dedicated password manager, and I want you to do a serious browser-based comparison that ends with exactly 3 apps I could realistically adopt. Start by identifying 8 to 10 credible password managers that have public pricing and security/privacy information, including Google Password Manager as a baseline if it qualifies. For each one, open and compare the official pricing page, the main security or architecture page, the privacy policy or privacy summary page, and a public help or feature page that confirms practical things like cross-device sync, passkey support, import/export options, and whether it works on the platforms I’m likely to care about. As you compare them, rule out anything that is obviously too expensive, weak on privacy, missing from major platforms, or unclear about how user data is protected. Then narrow the list to the best 5 finalists and do a deeper trust check for those 5 using public evidence such as independent security audits, bug bounty programs, encryption design explanations, breach or incident disclosures if any are relevant, and company transparency pages. After that, recommend exactly 3 applications that best balance affordability, privacy, and trustworthiness for a normal individual user, and for each of the 3 explain why it made the cut, what the likely monthly or annual cost is, what privacy or trust tradeoffs I’d be accepting, and whether migration from a browser-based password saver seems straightforward based on public import/export help pages. 
Keep the most useful comparison tabs open for the final 3, including at least one pricing tab and one security/privacy evidence tab for each finalist, plus one tab that shows Google Password Manager as the baseline I’m comparing against.","website":"https://passwords.google.com","level":"hard","reference_length":4,"precomputed_rubric":{"items":[{"criterion":"The browsing session identifies and compares 8 to 10 credible password managers, with Google Password Manager included as a baseline if it qualifies.","description":"The browsing session identifies and compares 8 to 10 credible password managers, with Google Password Manager included as a baseline if it qualifies.\n\nHow a grader verifies this: Final output names 8 to 10 products considered and shows that Google Password Manager was included as the baseline or explicitly noted as not qualifying under the stated criteria.","max_points":160},{"criterion":"For each considered product, the comparison uses public official pages covering pricing, security or architecture, privacy policy or privacy summary, and a help or feature page confirming practical capabilities such as sync, passkeys, import/export, and platform support.","description":"For each considered product, the comparison uses public official pages covering pricing, security or architecture, privacy policy or privacy summary, and a help or feature page confirming practical capabilities such as sync, passkeys, import/export, and platform support.\n\nHow a grader verifies this: Final comparison notes include those four evidence categories for each considered product, with browser evidence visible through opened official tabs used for the comparison.","max_points":180},{"criterion":"Products that are too expensive, weak on privacy, missing major platform support, or unclear about user-data protection are ruled out before the shortlist is narrowed to the best 5 finalists.","description":"Products that are too expensive, weak on privacy, missing major 
platform support, or unclear about user-data protection are ruled out before the shortlist is narrowed to the best 5 finalists.\n\nHow a grader verifies this: The final synthesis explicitly shows which products were eliminated and why, and it presents a narrowed finalist set of exactly 5.","max_points":160},{"criterion":"The 5 finalists receive a deeper trust review using public evidence such as independent security audits, bug bounty programs, encryption design explanations, relevant breach or incident disclosures, and company transparency pages.","description":"The 5 finalists receive a deeper trust review using public evidence such as independent security audits, bug bounty programs, encryption design explanations, relevant breach or incident disclosures, and company transparency pages.\n\nHow a grader verifies this: For each of the 5 finalists, the final notes cite at least some of those deeper trust signals from public pages and distinguish stronger versus weaker trust evidence.","max_points":180},{"criterion":"The final recommendation includes exactly 3 applications that best balance affordability, privacy, and trustworthiness for a normal individual user.","description":"The final recommendation includes exactly 3 applications that best balance affordability, privacy, and trustworthiness for a normal individual user.\n\nHow a grader verifies this: The end result contains exactly 3 recommended apps and clearly identifies them as the final picks rather than a broader list.","max_points":140},{"criterion":"For each of the 3 recommended apps, the final explanation states why it made the cut, the likely monthly or annual cost, the privacy or trust tradeoffs involved, and whether migration from a browser-based password saver looks straightforward based on public import/export help pages.","description":"For each of the 3 recommended apps, the final explanation states why it made the cut, the likely monthly or annual cost, the privacy or trust tradeoffs involved, 
and whether migration from a browser-based password saver looks straightforward based on public import/export help pages.\n\nHow a grader verifies this: Each of the 3 finalist summaries contains all four requested elements: rationale, cost, tradeoffs, and migration assessment grounded in public help or support pages.","max_points":100},{"criterion":"Useful browser evidence is left open for review: for each of the final 3, at least one pricing tab and one security/privacy evidence tab remain open, plus one tab showing Google Password Manager as the baseline.","description":"Useful browser evidence is left open for review: for each of the final 3, at least one pricing tab and one security/privacy evidence tab remain open, plus one tab showing Google Password Manager as the baseline.\n\nHow a grader verifies this: Open tabs at the end include the required pricing and security/privacy evidence pages for each of the 3 finalists and one baseline Google Password Manager tab.","max_points":80}]}} +{"task_id":"b24db77407cc1befa527256883da54fd4f37c256","confirmed_task":"I’m trying to figure out which quick-dry camping towel I should actually buy for future trips, and I don’t want a one-listing answer. Start by using Temu as the low-cost baseline, then compare it against mainstream public retail options like Amazon, Walmart, REI, Target, or other public store pages you can access without signing in. I want a real browser-based market scan of 12 total quick-dry camping or travel towels, with at least 3 from Temu and at least 3 from non-discount outdoor or big-box retailers, and the rest from any credible public listings that fit. For each product, record the product name, current listed price, stated size, material if shown, packed or product weight if shown, whether it comes with a pouch or loop if shown, color/options note if relevant, and anything the page says about drying speed or absorbency; if a field is missing, write 'not shown.' 
As you compare them, open the actual product pages plus enough review/photo sections to sanity-check whether the towels look genuinely compact and camping-usable rather than just cheap gym towels. Then narrow the 12 down into 3 finalists: the best budget pick, the best overall value pick, and the best premium or most durable-looking pick. For those 3 finalists, also compare shipping or delivery info if publicly shown, return policy basics if publicly shown, and at least one review-based concern or tradeoff. Finish with a concise recommendation that tells me which one you would buy for a typical car-camping or weekend hiking user and why, and leave the 3 finalist product tabs plus 2 comparison tabs open so I can review the evidence myself.","website":"https://www.temu.com","level":"hard","reference_length":4,"precomputed_rubric":{"items":[{"criterion":"The browsing session compares 12 total quick-dry camping or travel towels, including at least 3 from Temu and at least 3 from non-discount outdoor or big-box retailers, with the remaining items drawn from credible public listings.","description":"The browsing session compares 12 total quick-dry camping or travel towels, including at least 3 from Temu and at least 3 from non-discount outdoor or big-box retailers, with the remaining items drawn from credible public listings.\n\nHow a grader verifies this: Check the final comparison output for exactly 12 products and confirm retailer/source distribution meets the stated minimums using the recorded product pages.","max_points":200},{"criterion":"Each of the 12 products includes the requested recorded fields: product name, current listed price, stated size, material if shown, packed or product weight if shown, pouch or loop note if shown, color/options note if relevant, and any drying-speed or absorbency claim, using 'not shown' where information is missing.","description":"Each of the 12 products includes the requested recorded fields: product name, current listed price, 
stated size, material if shown, packed or product weight if shown, pouch or loop note if shown, color/options note if relevant, and any drying-speed or absorbency claim, using 'not shown' where information is missing.\n\nHow a grader verifies this: Inspect the final comparison notes and verify that every product row or entry contains all requested fields, with missing data explicitly marked as 'not shown' rather than omitted.","max_points":200},{"criterion":"The session uses actual product pages and review/photo sections to sanity-check whether the towels appear compact and camping-usable rather than generic gym towels.","description":"The session uses actual product pages and review/photo sections to sanity-check whether the towels appear compact and camping-usable rather than generic gym towels.\n\nHow a grader verifies this: Confirm that product pages were opened for the compared items and that review or photo evidence was consulted and reflected in the notes for compactness/camping suitability.","max_points":150},{"criterion":"The 12-product set is narrowed to exactly 3 finalists labeled as the best budget pick, the best overall value pick, and the best premium or most durable-looking pick.","description":"The 12-product set is narrowed to exactly 3 finalists labeled as the best budget pick, the best overall value pick, and the best premium or most durable-looking pick.\n\nHow a grader verifies this: Check the final synthesis for exactly three finalists with those three category labels and corresponding chosen products.","max_points":150},{"criterion":"For each of the 3 finalists, the comparison includes shipping or delivery info if publicly shown, return policy basics if publicly shown, and at least one review-based concern or tradeoff.","description":"For each of the 3 finalists, the comparison includes shipping or delivery info if publicly shown, return policy basics if publicly shown, and at least one review-based concern or tradeoff.\n\nHow a grader 
verifies this: Review the finalist summaries and confirm all three include those three elements, with 'not shown' used where public shipping or return details are unavailable.","max_points":150},{"criterion":"The final recommendation states which towel should be bought for a typical car-camping or weekend hiking user and explains why.","description":"The final recommendation states which towel should be bought for a typical car-camping or weekend hiking user and explains why.\n\nHow a grader verifies this: Check for a clear single recommended product and a concise rationale tied to the comparison criteria rather than a generic summary.","max_points":100},{"criterion":"At the end, the 3 finalist product tabs and 2 comparison tabs are left open for review.","description":"At the end, the 3 finalist product tabs and 2 comparison tabs are left open for review.\n\nHow a grader verifies this: Confirm that five tabs remain open matching the requested evidence set: three finalist product pages and two comparison-oriented tabs used during the evaluation.","max_points":50}]}} +{"task_id":"0dc4be74eb82df8d0ce1b556260bd615acffedd6","confirmed_task":"I’m trying to get a realistic picture of what would actually make me a stronger applicant for a future philosophy PhD cycle, especially whether publications are genuinely expected now or whether that’s internet panic. Please do a serious browser-based comparison across 12 funded U.S. philosophy PhD programs that are publicly visible and reasonably research-active, using official admissions pages first and then student, faculty, and department pages where helpful. For each program, figure out what the application seems to reward most strongly from public evidence: things like writing sample emphasis, research fit, letters, prior coursework, GRE if mentioned, publications if mentioned, teaching or language preparation if relevant, and funding structure. If a field is missing, record it as not shown instead of guessing. 
Then sanity-check the publications question by looking at at least 6 public non-official discussion or advice pages from places like Reddit, faculty advice posts, or department FAQs, and separate what is officially stated from what applicants seem to believe. I want you to keep key evidence tabs open, including at least 4 official admissions pages, 3 faculty or current-student pages that help show research-fit expectations, and 2 discussion or advice pages that shaped the publications conclusion. Finish by writing one organized decision memo that tells me, in plain English, whether publications look necessary, helpful, or mostly optional for these programs; what the strongest recurring signals actually are; where a typical applicant without publications would need to compensate; and a concrete priority list of the top 8 things I should improve before applying.","website":"https://www.reddit.com","level":"hard","reference_length":5,"precomputed_rubric":{"items":[{"criterion":"The browsing session covers exactly 12 funded U.S. philosophy PhD programs and uses official program or university admissions pages as the primary evidence base for each one.","description":"The browsing session covers exactly 12 funded U.S. 
philosophy PhD programs and uses official program or university admissions pages as the primary evidence base for each one.\n\nHow a grader verifies this: The final memo lists 12 distinct philosophy PhD programs, and each program entry includes evidence drawn from an official admissions or department page.","max_points":200},{"criterion":"For each of the 12 programs, the output identifies what the application appears to reward most strongly from public evidence, including writing sample emphasis, research fit, letters, prior coursework, GRE if mentioned, publications if mentioned, teaching or language preparation if relevant, and funding structure, with 'not shown' used where needed.","description":"For each of the 12 programs, the output identifies what the application appears to reward most strongly from public evidence, including writing sample emphasis, research fit, letters, prior coursework, GRE if mentioned, publications if mentioned, teaching or language preparation if relevant, and funding structure, with 'not shown' used where needed.\n\nHow a grader verifies this: Each of the 12 program summaries includes the requested factors or explicitly marks missing items as 'not shown' rather than inferring them.","max_points":220},{"criterion":"The publications question is sanity-checked using at least 6 public non-official discussion or advice pages, and the output clearly distinguishes official program statements from applicant or adviser commentary.","description":"The publications question is sanity-checked using at least 6 public non-official discussion or advice pages, and the output clearly distinguishes official program statements from applicant or adviser commentary.\n\nHow a grader verifies this: The final memo references at least 6 non-official discussion or advice pages and explicitly separates official evidence from community perception or informal advice.","max_points":160},{"criterion":"Key evidence tabs are kept open, including at least 4 official 
admissions pages, 3 faculty or current-student pages that help show research-fit expectations, and 2 discussion or advice pages that shaped the publications conclusion.","description":"Key evidence tabs are kept open, including at least 4 official admissions pages, 3 faculty or current-student pages that help show research-fit expectations, and 2 discussion or advice pages that shaped the publications conclusion.\n\nHow a grader verifies this: The final browser state includes the requested mix of open tabs, and those tabs correspond to evidence used in the memo.","max_points":120},{"criterion":"The final decision memo directly answers whether publications look necessary, helpful, or mostly optional for these programs.","description":"The final decision memo directly answers whether publications look necessary, helpful, or mostly optional for these programs.\n\nHow a grader verifies this: The memo gives a clear overall conclusion on publications and ties that conclusion back to the compared program evidence and outside discussion pages.","max_points":120},{"criterion":"The final decision memo explains what the strongest recurring signals actually are and where a typical applicant without publications would need to compensate.","description":"The final decision memo explains what the strongest recurring signals actually are and where a typical applicant without publications would need to compensate.\n\nHow a grader verifies this: The memo synthesizes cross-program patterns and includes a specific discussion of compensating strengths for applicants who do not have publications.","max_points":100},{"criterion":"The final decision memo ends with a concrete priority list of the top 8 things the user should improve before applying.","description":"The final decision memo ends with a concrete priority list of the top 8 things the user should improve before applying.\n\nHow a grader verifies this: The memo includes exactly 8 actionable improvement priorities tailored to the 
evidence gathered in the browsing session.","max_points":80}]}} +{"task_id":"9acfcc050ae8de65ba5f5de787a47cef0589dd90","confirmed_task":"I’m trying to choose a babywearing jacket for a future cold-weather season, and I don’t want just one random product link. Please do a serious browser-based comparison of 12 babywearing jacket options across at least 6 brands or retailers, including H&M if they still have a relevant option, and use only public product pages. For each option, capture the exact product name, listed price, whether it is a true babywearing jacket or a regular coat plus insert, whether the insert is included or sold separately, front-carry versus back-carry support if shown, maternity use if shown, weather clues like fleece/water-resistant/down, available size range if shown, and anything important from the product photos or description that affects real-world use. Then check the sizing/help or return-policy pages for the same brands when available so I can see which options are least risky to order online, and note ‘not shown’ anywhere the site doesn’t say. Narrow the list to the 5 strongest options for a practical buyer who cares about warmth, ease of use, and not overspending, and explain the tradeoffs between best budget, best for colder weather, best for extended maternity-to-babywearing use, and best overall. Keep the final 5 product pages open along with at least 3 relevant size-guide or return-policy pages and at least 2 pages with strong product-photo evidence, so I can review the finalists in the browser afterward. End with one organized comparison table and a clear recommendation. 
Make a presentation for me in CrptoPad so I can easily go through your findings.","website":"https://www2.hm.com","level":"hard","reference_length":7,"precomputed_rubric":{"items":[{"criterion":"The work compares exactly 12 babywearing jacket options drawn from at least 6 brands or retailers, using public product pages and including H&M if a relevant option is available.","description":"The work compares exactly 12 babywearing jacket options drawn from at least 6 brands or retailers, using public product pages and including H&M if a relevant option is available.\n\nHow a grader verifies this: Count 12 distinct product entries in the final comparison and confirm they span at least 6 brands or retailers; verify that H&M is included if a relevant option was found, or otherwise clearly noted as unavailable/not suitable.","max_points":180},{"criterion":"Each of the 12 options records the exact product name, listed price, whether it is a true babywearing jacket or a regular coat plus insert, whether the insert is included or separate, front-carry versus back-carry support if shown, maternity use if shown, weather clues, available size range if shown, and uses 'not shown' for missing fields.","description":"Each of the 12 options records the exact product name, listed price, whether it is a true babywearing jacket or a regular coat plus insert, whether the insert is included or separate, front-carry versus back-carry support if shown, maternity use if shown, weather clues, available size range if shown, and uses 'not shown' for missing fields.\n\nHow a grader verifies this: Inspect the final comparison table and confirm each listed option includes all requested fields or an explicit 'not shown' entry where the site does not provide the information.","max_points":220},{"criterion":"The browsing includes sizing/help or return-policy checks for the same brands when available, so the final comparison notes which options appear least risky to order online.","description":"The 
browsing includes sizing/help or return-policy checks for the same brands when available, so the final comparison notes which options appear least risky to order online.\n\nHow a grader verifies this: Verify that the final synthesis references sizing/help or return-policy information for the relevant brands when available, and that at least 3 such pages are kept open as requested.","max_points":140},{"criterion":"The list is narrowed to exactly 5 strongest options for a practical buyer focused on warmth, ease of use, and not overspending, with explicit tradeoff notes for best budget, best for colder weather, best for extended maternity-to-babywearing use, and best overall.","description":"The list is narrowed to exactly 5 strongest options for a practical buyer focused on warmth, ease of use, and not overspending, with explicit tradeoff notes for best budget, best for colder weather, best for extended maternity-to-babywearing use, and best overall.\n\nHow a grader verifies this: Check that the final output identifies 5 finalists and assigns the requested tradeoff categories with brief supporting reasoning tied to the gathered product details.","max_points":200},{"criterion":"Useful browser evidence is left open: the final 5 product pages, at least 3 relevant size-guide or return-policy pages, and at least 2 pages with strong product-photo evidence.","description":"Useful browser evidence is left open: the final 5 product pages, at least 3 relevant size-guide or return-policy pages, and at least 2 pages with strong product-photo evidence.\n\nHow a grader verifies this: Confirm that the specified product, policy/help, and photo-evidence pages remain open at the end and correspond to the finalists or brands discussed.","max_points":140},{"criterion":"The session ends with one organized comparison table plus a clear recommendation that synthesizes the market scan rather than just listing links, and a CryptoPad presentation is also created so the findings can be 
reviewed easily.","description":"The session ends with one organized comparison table plus a clear recommendation that synthesizes the market scan rather than just listing links, and a CryptoPad presentation is also created so the findings can be reviewed easily.\n\nHow a grader verifies this: Review the final output to confirm there is a structured comparison table covering all requested fields and a concise recommendation identifying the best choice and why; an open CryptoPad Presentation contains the comparison findings and recommendation.","max_points":120}]}} +{"task_id":"08b3ee12dd124bd3f046bd568260110899dc8695","confirmed_task":"I’m considering making a meaningful donation or sponsorship to one specific nonprofit, but before I do that I want a real browser-based due-diligence review rather than just a quick IRS check. Start by finding the organization on the IRS Tax Exempt Organization Search and confirm its exact legal name, EIN if shown, deductibility and tax-exempt status, and whether anything looks inactive, revoked, or unusual. Then verify the same organization on the most relevant public state charity-registration or business-record pages you can find, and note whether it appears properly registered or if any key fields are not shown. After that, pull the most recent publicly available Form 990 or equivalent filing you can access and summarize a few concrete basics I’d care about as a donor, like recent revenue, expenses, leadership compensation if shown, and whether the filing looks current. Next, review the nonprofit’s own public website for mission clarity, leadership or board transparency, annual reports, impact claims, and contact information, and open a few of the strongest evidence pages so I can inspect them later. Then check at least 3 independent public sources such as nonprofit watchdog, charity database, or major public-profile pages to see whether they agree on status, scale, and credibility, and note any material discrepancies. 
Finally, compare this organization against exactly 4 similar nonprofits working in the same cause area, using public pages only, so I can see whether this is the strongest option or just one plausible option. Give me a final recommendation on whether I should feel comfortable supporting this nonprofit now, support one of the alternatives instead, or hold off pending more information, and keep the most useful registry, filing, watchdog, and comparison tabs open at the end.","website":"https://apps.irs.gov","level":"hard","reference_length":3,"precomputed_rubric":{"items":[{"criterion":"The target nonprofit is verified on the IRS Tax Exempt Organization Search, with its exact legal name plus the requested IRS status details recorded, including whether anything appears inactive, revoked, or unusual.","description":"The target nonprofit is verified on the IRS Tax Exempt Organization Search, with its exact legal name plus the requested IRS status details recorded, including whether anything appears inactive, revoked, or unusual.\n\nHow a grader verifies this: Check that the final output includes the IRS lookup result for the target organization and that at least one relevant IRS results/details page is left open as evidence.","max_points":180},{"criterion":"The organization is also checked on the most relevant public state charity-registration or business-record pages, and the result notes whether it appears properly registered or whether key fields were not shown.","description":"The organization is also checked on the most relevant public state charity-registration or business-record pages, and the result notes whether it appears properly registered or whether key fields were not shown.\n\nHow a grader verifies this: Check that the final output records the state-level verification result and that at least one corresponding public registry page is left open or clearly referenced as evidence.","max_points":160},{"criterion":"The most recent publicly available Form 990 
or equivalent filing is found and summarized with the requested concrete basics: recent revenue, expenses, leadership compensation if shown, and whether the filing appears current.","description":"The most recent publicly available Form 990 or equivalent filing is found and summarized with the requested concrete basics: recent revenue, expenses, leadership compensation if shown, and whether the filing appears current.\n\nHow a grader verifies this: Check that the final output includes those filing-based details and that a public filing page or filing source page is left open as evidence.","max_points":180},{"criterion":"The nonprofit’s own public website is reviewed for mission clarity, leadership or board transparency, annual reports, impact claims, and contact information, with a few of the strongest evidence pages opened for later inspection.","description":"The nonprofit’s own public website is reviewed for mission clarity, leadership or board transparency, annual reports, impact claims, and contact information, with a few of the strongest evidence pages opened for later inspection.\n\nHow a grader verifies this: Check that the final output covers all requested website-review categories and that multiple relevant pages from the nonprofit’s own site remain open.","max_points":140},{"criterion":"At least 3 independent public sources such as watchdog, charity-database, or major public-profile pages are checked, and any material discrepancies in status, scale, or credibility are noted.","description":"At least 3 independent public sources such as watchdog, charity-database, or major public-profile pages are checked, and any material discrepancies in status, scale, or credibility are noted.\n\nHow a grader verifies this: Check that 3 or more independent public sources are summarized in the final output and that at least some of those source pages are left open as evidence.","max_points":140},{"criterion":"The target nonprofit is compared against exactly 4 similar 
nonprofits in the same cause area using public pages only, so the user can judge whether it is the strongest option or just one plausible option.","description":"The target nonprofit is compared against exactly 4 similar nonprofits in the same cause area using public pages only, so the user can judge whether it is the strongest option or just one plausible option.\n\nHow a grader verifies this: Check that the final output includes exactly 4 peer nonprofits with a usable side-by-side comparison grounded in public pages, and that key comparison tabs remain open.","max_points":120},{"criterion":"A final recommendation is provided on whether to support the target nonprofit now, support one of the alternatives instead, or hold off pending more information, and the most useful registry, filing, watchdog, and comparison tabs are left open at the end.","description":"A final recommendation is provided on whether to support the target nonprofit now, support one of the alternatives instead, or hold off pending more information, and the most useful registry, filing, watchdog, and comparison tabs are left open at the end.\n\nHow a grader verifies this: Check that the final answer makes one of the three requested recommendation types and that the browser is left with the most useful evidence tabs open across those categories.","max_points":80}]}} +{"task_id":"09fe03cd3da0fac15bb28f24a627c4bbdea0e611","confirmed_task":"I want help building a real family safety starter plan for my household, not just finding one generic tip. Please use public pages only and focus mostly on official or major nonprofit guidance for a U.S.-based family. Research exactly 6 safety categories: home fire escape, severe weather sheltering, emergency contacts and family reunification, first aid and poisoning, car-seat or child passenger safety, and basic emergency supplies. 
For each category, find and compare at least 2 credible guidance pages, note the most important points they clearly agree on, and call out any meaningful differences or gaps. Keep the strongest page for each of the 6 categories open in separate tabs so I can review them later, and also keep open 2 or 3 comparison tabs that were especially useful for resolving differences. After that, create one organized family safety plan with exactly 15 action items total, grouped into 3 buckets: do today, do this week, and do this month. Each action item should be specific enough that I could actually do it, and it should cite which category it came from. If a source leaves something unclear or not shown, say so instead of guessing. End with a short section naming the 6 pages you trust most and why those are the ones worth keeping open.","website":"https://redcap.usuhs.edu","level":"hard","reference_length":29,"precomputed_rubric":{"items":[{"criterion":"The browsing work covers exactly 6 safety categories: home fire escape, severe weather sheltering, emergency contacts and family reunification, first aid and poisoning, car-seat or child passenger safety, and basic emergency supplies.","description":"The browsing work covers exactly 6 safety categories: home fire escape, severe weather sheltering, emergency contacts and family reunification, first aid and poisoning, car-seat or child passenger safety, and basic emergency supplies.\n\nHow a grader verifies this: The final output includes all 6 named categories and no substitute categories, with evidence drawn from public guidance pages for each one.","max_points":160},{"criterion":"For each of the 6 categories, at least 2 credible public guidance pages are found and compared.","description":"For each of the 6 categories, at least 2 credible public guidance pages are found and compared.\n\nHow a grader verifies this: The final output shows a comparison for every category with at least 2 sources per category, for a minimum of 12 
total guidance pages, and identifies the agreements and any meaningful differences or gaps.","max_points":200},{"criterion":"The research relies mostly on official or major nonprofit guidance pages appropriate for a U.S.-based family.","description":"The research relies mostly on official or major nonprofit guidance pages appropriate for a U.S.-based family.\n\nHow a grader verifies this: The cited pages are predominantly official government, medical, safety-organization, or major nonprofit sources, and the final summary reflects that source quality standard.","max_points":140},{"criterion":"The strongest page for each of the 6 categories is kept open, and 2 or 3 especially useful comparison tabs are also kept open.","description":"The strongest page for each of the 6 categories is kept open, and 2 or 3 especially useful comparison tabs are also kept open.\n\nHow a grader verifies this: There is browser evidence that 6 category-winning tabs remain open, plus 2 or 3 comparison tabs that were used to resolve or inspect differences.","max_points":140},{"criterion":"One organized family safety plan is produced with exactly 15 action items grouped into do today, do this week, and do this month.","description":"One organized family safety plan is produced with exactly 15 action items grouped into do today, do this week, and do this month.\n\nHow a grader verifies this: The final plan contains exactly 15 actionable items total, clearly grouped into the 3 requested time buckets rather than a different structure.","max_points":220},{"criterion":"Each action item is specific enough to do, is linked back to its source category, and unclear or missing details are labeled as not shown instead of guessed. The plan ends with a short section naming the 6 most trusted pages and why they were kept open.","description":"Each action item is specific enough to do, is linked back to its source category, and unclear or missing details are labeled as not shown instead of guessed. 
The plan ends with a short section naming the 6 most trusted pages and why they were kept open.\n\nHow a grader verifies this: Every action item includes category attribution and practical specificity, any unknowns are marked not shown, and the ending section explicitly lists the 6 trusted pages with a brief rationale for each.","max_points":140}]}} +{"task_id":"cc7eea277860163ddebeea9449a50b65d1890a29","confirmed_task":"I want to figure out where I should actually go in the Grand Rapids area if I want to make art in public instead of just staying home with a craft kit, and I don’t want this to stop at one studio. Please include Spruced Studio, but build me a real comparison of 10 public-facing art or craft places in or near Grand Rapids that appear to offer beginner-friendly workshops, drop-in sessions, one-off classes, or other ways for regular adults to participate without already being members. Use public pages only, and for each place capture the art medium, whether it looks more like drop-in versus scheduled workshop, any visible beginner-friendly signal, the price or price range if shown, the neighborhood or city area, and the clearest attendance or registration instructions; if something is missing, mark it as not shown. Open and keep the most useful workshop or class pages for the strongest 5 options, and for at least 3 of those also open a map, photo, or location page so I can sanity-check the vibe and area. 
When you’re done, recommend exactly 4 places for different needs: best low-pressure first try, best date-night or group outing, best hands-on craft skill builder, and best overall option if I wanted to start doing this regularly.","website":"https://www.google.com","level":"hard","reference_length":4,"precomputed_rubric":{"items":[{"criterion":"The final comparison covers exactly 10 public-facing art or craft places in or near Grand Rapids, and Spruced Studio is included as one of them.","description":"The final comparison covers exactly 10 public-facing art or craft places in or near Grand Rapids, and Spruced Studio is included as one of them.\n\nHow a grader verifies this: Count 10 distinct venues in the final output and confirm Spruced Studio appears among them.","max_points":180},{"criterion":"For each of the 10 places, the comparison records the requested fields: art medium, whether it appears to be drop-in versus scheduled workshop, any visible beginner-friendly signal, the price or price range if shown, the neighborhood or city area, and the clearest attendance or registration instructions, using 'not shown' where needed.","description":"For each of the 10 places, the comparison records the requested fields: art medium, whether it appears to be drop-in versus scheduled workshop, any visible beginner-friendly signal, the price or price range if shown, the neighborhood or city area, and the clearest attendance or registration instructions, using 'not shown' where needed.\n\nHow a grader verifies this: Inspect each venue entry and confirm all requested fields are present with either sourced details or an explicit 'not shown.'","max_points":220},{"criterion":"The 10 places are sourced from public pages that make them look like real beginner-accessible options rather than private membership-only or advanced-only programs.","description":"The 10 places are sourced from public pages that make them look like real beginner-accessible options rather than private 
membership-only or advanced-only programs.\n\nHow a grader verifies this: Check that each venue entry is supported by a public-facing page indicating workshops, classes, drop-ins, or other adult participation access.","max_points":140},{"criterion":"The most useful workshop or class pages for the strongest 5 options are opened and kept available.","description":"The most useful workshop or class pages for the strongest 5 options are opened and kept available.\n\nHow a grader verifies this: Confirm that 5 relevant class or workshop tabs remain open for the shortlisted options.","max_points":140},{"criterion":"For at least 3 of those stronger options, a map, photo, or location page is also opened so the user can sanity-check the vibe and area.","description":"For at least 3 of those stronger options, a map, photo, or location page is also opened so the user can sanity-check the vibe and area.\n\nHow a grader verifies this: Confirm at least 3 additional open tabs show map, photo, or location evidence corresponding to shortlisted venues.","max_points":100},{"criterion":"The final recommendations include exactly 4 places matched to the requested use cases: best low-pressure first try, best date-night or group outing, best hands-on craft skill builder, and best overall option for doing this regularly.","description":"The final recommendations include exactly 4 places matched to the requested use cases: best low-pressure first try, best date-night or group outing, best hands-on craft skill builder, and best overall option for doing this regularly.\n\nHow a grader verifies this: Check that there are exactly 4 recommendation labels and each is assigned to one venue with the requested category names.","max_points":120},{"criterion":"The recommendations are grounded in the comparison details rather than generic opinion, using the gathered information about medium, format, pricing, accessibility, or attendance flow.","description":"The recommendations are grounded in the 
comparison details rather than generic opinion, using the gathered information about medium, format, pricing, accessibility, or attendance flow.\n\nHow a grader verifies this: Review the recommendation rationale and confirm it explicitly cites comparison factors gathered during browsing.","max_points":100}]}} +{"task_id":"2dab64ba576aeaa10453c0c642792f5ac71fb646","confirmed_task":"I’m thinking about a future long weekend in Colorado Springs and I don’t just want a quick list of three ideas — I want to know whether the city actually has enough variety to build a fun, realistic plan. Please start with Springs magazine or similar local-guide coverage to discover what’s happening and what people recommend, then build me a Colorado Springs weekend playbook using only public pages. I want at least 12 total options, with exactly 3 free or very low-cost activities/resources, exactly 3 food or drink stops, exactly 3 outdoor options, and exactly 3 current or recurring local events, seasonal attractions, or neighborhood-specific things to do. For each option, verify it on an official or primary public page when possible and note the title, what category it fits, the neighborhood or area, price if shown, hours or timing if shown, and whether advance booking seems needed or not shown. Open and keep the most useful evidence tabs available as you go, including at least 2 local-guide pages, at least 4 official venue or organizer pages, and at least 3 map-based pages so I can sanity-check location clusters. After that, compare the 12 options and turn them into 2 different weekend plans: one budget-friendly plan and one more experience-focused plan, with each plan covering Friday evening, Saturday daytime, Saturday evening, and Sunday daytime. Make the plans geographically sensible instead of bouncing all over town, and mention where there are obvious backups if something appears closed or weather-dependent. 
Finish with a clear recommendation on whether Colorado Springs looks like a strong choice for that kind of weekend, plus the 5 strongest pages left open for me to review. Can you also make sure to plan out the dining itinerary for the weekend under these guidelines? Find high quality, can't miss restaurants for every meal of the trip and write them in CryptoPad Spreadsheet.","website":"https://springsmag.com","level":"hard","reference_length":28,"precomputed_rubric":{"items":[{"criterion":"The final result identifies at least 12 total Colorado Springs options with exactly 3 free or very low-cost activities/resources, exactly 3 food or drink stops, exactly 3 outdoor options, and exactly 3 current or recurring local events, seasonal attractions, or neighborhood-specific things to do.","description":"The final result identifies at least 12 total Colorado Springs options with exactly 3 free or very low-cost activities/resources, exactly 3 food or drink stops, exactly 3 outdoor options, and exactly 3 current or recurring local events, seasonal attractions, or neighborhood-specific things to do.\n\nHow a grader verifies this: Count the listed options and confirm the category totals match 3/3/3/3 exactly.","max_points":180},{"criterion":"Each of the 12 options includes the requested recorded details: title, category, neighborhood or area, price if shown, hours or timing if shown, and whether advance booking seems needed or not shown.","description":"Each of the 12 options includes the requested recorded details: title, category, neighborhood or area, price if shown, hours or timing if shown, and whether advance booking seems needed or not shown.\n\nHow a grader verifies this: Inspect each option entry and confirm every requested field is present, using 'not shown' where necessary.","max_points":160},{"criterion":"Discovery starts from Springs magazine or similar local-guide coverage, and each option is verified on an official or primary public page when 
possible.","description":"Discovery starts from Springs magazine or similar local-guide coverage, and each option is verified on an official or primary public page when possible.\n\nHow a grader verifies this: Check that local-guide pages were used for discovery and that the listed options rely on corresponding official or primary public pages where available.","max_points":140},{"criterion":"Useful browser evidence is kept open, including at least 2 local-guide pages, at least 4 official venue or organizer pages, and at least 3 map-based pages.","description":"Useful browser evidence is kept open, including at least 2 local-guide pages, at least 4 official venue or organizer pages, and at least 3 map-based pages.\n\nHow a grader verifies this: Review the open tabs and confirm the minimum counts for local-guide, official/organizer, and map-based pages are met.","max_points":120},{"criterion":"The findings are synthesized into 2 different weekend plans, one budget-friendly and one more experience-focused, and each plan covers Friday evening, Saturday daytime, Saturday evening, and Sunday daytime.","description":"The findings are synthesized into 2 different weekend plans, one budget-friendly and one more experience-focused, and each plan covers Friday evening, Saturday daytime, Saturday evening, and Sunday daytime.\n\nHow a grader verifies this: Check that both plans exist, are distinct, and each includes all four required time blocks.","max_points":140},{"criterion":"The weekend plans are geographically sensible, mention obvious backups for closed or weather-dependent options, and end with a clear recommendation on whether Colorado Springs is a strong choice for that kind of weekend, with the 5 strongest pages left open for review.","description":"The weekend plans are geographically sensible, mention obvious backups for closed or weather-dependent options, and end with a clear recommendation on whether Colorado Springs is a strong choice for that kind of weekend, 
with the 5 strongest pages left open for review.\n\nHow a grader verifies this: Confirm the write-up discusses geographic clustering and backups, gives a clear recommendation, and leaves 5 strong pages open.","max_points":120},{"criterion":"A dining itinerary is planned for every meal of the weekend using high-quality, can't-miss restaurants, and those meal recommendations are written into a CryptoPad Spreadsheet.","description":"A dining itinerary is planned for every meal of the weekend using high-quality, can't-miss restaurants, and those meal recommendations are written into a CryptoPad Spreadsheet.\n\nHow a grader verifies this: The final deliverables include a meal-by-meal restaurant plan for the weekend and an open CryptoPad Spreadsheet recording those dining recommendations.","max_points":140}]}} +{"task_id":"9825e1505ad6eacf0ae2af5709a74d9eb0280029","confirmed_task":"I’m trying to figure out where I could realistically apply in Australia as a New Zealand-qualified lawyer with about three years of post-qualification experience, and I want a serious browser-based market sweep rather than a skim of one recruiter site. Please start with Legal People Australia, but then widen naturally to other Australian legal recruiters, major job boards, and public law-firm or in-house careers pages so we’re not missing obvious opportunities. Build me a shortlist of exactly 18 to 24 current roles that are genuinely relevant to someone around 2 to 4 PQE, and only include roles where the listing either explicitly welcomes New Zealand qualification or NZ admission, is silent but otherwise looks plausibly transferable, or clearly states Australian admission is required so that distinction is visible. 
For each shortlisted role, record the job title, employer or recruiter, city/state, practice area, stated PQE or experience range, whether NZ-qualified or NZ-admitted candidates are explicitly mentioned, whether Australian admission appears mandatory, whether a practising certificate or relocation detail is mentioned, and the application link; if a field is missing, mark it as not shown. Keep the work grounded in public pages only. Open and keep available the strongest evidence tabs for at least 8 representative roles spread across different cities or practice areas so I can sanity-check them later, including a mix of recruiter listings and direct employer pages if available. As you go, compare patterns across the market and separate the shortlist into three buckets: clearly eligible now, likely eligible but needs admission clarification, and probably not viable without current Australian admission. Finish with one organized tracker or memo that includes all 18 to 24 roles plus a concise recommendation on where I should focus first by city and practice area, and leave the finished tracker and the most useful evidence tabs open. 
I would prefer a CrpytoPad Spreadsheet for this tracker.","website":"https://www.legalpeople.com.au","level":"hard","reference_length":3,"precomputed_rubric":{"items":[{"criterion":"A final organized tracker or memo is produced and includes exactly 18 to 24 current Australian legal roles relevant to a New Zealand-qualified lawyer with about 2 to 4 PQE.","description":"A final organized tracker or memo is produced and includes exactly 18 to 24 current Australian legal roles relevant to a New Zealand-qualified lawyer with about 2 to 4 PQE.\n\nHow a grader verifies this: Check that the final artifact exists, is left open, and contains between 18 and 24 distinct roles that match the stated candidate profile and Australian market scope.","max_points":200},{"criterion":"The search starts with Legal People Australia and then widens to other Australian legal recruiters, major job boards, and public law-firm or in-house careers pages rather than staying on one site.","description":"The search starts with Legal People Australia and then widens to other Australian legal recruiters, major job boards, and public law-firm or in-house careers pages rather than staying on one site.\n\nHow a grader verifies this: Confirm the browsing history or open tabs show Legal People plus additional public sources from at least two other source types named in the prompt, with roles drawn from that broader sweep.","max_points":150},{"criterion":"Each shortlisted role records the requested fields: job title, employer or recruiter, city/state, practice area, stated PQE or experience range, whether NZ-qualified or NZ-admitted candidates are explicitly mentioned, whether Australian admission appears mandatory, whether a practising certificate or relocation detail is mentioned, and the application link, using 'not shown' where needed.","description":"Each shortlisted role records the requested fields: job title, employer or recruiter, city/state, practice area, stated PQE or experience range, 
whether NZ-qualified or NZ-admitted candidates are explicitly mentioned, whether Australian admission appears mandatory, whether a practising certificate or relocation detail is mentioned, and the application link, using 'not shown' where needed.\n\nHow a grader verifies this: Sample the final tracker and verify that every included role has all required fields populated or marked 'not shown' exactly as requested.","max_points":200},{"criterion":"The shortlist includes only roles that fit one of the three requested relevance conditions: explicitly welcoming NZ qualification/NZ admission, silent but plausibly transferable, or clearly requiring Australian admission so that distinction is visible.","description":"The shortlist includes only roles that fit one of the three requested relevance conditions: explicitly welcoming NZ qualification/NZ admission, silent but plausibly transferable, or clearly requiring Australian admission so that distinction is visible.\n\nHow a grader verifies this: Review the role notes and source pages to confirm each shortlisted listing is categorized on one of those stated eligibility bases rather than included without explanation.","max_points":150},{"criterion":"At least 8 representative evidence tabs are kept available, spread across different cities or practice areas, and include a mix of recruiter listings and direct employer pages if available.","description":"At least 8 representative evidence tabs are kept available, spread across different cities or practice areas, and include a mix of recruiter listings and direct employer pages if available.\n\nHow a grader verifies this: Inspect the remaining open tabs and confirm there are at least 8 relevant evidence pages spanning multiple cities or practice areas, with both recruiter and direct employer sources represented when available.","max_points":150},{"criterion":"The final output separates all shortlisted roles into the three requested buckets—clearly eligible now, likely eligible 
but needs admission clarification, and probably not viable without current Australian admission—and ends with a concise recommendation on where to focus first by city and practice area, and the organized tracker is produced as a CryptoPad Spreadsheet.","description":"The final output separates all shortlisted roles into the three requested buckets—clearly eligible now, likely eligible but needs admission clarification, and probably not viable without current Australian admission—and ends with a concise recommendation on where to focus first by city and practice area, and the organized tracker is produced as a CryptoPad Spreadsheet.\n\nHow a grader verifies this: Check the final tracker or memo for the three explicit buckets and a closing recommendation that prioritizes target cities and practice areas based on the gathered listings; the final deliverable includes an open CryptoPad Spreadsheet with the requested role tracker.","max_points":150}]}} +{"task_id":"5aaab80dfeee2bad47c26c6a9b706f3dbaf76ab2","confirmed_task":"I want to buy one compact electric cooker for a small kitchen, but I don’t want a generic pick—I want the best realistic option for making rice several times a week and steaming vegetables in the same appliance. Please do a serious browser-based comparison of exactly 8 compact electric multicookers in roughly the 2.5- to 4-quart range, starting with Amazon but expanding to manufacturer pages and other major public retailer pages when that helps verify specs, pricing, or bundle contents. For every model, capture the current product name, approximate price, stated capacity, overall dimensions or footprint, whether it has a dedicated rice program or rice guidance, whether it includes a steaming rack or basket, inner-pot material if shown, dishwasher-cleaning claims if shown, and any clearly stated safety features. 
Then go one layer deeper on the strongest candidates by opening the manual or official product page when available so you can verify how rice cooking and vegetable steaming are actually described, not just the retailer bullet points. I also want you to check at least 3 credible third-party review or comparison sources and use them only to help evaluate real-world usability issues like rice texture, steam performance, cleaning annoyance, nonstick concerns, slow preheat, or confusing controls. At the end, narrow the field to the best 3 options for my use case, explicitly naming the best overall pick, the best budget pick, and the best easiest-to-clean pick. Keep the final 3 product pages open, plus at least 2 supporting evidence tabs such as a manual, official spec page, or review page that helped drive the decision, and give me a concise final comparison that makes clear why the winner beats the others for rice-and-vegetable cooking in a small kitchen.","website":"https://www.amazon.com","level":"hard","reference_length":9,"precomputed_rubric":{"items":[{"criterion":"Exactly 8 compact electric multicookers in roughly the 2.5- to 4-quart range are compared.","description":"Exactly 8 compact electric multicookers in roughly the 2.5- to 4-quart range are compared.\n\nHow a grader verifies this: The final comparison includes 8 distinct models, each clearly identified by product name and falling within the requested capacity band as shown on public product pages.","max_points":180},{"criterion":"For every one of the 8 models, the comparison captures the requested product-page details: approximate price, stated capacity, dimensions or footprint, rice-program or rice-guidance status, included steaming rack or basket status, inner-pot material if shown, dishwasher-cleaning claims if shown, and clearly stated safety features.","description":"For every one of the 8 models, the comparison captures the requested product-page details: approximate price, stated capacity, 
dimensions or footprint, rice-program or rice-guidance status, included steaming rack or basket status, inner-pot material if shown, dishwasher-cleaning claims if shown, and clearly stated safety features.\n\nHow a grader verifies this: Each of the 8 entries records those fields from retailer or official product pages, with 'not shown' used only where the page does not provide the information.","max_points":200},{"criterion":"The strongest candidates are checked one level deeper using manuals or official product pages to verify how rice cooking and vegetable steaming are actually described.","description":"The strongest candidates are checked one level deeper using manuals or official product pages to verify how rice cooking and vegetable steaming are actually described.\n\nHow a grader verifies this: Manuals and/or official spec pages are opened for the leading candidates, and the final reasoning cites those sources for rice and steaming behavior rather than relying only on retailer bullets.","max_points":170},{"criterion":"At least 3 credible third-party review or comparison sources are checked to evaluate real-world usability issues such as rice texture, steam performance, cleaning annoyance, nonstick concerns, slow preheat, or confusing controls.","description":"At least 3 credible third-party review or comparison sources are checked to evaluate real-world usability issues such as rice texture, steam performance, cleaning annoyance, nonstick concerns, slow preheat, or confusing controls.\n\nHow a grader verifies this: The browsing session includes at least 3 review/comparison sources, and the final synthesis uses them for usability tradeoffs that go beyond raw specs.","max_points":140},{"criterion":"The field is narrowed to the best 3 options for this use case, explicitly identifying the best overall pick, the best budget pick, and the best easiest-to-clean pick.","description":"The field is narrowed to the best 3 options for this use case, explicitly 
identifying the best overall pick, the best budget pick, and the best easiest-to-clean pick.\n\nHow a grader verifies this: The final answer presents exactly 3 shortlisted models and labels them with the three requested recommendation categories.","max_points":180},{"criterion":"Browser evidence is left visible by keeping open the final 3 product pages plus at least 2 supporting evidence tabs such as a manual, official spec page, or review page that materially informed the decision.","description":"Browser evidence is left visible by keeping open the final 3 product pages plus at least 2 supporting evidence tabs such as a manual, official spec page, or review page that materially informed the decision.\n\nHow a grader verifies this: Open tabs at the end include the 3 shortlisted product pages and at least 2 relevant support pages tied to the final recommendation.","max_points":130}]}} +{"task_id":"7f9a7d1771191e31cdb77715a4207fcd469346ca","confirmed_task":"I’m trying to plan a future trip from Kuala Lumpur to airport code LBJ, and I don’t just want one snapshot fare search — I want a real browser-based comparison that helps me decide when and how to book. First, confirm which airport LBJ refers to on public flight sites, then search for this route across at least 12 plausible future departure date combinations spread over multiple weeks or months so we can see whether true direct service actually exists and when it tends to be cheapest. Prioritize nonstop options whenever they are real, but if nonstop service is missing or clearly poor value on some dates, include the strongest one-stop fallback options for those same windows so I can compare the tradeoff. For each of the 12 date checks, capture the cheapest viable option, total travel time, airline, and whether it is nonstop or one-stop, and cross-check the most promising results on at least two public flight-search or airline pages rather than trusting a single source. 
Then narrow that down to the 6 strongest overall options and compare the important booking details on public pages: baggage allowance, refund or change flexibility if shown, and the booking source or airline page that appears most trustworthy. Keep the most useful search tabs open, including at least 3 tabs showing the best nonstop candidates and at least 2 tabs showing the best one-stop backups, so I can inspect them later. Finish with a concise recommendation that names the best overall option, the best cheapest nonstop option if different, and the best value fallback if I need to compromise on stops or travel time, with all prices, durations, and any missing details marked as not shown.","website":"https://www.google.com","level":"hard","reference_length":5,"precomputed_rubric":{"items":[{"criterion":"The task confirms which airport code 'LBJ' refers to on public flight pages before doing the comparison.","description":"The task confirms which airport code 'LBJ' refers to on public flight pages before doing the comparison.\n\nHow a grader verifies this: Final output explicitly identifies the airport represented by LBJ and cites or references the public flight-search evidence used to confirm it.","max_points":120},{"criterion":"At least 12 plausible future departure date combinations are searched for the KUL-to-LBJ route across multiple weeks or months.","description":"At least 12 plausible future departure date combinations are searched for the KUL-to-LBJ route across multiple weeks or months.\n\nHow a grader verifies this: Final comparison includes 12 distinct future date checks with recorded result details for each search window.","max_points":180},{"criterion":"Each of the 12 date checks records the cheapest viable option, total travel time, airline, and whether it is nonstop or one-stop.","description":"Each of the 12 date checks records the cheapest viable option, total travel time, airline, and whether it is nonstop or one-stop.\n\nHow a grader verifies 
this: The final comparison table or memo contains all four required fields for all 12 date checks, using 'not shown' where needed.","max_points":180},{"criterion":"Nonstop options are prioritized when real, but strong one-stop fallback options are also included for the same windows when nonstop service is missing or poor value.","description":"Nonstop options are prioritized when real, but strong one-stop fallback options are also included for the same windows when nonstop service is missing or poor value.\n\nHow a grader verifies this: The results clearly distinguish nonstop versus one-stop options and include one-stop alternatives where relevant instead of only listing nonstops.","max_points":160},{"criterion":"The most promising results are cross-checked on at least two public flight-search or airline pages, and the 6 strongest overall options are narrowed down for closer comparison.","description":"The most promising results are cross-checked on at least two public flight-search or airline pages, and the 6 strongest overall options are narrowed down for closer comparison.\n\nHow a grader verifies this: The final output identifies 6 shortlisted options and notes cross-check evidence from at least two public sources for the leading candidates.","max_points":160},{"criterion":"For the 6 strongest overall options, the comparison includes baggage allowance, refund or change flexibility if shown, and the most trustworthy booking source or airline page.","description":"For the 6 strongest overall options, the comparison includes baggage allowance, refund or change flexibility if shown, and the most trustworthy booking source or airline page.\n\nHow a grader verifies this: Each of the 6 shortlisted options contains those booking-detail fields, with 'not shown' used when a public page does not provide them.","max_points":100},{"criterion":"The most useful tabs are left open, including at least 3 best nonstop candidate tabs and at least 2 best one-stop backup tabs, and 
the final recommendation names the best overall option, the best cheapest nonstop option if different, and the best value fallback.","description":"The most useful tabs are left open, including at least 3 best nonstop candidate tabs and at least 2 best one-stop backup tabs, and the final recommendation names the best overall option, the best cheapest nonstop option if different, and the best value fallback.\n\nHow a grader verifies this: Open tabs visibly include the required nonstop and one-stop evidence pages, and the final written recommendation contains all three requested recommendation labels with price and duration details.","max_points":100}]}} +{"task_id":"58c130841b267ab28c26fc1df1a2bdfa00b3b5ba","confirmed_task":"I’m trying to make a serious Apple-only buying decision for a small three-role setup in the UK, and I want you to do the whole comparison in the browser instead of just glancing at one MacBook Pro page. Please start from Apple UK’s Mac pages and compare at least 8 real configurations drawn across the current MacBook Air, MacBook Pro, iMac, Mac mini, and Mac Studio families, using only public Apple pages. I need this organized around exactly three roles: a travel-heavy general user, a software developer, and a photo/video creator. For each role, figure out which Mac is the best fit and which lower-cost fallback is still reasonable, and note the key reasons from the official specs pages such as chip tier, memory, storage, display, ports, battery, and external-display support whenever Apple shows it. As you work, keep the most useful product pages open for the finalist options, including at least one MacBook Pro page and the main Mac lineup page, so I can review the evidence. After the comparison, build one draft Apple UK shopping bag with exactly 3 chosen Macs total, one for each role, and add AppleCare+ for each if publicly available on the path you use. 
Also check whether each chosen Mac plausibly needs any first-party accessory from Apple, like an adapter, mouse, keyboard, or external display, and add only the accessories that are clearly justified by the role; if something is not shown publicly, say not shown rather than guessing. In the end, give me a concise final buying recommendation that includes the 8 compared configurations, the 3 selected Macs, any added AppleCare+ or accessories, the estimated subtotal from the bag if visible, and the main tradeoffs you used to decide. Leave the final bag open along with the key comparison tabs. Give me the different options and viable options you drew up in a CryptoPad spreadsheet.","website":"https://www.apple.com","level":"hard","reference_length":10,"precomputed_rubric":{"items":[{"criterion":"The browsing session compares at least 8 real configurations across the current MacBook Air, MacBook Pro, iMac, Mac mini, and Mac Studio families using public Apple UK pages.","description":"The browsing session compares at least 8 real configurations across the current MacBook Air, MacBook Pro, iMac, Mac mini, and Mac Studio families using public Apple UK pages.\n\nHow a grader verifies this: Final output lists 8 or more specific compared configurations and shows coverage across the named Mac families from Apple UK product/configuration pages.","max_points":180},{"criterion":"The comparison is organized around exactly three roles: a travel-heavy general user, a software developer, and a photo/video creator, with one best-fit Mac and one lower-cost fallback identified for each role.","description":"The comparison is organized around exactly three roles: a travel-heavy general user, a software developer, and a photo/video creator, with one best-fit Mac and one lower-cost fallback identified for each role.\n\nHow a grader verifies this: Final recommendation includes all 3 roles and, for each one, names both a best-fit option and a lower-cost fallback tied to that 
role.","max_points":180},{"criterion":"For the role-based decisions, the key reasons are drawn from official specs pages and cover the requested categories when shown: chip tier, memory, storage, display, ports, battery, and external-display support, and the compared configurations and viable options are also captured in a CryptoPad Spreadsheet.","description":"For the role-based decisions, the key reasons are drawn from official specs pages and cover the requested categories when shown: chip tier, memory, storage, display, ports, battery, and external-display support, and the compared configurations and viable options are also captured in a CryptoPad Spreadsheet.\n\nHow a grader verifies this: Each chosen role recommendation includes spec-based reasoning from Apple pages, and missing fields are explicitly marked as not shown rather than inferred; an open CryptoPad Spreadsheet records the compared Mac configurations and viable options.","max_points":160},{"criterion":"The browser keeps the most useful product pages open for the finalist options, including at least one MacBook Pro page and the main Mac lineup page.","description":"The browser keeps the most useful product pages open for the finalist options, including at least one MacBook Pro page and the main Mac lineup page.\n\nHow a grader verifies this: Open tabs at the end include the main Mac lineup page, at least one MacBook Pro page, and additional finalist product/spec pages used in the comparison.","max_points":120},{"criterion":"A draft Apple UK shopping bag is built with exactly 3 chosen Macs total, one for each role, and AppleCare+ is added for each if publicly available on the path used.","description":"A draft Apple UK shopping bag is built with exactly 3 chosen Macs total, one for each role, and AppleCare+ is added for each if publicly available on the path used.\n\nHow a grader verifies this: The final bag shows 3 Macs corresponding to the 3 roles, and AppleCare+ appears for each where available; if 
not available, that is explicitly noted.","max_points":180},{"criterion":"The session checks whether each chosen Mac plausibly needs any first-party Apple accessory and adds only clearly justified accessories, using not shown for anything unavailable publicly.","description":"The session checks whether each chosen Mac plausibly needs any first-party Apple accessory and adds only clearly justified accessories, using not shown for anything unavailable publicly.\n\nHow a grader verifies this: Final output and/or bag include only role-justified first-party accessories, with unsupported or unavailable details marked not shown instead of guessed.","max_points":80},{"criterion":"The end result gives a concise final buying recommendation including the 8 compared configurations, the 3 selected Macs, any added AppleCare+ or accessories, the estimated subtotal from the bag if visible, and the main tradeoffs, with the final bag left open.","description":"The end result gives a concise final buying recommendation including the 8 compared configurations, the 3 selected Macs, any added AppleCare+ or accessories, the estimated subtotal from the bag if visible, and the main tradeoffs, with the final bag left open.\n\nHow a grader verifies this: Final summary contains all requested elements and the bag remains open at the end as the decision endpoint.","max_points":100}]}} +{"task_id":"304ff89d7a6a6df597aafaf720ba11f6c49214c3","confirmed_task":"I need help choosing an emerald green dress in women’s size 12 as a birthday gift for my sister, and I want this done like a real shopping decision instead of just grabbing the first result. Start with PrettyLittleThing since that’s where I first looked, but please widen out to other mainstream public retailers if they have stronger options. 
Find 12 total candidate dresses across at least 6 different retailers, and only include items that are explicitly shown in or clearly selectable to size 12; if some useful detail like fabric or exact length is not shown, just note it as not shown instead of guessing. For each dress, compare the product photos, silhouette, occasion level, price, any visible reviews or ratings, color wording if it’s close to emerald, shipping timing if publicly shown, and the retailer’s return policy from the public site. I want you to separate the 12 candidates into three groups of exactly 4 each: best value, best for a dressier birthday dinner or night out, and safest gift choice if I’m unsure about her exact style. Keep the product pages open for the best 6 finalists, and also keep open the key return-policy or delivery-info pages for at least 3 of the retailers so I can sanity-check the buying risk. At the end, recommend exactly 3 dresses ranked in order, explain why each made the cut, call out any tradeoffs like final sale or slow shipping, and leave open the single best buy plus the two strongest backup product pages.","website":"https://www.prettylittlething.com","level":"hard","reference_length":3,"precomputed_rubric":{"items":[{"criterion":"Exactly 12 candidate emerald-green-or-close dresses are identified across at least 6 different retailers, and each candidate is explicitly shown in or clearly selectable to women’s size 12.","description":"Exactly 12 candidate emerald-green-or-close dresses are identified across at least 6 different retailers, and each candidate is explicitly shown in or clearly selectable to women’s size 12.\n\nHow a grader verifies this: Count the candidate dresses and retailers used, and confirm each included product page shows size 12 availability or selection rather than an inferred fit.","max_points":200},{"criterion":"Each of the 12 candidates is compared on the requested fields: product photos, silhouette, occasion level, price, any visible 
reviews or ratings, color wording, shipping timing if publicly shown, and the retailer’s public return policy, with missing fields marked not shown where needed.","description":"Each of the 12 candidates is compared on the requested fields: product photos, silhouette, occasion level, price, any visible reviews or ratings, color wording, shipping timing if publicly shown, and the retailer’s public return policy, with missing fields marked not shown where needed.\n\nHow a grader verifies this: Review the final comparison output and confirm every candidate includes all requested comparison dimensions or an explicit not shown note for unavailable details.","max_points":200},{"criterion":"The 12 candidates are organized into exactly three groups of 4 each: best value, best for a dressier birthday dinner or night out, and safest gift choice if the sister’s exact style is uncertain.","description":"The 12 candidates are organized into exactly three groups of 4 each: best value, best for a dressier birthday dinner or night out, and safest gift choice if the sister’s exact style is uncertain.\n\nHow a grader verifies this: Check that there are exactly 3 labeled groups and exactly 4 dresses in each group, with no missing or extra entries.","max_points":150},{"criterion":"The browser evidence is preserved by keeping open the product pages for the best 6 finalists and the key return-policy or delivery-info pages for at least 3 retailers.","description":"The browser evidence is preserved by keeping open the product pages for the best 6 finalists and the key return-policy or delivery-info pages for at least 3 retailers.\n\nHow a grader verifies this: Inspect open tabs to confirm 6 finalist product pages remain open and that at least 3 retailer policy or delivery-information pages are also open.","max_points":150},{"criterion":"A final recommendation of exactly 3 dresses is provided in ranked order, with a short explanation for why each made the cut and explicit tradeoffs such as 
final sale, weak reviews, limited size range, or slower shipping where applicable.","description":"A final recommendation of exactly 3 dresses is provided in ranked order, with a short explanation for why each made the cut and explicit tradeoffs such as final sale, weak reviews, limited size range, or slower shipping where applicable.\n\nHow a grader verifies this: Check that the final recommendation contains exactly 3 ranked dresses and that each entry includes both a positive rationale and any relevant tradeoff notes.","max_points":180},{"criterion":"The session ends with the single best buy and the two strongest backup product pages left open for direct purchase review.","description":"The session ends with the single best buy and the two strongest backup product pages left open for direct purchase review.\n\nHow a grader verifies this: Inspect the final open tabs and confirm that the top-ranked dress page plus the 2 backup product pages are still open.","max_points":120}]}} +{"task_id":"24705fdf2d165f4f844984e62b4511396a88206d","confirmed_task":"I’m trying to figure out whether I can realistically handle a failing instrument cluster on my 2012 Mercedes GLK350 myself or whether I should pay a shop, so please do a serious browser-based repair-prep session instead of just summarizing one video. Start by finding and comparing at least 7 public sources on the actual cluster-removal process for this exact model or the closest clearly compatible X204/GLK fit, including at least 2 videos, at least 2 forum or owner-discussion threads, at least 1 parts diagram or exploded-view page, and at least 1 written guide or listing that helps confirm how the cluster is mounted. From those sources, build one consolidated step-by-step removal and reinstall explanation that calls out the tool sizes or trim tools when shown, battery or airbag-related precautions if mentioned, hidden clips or fasteners, and any disagreements between sources. 
Then research replacement paths on public pages by comparing at least 5 cluster options total across used, rebuilt, and OEM-or-OE-style listings, and note the part number when shown, fitment notes, price, mileage disclosure if applicable, warranty or return details if shown, and whether the listing or surrounding sources suggest coding, VIN matching, immobilizer issues, or odometer problems. After that, look for at least 3 public repair-shop or cluster-rebuild pages that could serve as a professional fallback, and capture what each one publicly shows about Mercedes cluster service, turnaround, pricing guidance if shown, or contact limitations if pricing is not shown. Keep the most useful evidence tabs open for the best removal video, the clearest forum thread, one parts-diagram page, the 3 strongest replacement listings, and the 2 most credible professional-service pages. Finish with a practical decision memo that tells me whether the evidence supports DIY removal only, full DIY replacement, or using a pro, and include a concise risk checklist, a shopping shortlist of the best 3 replacement options, and a final recommendation based only on what the public pages actually support.","website":"https://www.youtube.com","level":"hard","reference_length":5,"precomputed_rubric":{"items":[{"criterion":"At least 7 public sources are used for the removal-process research, including at least 2 videos, at least 2 forum or owner-discussion threads, at least 1 parts diagram or exploded-view page, and at least 1 written guide or listing that helps confirm mounting/removal details.","description":"At least 7 public sources are used for the removal-process research, including at least 2 videos, at least 2 forum or owner-discussion threads, at least 1 parts diagram or exploded-view page, and at least 1 written guide or listing that helps confirm mounting/removal details.\n\nHow a grader verifies this: The browsing session and final memo show these source types and counts, with 
source-specific evidence visible from the pages used.","max_points":180},{"criterion":"A single consolidated step-by-step explanation for removing and reinstalling the 2012 Mercedes GLK350 instrument cluster is produced, and it includes tools when shown, battery or airbag-related precautions if mentioned, hidden clips or fasteners, and any disagreements between sources.","description":"A single consolidated step-by-step explanation for removing and reinstalling the 2012 Mercedes GLK350 instrument cluster is produced, and it includes tools when shown, battery or airbag-related precautions if mentioned, hidden clips or fasteners, and any disagreements between sources.\n\nHow a grader verifies this: The final memo contains the full synthesized procedure and explicitly notes cautions, hidden hardware, and source disagreements rather than only linking pages.","max_points":200},{"criterion":"At least 5 replacement cluster options are compared across used, rebuilt, and OEM-or-OE-style listings, with part number when shown, fitment notes, price, mileage disclosure if applicable, warranty or return details if shown, and any coding/VIN/immobilizer/odometer notes supported by the public pages.","description":"At least 5 replacement cluster options are compared across used, rebuilt, and OEM-or-OE-style listings, with part number when shown, fitment notes, price, mileage disclosure if applicable, warranty or return details if shown, and any coding/VIN/immobilizer/odometer notes supported by the public pages.\n\nHow a grader verifies this: The final comparison includes 5 or more listings spanning the requested replacement categories and records the requested fields as shown or 'not shown.'","max_points":220},{"criterion":"At least 3 public repair-shop or cluster-rebuild pages are reviewed as professional fallback options, capturing what each page shows about Mercedes cluster service, turnaround, pricing guidance if shown, or contact limitations when pricing is not 
shown.","description":"At least 3 public repair-shop or cluster-rebuild pages are reviewed as professional fallback options, capturing what each page shows about Mercedes cluster service, turnaround, pricing guidance if shown, or contact limitations when pricing is not shown.\n\nHow a grader verifies this: The final memo lists 3 or more professional-service options and records the requested public-page details for each.","max_points":140},{"criterion":"The most useful evidence tabs are kept open for the best removal video, the clearest forum thread, one parts-diagram page, the 3 strongest replacement listings, and the 2 most credible professional-service pages.","description":"The most useful evidence tabs are kept open for the best removal video, the clearest forum thread, one parts-diagram page, the 3 strongest replacement listings, and the 2 most credible professional-service pages.\n\nHow a grader verifies this: Open tabs at the end correspond to these exact evidence categories and quantities.","max_points":120},{"criterion":"The session ends with a practical decision memo that recommends DIY removal only, full DIY replacement, or using a pro, and it includes a concise risk checklist plus a shopping shortlist of the best 3 replacement options.","description":"The session ends with a practical decision memo that recommends DIY removal only, full DIY replacement, or using a pro, and it includes a concise risk checklist plus a shopping shortlist of the best 3 replacement options.\n\nHow a grader verifies this: The final memo contains one of the three requested recommendation outcomes, a risk checklist, and a clearly labeled top-3 replacement shortlist tied to the browsing evidence.","max_points":140}]}} +{"task_id":"c4b9782b4f76679169f1438e1e70addd96d7b518","confirmed_task":"I’m trying to find a pair of women’s bootcut jeans that I’d actually buy, and my requirements are annoyingly specific: long length, dark denim wash, and frayed hems. 
Don’t just check one store. Use the public web to build me a serious comparison across 10 to 12 viable pairs from brands that are realistically buyable online in the U.S., including Abercrombie if they have a match. For each candidate, verify from the product page whether it clearly offers a long or tall length, a dark wash, and a frayed/raw hem; if one of those details is unclear, mark it as not shown instead of guessing. While comparing them, pay close attention to the details that will actually affect whether I’d keep them: inseam or length info, rise, stretch or fabric composition, price, available sizes, and whether customer photos/reviews make the wash and hem look the way the product page claims. Also check the return policy on each brand’s public site so I can avoid something that would be a pain to send back. Narrow everything down to the best 4 finalists, and for those 4, keep the product tabs open along with the most useful size-guide or return-policy tabs. In the end, give me one clear best overall pick, one best value pick, and one safest-to-order pick, with short reasons tied to the evidence you found.","website":"https://www.abercrombie.com","level":"hard","reference_length":7,"precomputed_rubric":{"items":[{"criterion":"The browsing session identifies 10 to 12 viable jean candidates from publicly accessible U.S. retail pages, including Abercrombie if it has a match.","description":"The browsing session identifies 10 to 12 viable jean candidates from publicly accessible U.S. 
retail pages, including Abercrombie if it has a match.\n\nHow a grader verifies this: Count the final candidate set and confirm each item is a distinct public product page from a buyable online retailer; confirm Abercrombie is included if a qualifying or near-qualifying candidate was found there.","max_points":180},{"criterion":"For every candidate, the comparison explicitly checks long or tall length, dark wash, and frayed or raw hem status from the product page, using 'not shown' where the page does not clearly state one of those details.","description":"For every candidate, the comparison explicitly checks long or tall length, dark wash, and frayed or raw hem status from the product page, using 'not shown' where the page does not clearly state one of those details.\n\nHow a grader verifies this: Inspect the final comparison and confirm all candidates have entries for the three requested criteria with no guessed values where evidence is missing.","max_points":200},{"criterion":"The comparison records the practical purchase details the prompt asked for for each candidate: inseam or length info, rise, stretch or fabric composition, price, available sizes, and whether customer photos or reviews support the claimed wash and hem appearance.","description":"The comparison records the practical purchase details the prompt asked for for each candidate: inseam or length info, rise, stretch or fabric composition, price, available sizes, and whether customer photos or reviews support the claimed wash and hem appearance.\n\nHow a grader verifies this: Check that each candidate includes all listed comparison fields or 'not shown' where applicable, and that review/photo observations are tied to the actual product listing context.","max_points":200},{"criterion":"A public-site return-policy check is completed for each brand represented in the candidate set so the user can judge return friction before ordering.","description":"A public-site return-policy check is completed for 
each brand represented in the candidate set so the user can judge return friction before ordering.\n\nHow a grader verifies this: Confirm that each brand in the comparison has a corresponding return-policy finding from a public page and that the result is incorporated into the evaluation.","max_points":120},{"criterion":"The candidate pool is narrowed to exactly 4 finalists, and the most useful evidence tabs are left open for those finalists, including the product pages and any key size-guide or return-policy pages used to judge them.","description":"The candidate pool is narrowed to exactly 4 finalists, and the most useful evidence tabs are left open for those finalists, including the product pages and any key size-guide or return-policy pages used to judge them.\n\nHow a grader verifies this: Confirm there are exactly 4 finalists and that relevant finalist product tabs remain open, along with any supporting size-guide or return-policy tabs referenced as important evidence.","max_points":150},{"criterion":"The final recommendation gives exactly one best overall pick, one best value pick, and one safest-to-order pick, each with short reasons grounded in the compared evidence.","description":"The final recommendation gives exactly one best overall pick, one best value pick, and one safest-to-order pick, each with short reasons grounded in the compared evidence.\n\nHow a grader verifies this: Check that all three recommendation categories are present exactly once and that each rationale refers back to comparison evidence such as fit details, price, review support, or return policy.","max_points":150}]}} +{"task_id":"63658b88215cc7b85e97e36d5b8b39b47904c6b6","confirmed_task":"I’m trying to stop wasting money on blushes that look cute online but disappear by lunch, so please help me do a real browser-based comparison focused on rosy pink shades that are meant to last through a full day. 
Start by checking Hey Hottie and identify the best rosy-pink blush they currently sell, then capture the exact product name and the exact text on the public page that supports a long-lasting or all-day-wear claim. After that, broaden the search one practical level up and compare Hey Hottie against 11 other rosy-pink blush options from reputable beauty brands or major beauty retailers, so I end up with 12 total products including Hey Hottie. For each of the 12, record the product name, brand, formula type, the closest rosy-pink shade name, the listed price, size if shown, the exact wear-claim text if the page gives one, and mark \"not shown\" when a field is missing. Please also use public product photos, swatches, or retailer/brand images to sanity-check whether each shade really reads as rosy pink instead of peach, mauve, or berry, and note any obvious mismatch. I want this to feel like a serious buying decision, so use a mix of official brand pages and major retailer pages where helpful, and compare review signals too when they’re publicly visible. Then narrow it down to the best 3 options for three different needs: best overall rosy-pink all-day pick, best budget pick, and best cream-or-dewy pick. Be explicit about whether the Hey Hottie option actually makes the final top 3 and why or why not. 
Keep the Hey Hottie product page open, plus the final 3 winning product pages and at least 2 useful comparison/review tabs, so I can inspect the evidence afterward.","website":"https://heyhottie.co","level":"hard","reference_length":5,"precomputed_rubric":{"items":[{"criterion":"The browsing session identifies the best current Hey Hottie rosy-pink blush candidate and includes both the exact product name and the exact public-page text supporting a long-lasting or all-day-wear claim.","description":"The browsing session identifies the best current Hey Hottie rosy-pink blush candidate and includes both the exact product name and the exact public-page text supporting a long-lasting or all-day-wear claim.\n\nHow a grader verifies this: Check the final result for one Hey Hottie product name plus a verbatim supporting claim text taken from a public Hey Hottie page, and confirm the Hey Hottie product tab is left open.","max_points":180},{"criterion":"A total of 12 rosy-pink blush products are compared, including Hey Hottie plus 11 other options from reputable beauty brands or major beauty retailers.","description":"A total of 12 rosy-pink blush products are compared, including Hey Hottie plus 11 other options from reputable beauty brands or major beauty retailers.\n\nHow a grader verifies this: Count the compared products in the final comparison and confirm there are exactly 12 total entries with Hey Hottie included.","max_points":180},{"criterion":"For each of the 12 products, the comparison records product name, brand, formula type, closest rosy-pink shade name, listed price, size if shown, exact wear-claim text if shown, and uses \"not shown\" where fields are missing.","description":"For each of the 12 products, the comparison records product name, brand, formula type, closest rosy-pink shade name, listed price, size if shown, exact wear-claim text if shown, and uses \"not shown\" where fields are missing.\n\nHow a grader verifies this: Inspect all 12 entries and 
confirm each required field is present, with missing values explicitly labeled \"not shown\" rather than omitted.","max_points":200},{"criterion":"The session uses public product photos, swatches, or retailer/brand images to assess whether each option truly reads as rosy pink, and notes any obvious mismatch with peach, mauve, or berry tones.","description":"The session uses public product photos, swatches, or retailer/brand images to assess whether each option truly reads as rosy pink, and notes any obvious mismatch with peach, mauve, or berry tones.\n\nHow a grader verifies this: Review the comparison notes for all 12 products and confirm there is a shade-fit judgment for each, including mismatch notes where applicable, supported by visible product-image or swatch browsing.","max_points":140},{"criterion":"The comparison uses a mix of official brand pages and major retailer pages where helpful, and includes publicly visible review signals when available.","description":"The comparison uses a mix of official brand pages and major retailer pages where helpful, and includes publicly visible review signals when available.\n\nHow a grader verifies this: Check that the evidence comes from more than one source type, with at least some official brand pages and some major retailer pages or retailer review evidence, and that review signals are noted when visible.","max_points":100},{"criterion":"The final synthesis names exactly 3 winners: best overall rosy-pink all-day pick, best budget pick, and best cream-or-dewy pick, and explicitly states whether Hey Hottie makes the top 3 and why or why not.","description":"The final synthesis names exactly 3 winners: best overall rosy-pink all-day pick, best budget pick, and best cream-or-dewy pick, and explicitly states whether Hey Hottie makes the top 3 and why or why not.\n\nHow a grader verifies this: Confirm the final recommendation section contains exactly 3 category winners and a direct statement about Hey Hottie’s top-3 
status with reasoning.","max_points":120},{"criterion":"The browser is left with the Hey Hottie product page open, the 3 winning product pages open, and at least 2 useful comparison or review tabs open for later inspection.","description":"The browser is left with the Hey Hottie product page open, the 3 winning product pages open, and at least 2 useful comparison or review tabs open for later inspection.\n\nHow a grader verifies this: Inspect the open tabs at the end and confirm the Hey Hottie tab, 3 winner tabs, and at least 2 comparison/review evidence tabs remain open.","max_points":80}]}} +{"task_id":"54fdb6126b926e2a48b4e45ffc1fe303e873eb7f","confirmed_task":"I’m trying to figure out what kind of genuinely fun, alternative adult outing I should plan in London for a future weekend, and I don’t want just three random ideas. Please do a real browser-based comparison of 12 to 15 London experiences that skew unusual or playful for adults, with at least 4 immersive game/story-led options, at least 4 challenge or escape-style activities, and at least 4 social or competitive experiences that still feel more interesting than a normal bar night. Use official venue pages plus maps/review pages where helpful, and for each option note the name, borough or area, the core format, typical duration, indicative price per person if public, ideal group size if shown, and anything important about booking or age restrictions; if something is missing, write \"not shown.\" As you work, keep the strongest candidate tabs open so I can inspect them later, and for at least 6 of the best options open pages with photos, maps, or reviews so there’s visible evidence of what the experience is really like and where it sits in the city. Then narrow the list to the best 6 overall and compare them more carefully for vibe, value, logistical ease, and how distinctive they feel versus touristy or generic options. 
Finally, recommend exactly 3 winners: one best immersive game, one best escape/challenge activity, and one best overall alternative night-out pick. Also build 2 realistic sample plans from your shortlist: one for a date night and one for a group of 4 to 6 friends, each centered on one main experience plus a nearby food or drink spot that makes geographic sense. Leave the most useful venue and map/review tabs open at the end so I can review the finalists.","website":"https://www.inthehiddencity.com","level":"hard","reference_length":14,"precomputed_rubric":{"items":[{"criterion":"The browsing session compares 12 to 15 London experiences, including at least 4 immersive game/story-led options, at least 4 challenge or escape-style activities, and at least 4 social or competitive experiences that feel more interesting than a normal bar night.","description":"The browsing session compares 12 to 15 London experiences, including at least 4 immersive game/story-led options, at least 4 challenge or escape-style activities, and at least 4 social or competitive experiences that feel more interesting than a normal bar night.\n\nHow a grader verifies this: Count the total experiences in the final comparison and confirm the category minimums are satisfied using the listed venues and their described formats.","max_points":200},{"criterion":"For each compared experience, the results include the name, borough or area, core format, typical duration, indicative price per person if public, ideal group size if shown, and any booking or age-restriction notes, using \"not shown\" where needed.","description":"For each compared experience, the results include the name, borough or area, core format, typical duration, indicative price per person if public, ideal group size if shown, and any booking or age-restriction notes, using \"not shown\" where needed.\n\nHow a grader verifies this: Check every listed experience entry for all requested fields and confirm missing fields are explicitly 
marked \"not shown\" rather than omitted.","max_points":180},{"criterion":"Official venue pages are used for the experiences, and for at least 6 of the best options there is visible browser evidence from pages with photos, maps, or reviews showing what the experience is like and where it is located.","description":"Official venue pages are used for the experiences, and for at least 6 of the best options there is visible browser evidence from pages with photos, maps, or reviews showing what the experience is like and where it is located.\n\nHow a grader verifies this: Inspect the open tabs or navigation history to confirm official venue pages were used and that at least 6 shortlisted options have supporting photo, map, or review pages opened.","max_points":160},{"criterion":"The work narrows the broader set to the best 6 overall and compares those finalists specifically on vibe, value, logistical ease, and distinctiveness versus touristy or generic options.","description":"The work narrows the broader set to the best 6 overall and compares those finalists specifically on vibe, value, logistical ease, and distinctiveness versus touristy or generic options.\n\nHow a grader verifies this: Review the finalist comparison section and confirm there are exactly 6 finalists with explicit commentary on all four requested decision factors.","max_points":160},{"criterion":"The final recommendations name exactly 3 winners: one best immersive game, one best escape/challenge activity, and one best overall alternative night-out pick.","description":"The final recommendations name exactly 3 winners: one best immersive game, one best escape/challenge activity, and one best overall alternative night-out pick.\n\nHow a grader verifies this: Check the conclusion for exactly three labeled winners matching the requested categories, with no missing or extra winner categories.","max_points":120},{"criterion":"Two realistic sample plans are produced from the shortlist: one for a date night 
and one for a group of 4 to 6 friends, each built around one main experience plus a nearby food or drink spot that makes geographic sense.","description":"Two realistic sample plans are produced from the shortlist: one for a date night and one for a group of 4 to 6 friends, each built around one main experience plus a nearby food or drink spot that makes geographic sense.\n\nHow a grader verifies this: Confirm there are exactly two plans, each with the correct audience type, one chosen main experience, and one nearby food or drink stop that is plausibly close based on the opened map/review evidence.","max_points":100},{"criterion":"The most useful venue and map/review tabs for the finalists are left open so the user can inspect the strongest options afterward.","description":"The most useful venue and map/review tabs for the finalists are left open so the user can inspect the strongest options afterward.\n\nHow a grader verifies this: At the end of the session, confirm that key finalist venue tabs and supporting map/review tabs remain open rather than being fully closed out.","max_points":80}]}} +{"task_id":"d7696e89cb472203c4fe131625ff8a025b86e6e7","confirmed_task":"I just got serious about buying a Nintendo Switch OLED for someone who is basically new to console gaming, and I do not want to waste money on random accessories that look useful but are not actually the right starter setup. Please build me a real browser-based buying guide that starts with the official Nintendo Switch OLED pages so you can verify what is actually compatible, then compare public product pages across Amazon plus at least two other major retailers like Best Buy, Target, Walmart, or GameStop. I want you to cover at least 5 categories that matter for a first setup: a screen protector, carrying case, extra controller option, microSD storage, and one charging or dock-friendly power accessory; if another category looks truly important, you can include it as optional. 
For each of those 5 required categories, compare exactly 3 viable products, so I end up with 15 compared items total, and note the product name, price, retailer, any obvious compatibility notes, and the main reason someone would pick it over the others. After that, turn the comparison into 3 complete starter bundles for different buyers: a lean budget bundle, a balanced everyday bundle, and a travel-or-multiplayer bundle. Each bundle should include exactly 5 to 7 items total, use only products you already checked, and clearly mark which items are essential versus optional. Also include one beginner-friendly first game recommendation and whether Nintendo Switch Online seems worth adding for that bundle. Keep the most useful evidence tabs open, including the official compatibility pages and at least one live product page for each required category, and finish with one clear recommendation for which bundle I should actually buy if I want the best overall value for a new Switch OLED owner.","website":"https://www.amazon.com","level":"hard","reference_length":7,"precomputed_rubric":{"items":[{"criterion":"The browsing session starts from official Nintendo Switch OLED pages and uses them to verify compatibility or setup constraints relevant to the recommended accessories.","description":"The browsing session starts from official Nintendo Switch OLED pages and uses them to verify compatibility or setup constraints relevant to the recommended accessories.\n\nHow a grader verifies this: Official Nintendo Switch OLED or Nintendo support pages are opened and the final comparison references compatibility or setup notes drawn from those pages.","max_points":160},{"criterion":"Exactly 5 required categories are covered: screen protector, carrying case, extra controller option, microSD storage, and one charging or dock-friendly power accessory.","description":"Exactly 5 required categories are covered: screen protector, carrying case, extra controller option, microSD storage, and 
one charging or dock-friendly power accessory.\n\nHow a grader verifies this: The final comparison is organized by those 5 categories and includes no missing required category.","max_points":160},{"criterion":"For each of the 5 required categories, exactly 3 viable products are compared using public product pages from Amazon plus at least two other major retailers overall.","description":"For each of the 5 required categories, exactly 3 viable products are compared using public product pages from Amazon plus at least two other major retailers overall.\n\nHow a grader verifies this: The final output contains 15 compared items total, with product name, price, retailer, compatibility note if relevant, and a main reason to choose each item; retailer coverage includes Amazon and at least two of Best Buy, Target, Walmart, or GameStop.","max_points":220},{"criterion":"The comparison is synthesized into 3 complete starter bundles: one lean budget bundle, one balanced everyday bundle, and one travel-or-multiplayer bundle.","description":"The comparison is synthesized into 3 complete starter bundles: one lean budget bundle, one balanced everyday bundle, and one travel-or-multiplayer bundle.\n\nHow a grader verifies this: All 3 named bundles appear, each contains exactly 5 to 7 items drawn only from previously compared products, and each item is marked essential or optional.","max_points":180},{"criterion":"Each of the 3 bundles includes one beginner-friendly first game recommendation and a judgment on whether Nintendo Switch Online seems worth adding for that bundle.","description":"Each of the 3 bundles includes one beginner-friendly first game recommendation and a judgment on whether Nintendo Switch Online seems worth adding for that bundle.\n\nHow a grader verifies this: Every bundle explicitly lists one game recommendation and states yes, no, or conditional value for Nintendo Switch Online.","max_points":120},{"criterion":"Useful browser evidence is preserved by keeping 
open the official compatibility pages and at least one live product page for each required category.","description":"Useful browser evidence is preserved by keeping open the official compatibility pages and at least one live product page for each required category.\n\nHow a grader verifies this: Open tabs at the end include official Nintendo pages plus at least 5 live product pages representing the 5 required categories.","max_points":160}]}} +{"task_id":"a5d88b699a2524de24e157d9269d0c42a070edab","confirmed_task":"I’m trying to book a perfume bar or custom scent experience for a future wedding, and I don’t want just one random lead—I want a real shortlist I could confidently contact for a destination event anywhere in the world. Please use public web pages to find 8 to 10 perfume-bar, fragrance-bar, or scent-activation vendors that appear to do weddings, private events, or luxury event activations, and for each one verify whether they explicitly mention international travel, destination events, global service, or leave that as not shown if you can’t confirm it. Compare them on the details that would actually matter to me: whether they look wedding-appropriate, what kind of guest experience they offer, whether they seem able to handle on-site personalization, what regions they mention serving, how polished the setup looks from photos, and exactly how I would inquire or book them. Keep the strongest 4 vendor pages open, including at least 2 that clearly suggest international or destination capability and at least 1 that looks especially strong for weddings. 
Then give me a final ranked shortlist of the best 5 options, explain which single vendor you’d contact first for a destination wedding and why, and include the booking or contact method for each finalist.","website":"https://www.google.com","level":"hard","reference_length":9,"precomputed_rubric":{"items":[{"criterion":"A total of 8 to 10 perfume-bar, fragrance-bar, or scent-activation vendors are identified from public pages, and each appears to serve weddings, private events, or luxury event activations.","description":"A total of 8 to 10 perfume-bar, fragrance-bar, or scent-activation vendors are identified from public pages, and each appears to serve weddings, private events, or luxury event activations.\n\nHow a grader verifies this: Check that the final comparison includes 8 to 10 distinct vendors and that each entry cites or reflects a public page showing relevant event-service fit.","max_points":180},{"criterion":"For every vendor, the result states whether the vendor explicitly mentions international travel, destination events, global service, or 'not shown' when that cannot be confirmed.","description":"For every vendor, the result states whether the vendor explicitly mentions international travel, destination events, global service, or 'not shown' when that cannot be confirmed.\n\nHow a grader verifies this: Review each vendor entry for a destination/international-service field populated with either confirmed evidence or 'not shown' rather than omission.","max_points":180},{"criterion":"The comparison covers the decision factors explicitly requested for each vendor: wedding appropriateness, guest experience offered, apparent ability to handle on-site personalization, regions served, setup polish from photos, and inquiry or booking method.","description":"The comparison covers the decision factors explicitly requested for each vendor: wedding appropriateness, guest experience offered, apparent ability to handle on-site personalization, regions served, 
setup polish from photos, and inquiry or booking method.\n\nHow a grader verifies this: Inspect the vendor comparison and confirm that each requested factor is addressed for all reviewed vendors, using 'not shown' where needed.","max_points":200},{"criterion":"The strongest 4 vendor pages are left open, including at least 2 that clearly suggest international or destination capability and at least 1 that looks especially strong for weddings.","description":"The strongest 4 vendor pages are left open, including at least 2 that clearly suggest international or destination capability and at least 1 that looks especially strong for weddings.\n\nHow a grader verifies this: Check the open browser tabs at the end and confirm there are 4 relevant vendor pages matching the requested mix.","max_points":140},{"criterion":"A final ranked shortlist of the best 5 options is provided.","description":"A final ranked shortlist of the best 5 options is provided.\n\nHow a grader verifies this: Confirm that exactly 5 finalists are ranked in order in the final output.","max_points":120},{"criterion":"The final answer identifies which single vendor should be contacted first for a destination wedding and explains why.","description":"The final answer identifies which single vendor should be contacted first for a destination wedding and explains why.\n\nHow a grader verifies this: Check that one vendor is explicitly named as the top outreach choice and that a reasoned explanation is given.","max_points":100},{"criterion":"Each of the 5 finalists includes a booking or contact method.","description":"Each of the 5 finalists includes a booking or contact method.\n\nHow a grader verifies this: Review the finalist entries and confirm that each contains a concrete inquiry path such as contact form, email, phone, DM, or booking page.","max_points":80}]}} +{"task_id":"265abf0668773a8295847d5f5abf3bd003cbe3a3","confirmed_task":"I want to stop randomly clicking tech videos and instead figure out 
which YouTube reviewers are actually worth following for my future gadget research. Please use public YouTube pages to build me a serious shortlist of 10 tech-related videos drawn from at least 6 different channels that seem most relevant to a shopper who cares about product reviews and first-impression videos for mainstream consumer tech like phones, laptops, audio gear, wearables, or similar gadgets. For each video, include the exact title, channel name, product category, how recent it is, and a short 2–3 sentence summary of what the reviewer actually says. While doing that, compare the channels on things like whether they show real hands-on testing, whether the video feels more like a quick impression or a deeper review, whether pros and cons are clearly stated, and whether the presentation style seems useful for someone trying to decide what to buy. Keep at least 4 of the strongest video tabs open and at least 3 channel pages open so I can review the evidence myself. At the end, recommend the best 3 channels for me to follow going forward, explain what each is best for, and identify the best 5 videos from your set if I only have time to watch a few. 
Prepare a presentation in CryptoPad presentations of a rundown on your recommendations and analysis.","website":"https://www.youtube.com","level":"hard","reference_length":12,"precomputed_rubric":{"items":[{"criterion":"The final result includes exactly 10 tech-related YouTube videos drawn from at least 6 different channels.","description":"The final result includes exactly 10 tech-related YouTube videos drawn from at least 6 different channels.\n\nHow a grader verifies this: Count the videos and confirm the channel names show coverage from 6 or more distinct YouTube channels.","max_points":200},{"criterion":"Each of the 10 videos includes the exact title, channel name, product category, how recent it is, and a 2–3 sentence summary.","description":"Each of the 10 videos includes the exact title, channel name, product category, how recent it is, and a 2–3 sentence summary.\n\nHow a grader verifies this: Check every listed video entry for all requested fields and confirm the summary length is 2–3 sentences.","max_points":180},{"criterion":"The selected videos stay relevant to product reviews and first-impression coverage for mainstream consumer tech such as phones, laptops, audio gear, wearables, or similar gadgets.","description":"The selected videos stay relevant to product reviews and first-impression coverage for mainstream consumer tech such as phones, laptops, audio gear, wearables, or similar gadgets.\n\nHow a grader verifies this: Review the 10 chosen videos and confirm they fit the requested review/impression focus and product-category scope.","max_points":140},{"criterion":"The channels are compared on whether they show real hands-on testing, whether each video feels like a quick impression or a deeper review, whether pros and cons are clearly stated, and whether the presentation style seems useful for buying decisions.","description":"The channels are compared on whether they show real hands-on testing, whether each video feels like a quick impression or a 
deeper review, whether pros and cons are clearly stated, and whether the presentation style seems useful for buying decisions.\n\nHow a grader verifies this: Inspect the comparison writeup and confirm all four requested comparison dimensions are explicitly addressed for the shortlisted channels or videos.","max_points":180},{"criterion":"At least 4 of the strongest video tabs are kept open and at least 3 channel pages are kept open as visible evidence.","description":"At least 4 of the strongest video tabs are kept open and at least 3 channel pages are kept open as visible evidence.\n\nHow a grader verifies this: Check the open browser tabs at the end and confirm there are 4 or more video pages plus 3 or more channel pages left open.","max_points":120},{"criterion":"The final recommendation names the best 3 channels to follow going forward and explains what each is best for.","description":"The final recommendation names the best 3 channels to follow going forward and explains what each is best for.\n\nHow a grader verifies this: Confirm there are exactly 3 recommended channels and that each has a clear explanation of its best use case.","max_points":100},{"criterion":"The final recommendation also identifies the best 5 videos from the set for someone who only has time to watch a few, and a CryptoPad Presentation is also prepared as a rundown of the recommendations and analysis.","description":"The final recommendation also identifies the best 5 videos from the set for someone who only has time to watch a few, and a CryptoPad Presentation is also prepared as a rundown of the recommendations and analysis.\n\nHow a grader verifies this: Check that exactly 5 videos are singled out from the 10-video set as the top viewing priorities; an open CryptoPad Presentation contains the channel and video recommendation rundown.","max_points":80}]}} +{"task_id":"d03a08650b62f90b2de5e4bcbb98a50653100e1e","confirmed_task":"I want to set up my browser for smarter online shopping, 
but I do not want to install a bunch of sketchy coupon extensions blindly. Please use the Chrome Web Store Shopping category as the starting point and build me a serious comparison of 10 Chrome extensions that help with price comparison, coupon finding, cashback, or price-history tracking. For each one, open the Chrome Web Store listing and then verify it on the extension’s official public site or help page if available, so we can compare what it actually claims to do. Record for all 10: extension name, Chrome Web Store listing page, star rating, review count, whether it focuses on coupons, price comparison, cashback, or price history, whether an account is required if that is clearly stated, any notable permissions or data-access warnings shown publicly, and the official site or support page if shown; if something is missing, write not shown. Then narrow that set to the best 4 options for a privacy-conscious shopper in the U.S. who wants real savings without installing redundant tools, and explain the role each of the 4 would play so I know whether they overlap or complement each other. Keep the most useful tabs open at the end: the Chrome Web Store listings for the final 4, at least 2 official privacy or help pages that were important to the decision, and 2 runner-up tabs that were strong but not selected, so I can review the evidence myself. 
Can you create a CryptoPad Document with your rankings - and links for each of the extensions to relevant reviews or online sentiment for each?","website":"https://chromewebstore.google.com","level":"hard","reference_length":5,"precomputed_rubric":{"items":[{"criterion":"A comparison set of exactly 10 Chrome extensions in the Shopping category is assembled, starting from the Chrome Web Store, and each candidate is relevant to price comparison, coupon finding, cashback, or price-history tracking.","description":"A comparison set of exactly 10 Chrome extensions in the Shopping category is assembled, starting from the Chrome Web Store, and each candidate is relevant to price comparison, coupon finding, cashback, or price-history tracking.\n\nHow a grader verifies this: The final output lists 10 distinct extensions with Chrome Web Store listing pages and shopping-related functions that match the requested categories.","max_points":140},{"criterion":"For each of the 10 extensions, the required fields are recorded: extension name, Chrome Web Store listing page, star rating, review count, primary shopping function, whether an account is required if clearly stated, any notable permissions or data-access warnings shown publicly, and the official site or support page if shown, using 'not shown' where needed.","description":"For each of the 10 extensions, the required fields are recorded: extension name, Chrome Web Store listing page, star rating, review count, primary shopping function, whether an account is required if clearly stated, any notable permissions or data-access warnings shown publicly, and the official site or support page if shown, using 'not shown' where needed.\n\nHow a grader verifies this: Each of the 10 entries contains all requested fields or explicitly says 'not shown' for missing information.","max_points":180},{"criterion":"Each Chrome Web Store listing is cross-checked against the extension’s official public site or help page when available, rather 
than relying only on the store listing.","description":"Each Chrome Web Store listing is cross-checked against the extension’s official public site or help page when available, rather than relying only on the store listing.\n\nHow a grader verifies this: Official public pages or support/help pages are cited when available, and the comparison reflects claims verified beyond the Web Store listing.","max_points":120},{"criterion":"The final material includes a relevant public review, reputation, or online-sentiment link for each extension so the ranking is not anchored only to official pages and store metadata.","description":"The final material includes a relevant public review, reputation, or online-sentiment link for each extension so the ranking is not anchored only to official pages and store metadata.\n\nHow a grader verifies this: Each extension entry includes at least one public review, reputation, or sentiment link that can be opened and inspected separately from the official site and Web Store page.","max_points":140},{"criterion":"The analysis narrows the 10 extensions to exactly 4 best options for a privacy-conscious shopper in the U.S. who wants real savings without installing redundant tools, and explains the role each of the 4 would play, including overlap versus complementarity.","description":"The analysis narrows the 10 extensions to exactly 4 best options for a privacy-conscious shopper in the U.S. 
who wants real savings without installing redundant tools, and explains the role each of the 4 would play, including overlap versus complementarity.\n\nHow a grader verifies this: Exactly 4 finalists are clearly identified, and each finalist includes a role explanation covering both usefulness and overlap/complementarity.","max_points":180},{"criterion":"Useful browser evidence is left open at the end: the Chrome Web Store listings for the final 4, at least 2 official privacy or help pages that mattered to the decision, and 2 runner-up tabs that were strong but not selected.","description":"Useful browser evidence is left open at the end: the Chrome Web Store listings for the final 4, at least 2 official privacy or help pages that mattered to the decision, and 2 runner-up tabs that were strong but not selected.\n\nHow a grader verifies this: The kept-open tabs match the requested counts and types: 4 finalist store listings, 2 official privacy/help pages, and 2 runner-up tabs.","max_points":120},{"criterion":"A CryptoPad Document is created with the rankings and links for each extension, including the relevant review or online-sentiment links.","description":"A CryptoPad Document is created with the rankings and links for each extension, including the relevant review or online-sentiment links.\n\nHow a grader verifies this: The open CryptoPad Document contains the extension rankings and includes the requested review/sentiment links for each listed extension.","max_points":120}]}} +{"task_id":"a931f7baacfd7f1bcea8409bb8b3d84383734680","confirmed_task":"I’m trying to get a genuinely useful picture of how jazz connects to community service, not just a vague summary, because I may want to volunteer, donate, or suggest a partner organization later. Please use public web pages to research 10 to 12 jazz-focused nonprofits or community programs in the U.S. 
that clearly use jazz as part of education, outreach, health, social support, youth development, or other community-serving work. For each organization, confirm at least 3 concrete ways it serves the community from its own site or other strong public pages, and note whether there is a visible way for an outsider to get involved through volunteering, donating, attending, or partnering; if something is not shown, say not shown. As you work, keep the strongest evidence tabs open for at least 6 organizations, including a mix of program, impact, and volunteer or support pages, so I can review them later. Then synthesize what you found into a comparison that groups the organizations by service model, points out the most common and the most distinctive community-service approaches, and recommends the 5 strongest organizations for someone who specifically wants hands-on community impact through jazz. In the final write-up, include a ranked top 5 with a short reason for each choice and make clear which organization looks best for volunteering, which looks best for youth education impact, and which looks best for broader community outreach.","website":"https://www.google.com","level":"hard","reference_length":5,"precomputed_rubric":{"items":[{"criterion":"The browsing session identifies 10 to 12 jazz-focused U.S. nonprofits or community programs that clearly use jazz in community-serving work.","description":"The browsing session identifies 10 to 12 jazz-focused U.S. 
nonprofits or community programs that clearly use jazz in community-serving work.\n\nHow a grader verifies this: Count the organizations in the final comparison and confirm each is jazz-focused and community-serving based on public pages reviewed during the session.","max_points":180},{"criterion":"For each organization, the final write-up records at least 3 concrete ways it serves the community, drawn from its own site or other strong public pages.","description":"For each organization, the final write-up records at least 3 concrete ways it serves the community, drawn from its own site or other strong public pages.\n\nHow a grader verifies this: Check that every listed organization has 3 or more specific service activities or program types described in the final output, with enough detail to distinguish them from generic mission language.","max_points":200},{"criterion":"For each organization, the final output states whether there is a visible way for an outsider to get involved through volunteering, donating, attending, or partnering, and uses 'not shown' when that is missing.","description":"For each organization, the final output states whether there is a visible way for an outsider to get involved through volunteering, donating, attending, or partnering, and uses 'not shown' when that is missing.\n\nHow a grader verifies this: Review each organization entry and confirm that an engagement pathway or an explicit 'not shown' note is included.","max_points":140},{"criterion":"At least 6 strong evidence tabs are kept open, covering a mix of program, impact, and volunteer or support pages for different organizations.","description":"At least 6 strong evidence tabs are kept open, covering a mix of program, impact, and volunteer or support pages for different organizations.\n\nHow a grader verifies this: Inspect the open tabs at the end and confirm there are at least 6 relevant public pages left open, with the required mix of page types across multiple 
organizations.","max_points":140},{"criterion":"The final synthesis groups the organizations by service model and identifies both the most common and the most distinctive ways jazz is being used for community service.","description":"The final synthesis groups the organizations by service model and identifies both the most common and the most distinctive ways jazz is being used for community service.\n\nHow a grader verifies this: Check that the final write-up includes explicit grouping by service model and clearly labeled observations about common patterns and distinctive approaches.","max_points":140},{"criterion":"The final output includes a ranked top 5 recommendation for someone seeking hands-on community impact through jazz, with a short reason for each choice.","description":"The final output includes a ranked top 5 recommendation for someone seeking hands-on community impact through jazz, with a short reason for each choice.\n\nHow a grader verifies this: Confirm there are exactly 5 ranked recommendations and that each has a brief rationale tied to the comparison findings.","max_points":120},{"criterion":"The final recommendation explicitly identifies which organization looks best for volunteering, which looks best for youth education impact, and which looks best for broader community outreach.","description":"The final recommendation explicitly identifies which organization looks best for volunteering, which looks best for youth education impact, and which looks best for broader community outreach.\n\nHow a grader verifies this: Check that these three labeled best-fit conclusions are present and correspond to organizations already researched in the session.","max_points":80}]}} +{"task_id":"e8748499c5ee313e307ba6819d4978cd061d8445","confirmed_task":"I’m seriously considering a Honda Civic Hybrid for my next car, but I don’t want just one review pulled out of context. 
Please do a thorough browser-based comparison that helps me decide whether the Civic Hybrid is actually the best fit in the compact-hybrid segment. Start by finding at least 6 professional reviews of the Honda Civic Hybrid from reputable automotive publications, and for each one capture the model year reviewed plus the main pros, cons, and overall takeaway. Then verify the current official Honda Civic Hybrid trims, key specs, fuel-economy figures, and notable feature differences on Honda’s public pages. After that, compare the Civic Hybrid against 3 realistic rivals in the same general class—such as the Toyota Prius, Toyota Corolla Hybrid, and Hyundai Elantra Hybrid, or close substitutes if one of those is not comparable enough—using official manufacturer pages plus professional reviews. I want the comparison to cover starting price, mpg or equivalent efficiency, horsepower, cargo space, standout tech or comfort features, and whether reviewers describe it as more fun-to-drive, more practical, or more value-oriented. Also check public safety sources like IIHS and NHTSA if available, and include what is shown there rather than guessing. Keep the most useful evidence tabs open as you go, including at least 3 Honda-related tabs, at least 3 rival-model tabs, and at least 2 review or safety tabs that best support the final recommendation. Finish with a concise buyer-oriented decision memo that tells me whether the Civic Hybrid is the best overall pick, the best fun-to-drive pick, or not the best choice for value, and clearly note any important fields as not shown if the public pages don’t provide them. 
Prepare a presentation in CryptoPad that makes it fun to read through and read your recommendations.","website":"https://www.idehonda.com","level":"hard","reference_length":3,"precomputed_rubric":{"items":[{"criterion":"At least 6 professional reviews of the Honda Civic Hybrid are gathered from reputable automotive publications, and each includes the reviewed model year plus main pros, cons, and overall takeaway.","description":"At least 6 professional reviews of the Honda Civic Hybrid are gathered from reputable automotive publications, and each includes the reviewed model year plus main pros, cons, and overall takeaway.\n\nHow a grader verifies this: The final output lists 6 or more distinct professional review sources and records the requested review details for each one.","max_points":200},{"criterion":"The current official Honda Civic Hybrid trims, key specs, fuel-economy figures, and notable feature differences are verified on Honda public pages.","description":"The current official Honda Civic Hybrid trims, key specs, fuel-economy figures, and notable feature differences are verified on Honda public pages.\n\nHow a grader verifies this: The final output includes trim-level Honda information sourced from official public Honda pages, with specs, efficiency, and feature differences clearly summarized.","max_points":160},{"criterion":"The Honda Civic Hybrid is compared against 3 realistic rivals in the same general class using official manufacturer pages plus professional reviews.","description":"The Honda Civic Hybrid is compared against 3 realistic rivals in the same general class using official manufacturer pages plus professional reviews.\n\nHow a grader verifies this: Exactly 3 rival models are included, and each has comparison details drawn from both official model pages and review-based evidence.","max_points":180},{"criterion":"The comparison covers starting price, mpg or equivalent efficiency, horsepower, cargo space, standout tech or comfort features, 
and whether reviewers describe each model as more fun-to-drive, more practical, or more value-oriented.","description":"The comparison covers starting price, mpg or equivalent efficiency, horsepower, cargo space, standout tech or comfort features, and whether reviewers describe each model as more fun-to-drive, more practical, or more value-oriented.\n\nHow a grader verifies this: For the Civic Hybrid and all 3 rivals, the final comparison explicitly includes each requested category or marks missing items as not shown.","max_points":180},{"criterion":"Public safety sources like IIHS and NHTSA are checked if available, and the results are included without guessing.","description":"Public safety sources like IIHS and NHTSA are checked if available, and the results are included without guessing.\n\nHow a grader verifies this: The final output reports what IIHS and/or NHTSA show for the relevant vehicles when available, and uses 'not shown' or equivalent where information is unavailable.","max_points":100},{"criterion":"The most useful evidence tabs are kept open, including at least 3 Honda-related tabs, at least 3 rival-model tabs, and at least 2 review or safety tabs that best support the final recommendation.","description":"The most useful evidence tabs are kept open, including at least 3 Honda-related tabs, at least 3 rival-model tabs, and at least 2 review or safety tabs that best support the final recommendation.\n\nHow a grader verifies this: Open tabs at the end include the required minimum counts across Honda, rival, and review/safety evidence pages.","max_points":80},{"criterion":"A concise buyer-oriented decision memo is produced stating whether the Civic Hybrid is the best overall pick, the best fun-to-drive pick, or not the best choice for value, with important missing fields marked as not shown, and a CryptoPad Presentation is also created to make the buyer recommendation easy to review.","description":"A concise buyer-oriented decision memo is produced 
stating whether the Civic Hybrid is the best overall pick, the best fun-to-drive pick, or not the best choice for value, with important missing fields marked as not shown, and a CryptoPad Presentation is also created to make the buyer recommendation easy to review.\n\nHow a grader verifies this: The final memo makes an explicit recommendation using the requested framing and clearly flags any unavailable public-page fields as not shown; an open CryptoPad Presentation presents the compact-hybrid comparison and recommendation.","max_points":100}]}} +{"task_id":"e1184e98c9a0c78c6170d6c740bdf30b8dd11442","confirmed_task":"I’m trying to figure out whether a Honda Civic Hatchback Hybrid would actually work for me as a winter daily driver in a place that gets regular snow, slush, and cold mornings, not just whether one reviewer liked it. Please do a serious browser-based investigation using public pages only. Start by finding at least 8 credible pieces of evidence about this car’s snowy-weather behavior, including at least 3 video reviews or tests, at least 2 owner-reported winter experiences from public forums or communities, and at least 2 official or manufacturer-style sources such as specs, manuals, or feature pages that help explain traction, drive modes, tires, clearance, or cold-weather limitations. Summarize what each source actually says about snow starts, traction, braking, hill driving, stability, cabin warm-up, visibility, and any cautions or missing information, and keep the strongest evidence tabs open. Then compare the Civic Hatchback Hybrid against exactly 3 realistic alternatives for the same kind of buyer, preferably other compact hatchback or sedan hybrids or similarly efficient daily drivers that someone might cross-shop for winter use. 
For each of the 4 total vehicles, capture drivetrain, approximate ground-clearance context if publicly shown, winter-relevant features, and whether the evidence suggests it is a confident snow choice, an acceptable-with-good-winter-tires choice, or a poor fit. Use tire retailer or fitment pages only as supporting evidence to check whether common winter-tire sizing appears straightforward for the Civic, and note any obvious constraints. Finish with a concise decision memo that tells me whether the Honda Civic Hatchback Hybrid looks viable for a future snowy-climate commute, what the biggest winter caveats are, which of the 3 alternatives looks strongest if I want more confidence in snow, and leave the most useful comparison and evidence tabs open so I can review them myself.","website":"https://www.youtube.com","level":"hard","reference_length":4,"precomputed_rubric":{"items":[{"criterion":"At least 8 credible pieces of evidence are gathered about the Honda Civic Hatchback Hybrid's snowy-weather behavior, including at least 3 video reviews or tests, at least 2 owner-reported winter experiences, and at least 2 official or manufacturer-style sources.","description":"At least 8 credible pieces of evidence are gathered about the Honda Civic Hatchback Hybrid's snowy-weather behavior, including at least 3 video reviews or tests, at least 2 owner-reported winter experiences, and at least 2 official or manufacturer-style sources.\n\nHow a grader verifies this: The final output lists 8 or more distinct public sources and their categories, and the open browser tabs include representative examples from each required source type.","max_points":200},{"criterion":"Each source is summarized for what it says about snow starts, traction, braking, hill driving, stability, cabin warm-up, visibility, and any cautions or missing information.","description":"Each source is summarized for what it says about snow starts, traction, braking, hill driving, stability, cabin warm-up, 
visibility, and any cautions or missing information.\n\nHow a grader verifies this: The final memo contains per-source summaries covering these winter-use dimensions when available, and explicitly marks 'not shown' or similar where a source does not address a category.","max_points":180},{"criterion":"The Honda Civic Hatchback Hybrid is compared against exactly 3 realistic alternatives for the same buyer, for 4 total vehicles.","description":"The Honda Civic Hatchback Hybrid is compared against exactly 3 realistic alternatives for the same buyer, for 4 total vehicles.\n\nHow a grader verifies this: The final comparison includes exactly 4 vehicles total and clearly identifies the 3 alternatives as cross-shopped options relevant to efficient winter daily driving.","max_points":160},{"criterion":"For each of the 4 total vehicles, the comparison captures drivetrain, approximate ground-clearance context if publicly shown, winter-relevant features, and whether the evidence suggests it is a confident snow choice, an acceptable-with-good-winter-tires choice, or a poor fit.","description":"For each of the 4 total vehicles, the comparison captures drivetrain, approximate ground-clearance context if publicly shown, winter-relevant features, and whether the evidence suggests it is a confident snow choice, an acceptable-with-good-winter-tires choice, or a poor fit.\n\nHow a grader verifies this: The final comparison table or memo includes all requested fields for each vehicle, using 'not shown' where public pages do not provide a field.","max_points":170},{"criterion":"Tire retailer or fitment pages are used only as supporting evidence to check whether common winter-tire sizing appears straightforward for the Civic, with any obvious constraints noted.","description":"Tire retailer or fitment pages are used only as supporting evidence to check whether common winter-tire sizing appears straightforward for the Civic, with any obvious constraints noted.\n\nHow a grader verifies 
this: The final output includes a Civic-specific winter-tire fitment note based on public fitment pages and does not expand the task into unrelated tire shopping beyond the requested support role.","max_points":110},{"criterion":"The session ends with a concise decision memo stating whether the Honda Civic Hatchback Hybrid looks viable for a future snowy-climate commute, the biggest winter caveats, and which of the 3 alternatives looks strongest for more confidence in snow.","description":"The session ends with a concise decision memo stating whether the Honda Civic Hatchback Hybrid looks viable for a future snowy-climate commute, the biggest winter caveats, and which of the 3 alternatives looks strongest for more confidence in snow.\n\nHow a grader verifies this: The final memo contains an explicit viability judgment for the Civic, names the key caveats, and identifies one strongest alternative with a brief reason tied to the gathered evidence.","max_points":100},{"criterion":"The most useful comparison and evidence tabs are left open for review.","description":"The most useful comparison and evidence tabs are left open for review.\n\nHow a grader verifies this: Open tabs at the end include the strongest Civic snow evidence sources and the key comparison pages used for the alternatives.","max_points":80}]}} +{"task_id":"03328a94fec2dce938ded3959bdb6ea292c22186","confirmed_task":"I’m trying to figure out whether the Kia K5 GT should actually make my next test-drive shortlist, not just watch one random review. Start on YouTube and find 10 recent, substantive video reviews or comparison videos that cover the Kia K5 GT, and record for each one the video title, channel name, approximate viewpoint of the reviewer, and whether the review is mainly positive, mixed, or negative. Keep the 4 most useful review tabs open, including at least one straight review and at least one head-to-head comparison. 
Then open Kia’s official K5 GT pages and verify the current core facts I’d care about as a shopper: engine/performance details, transmission, notable standard features, warranty, and starting price or price range if shown. After that, compare the K5 GT against exactly 4 realistic sporty sedan alternatives on public pages from official manufacturer sites, using models that are actually plausible cross-shops for someone considering a quick midsize or near-midsize sport sedan. For those 5 cars total, compare price, horsepower, drivetrain, key comfort or tech features, and anything clearly positioned as performance-oriented. Next, check public safety and ownership-risk signals for the Kia K5 GT using sources like IIHS, NHTSA, recall pages, and owner-review or reliability pages if available, and note anything that looks like a meaningful concern or a reassuring sign; if a field is not shown, say not shown. Finally, give me a ranked recommendation with exactly 3 outcomes: best value pick, best performance-leaning pick, and best all-around daily-driver pick. Explain where the Kia K5 GT lands, what kind of buyer it fits best, and whether I should prioritize it for a test drive now or focus on one of the alternatives instead. 
Leave the key review, official spec, and safety/evidence pages open so I can inspect them afterward.","website":"https://www.youtube.com","level":"hard","reference_length":3,"precomputed_rubric":{"items":[{"criterion":"The browsing session identifies exactly 10 recent, substantive YouTube video reviews or comparison videos covering the Kia K5 GT, and for each one records the video title, channel name, approximate reviewer viewpoint, and whether the review is positive, mixed, or negative.","description":"The browsing session identifies exactly 10 recent, substantive YouTube video reviews or comparison videos covering the Kia K5 GT, and for each one records the video title, channel name, approximate reviewer viewpoint, and whether the review is positive, mixed, or negative.\n\nHow a grader verifies this: Check that the final output lists 10 videos with all four requested fields and that they are clearly about the Kia K5 GT.","max_points":180},{"criterion":"Exactly 4 of the most useful review tabs are kept open, including at least 1 straight Kia K5 GT review and at least 1 head-to-head comparison video.","description":"Exactly 4 of the most useful review tabs are kept open, including at least 1 straight Kia K5 GT review and at least 1 head-to-head comparison video.\n\nHow a grader verifies this: Inspect open tabs or end-state browser evidence to confirm 4 relevant YouTube tabs remain open with the required mix.","max_points":140},{"criterion":"Official Kia K5 GT pages are opened and used to verify the requested shopper-facing facts: engine or performance details, transmission, notable standard features, warranty, and starting price or price range if shown.","description":"Official Kia K5 GT pages are opened and used to verify the requested shopper-facing facts: engine or performance details, transmission, notable standard features, warranty, and starting price or price range if shown.\n\nHow a grader verifies this: Check that official Kia pages are open or cited in 
the final synthesis and that each requested fact is reported, with 'not shown' used where needed.","max_points":160},{"criterion":"The Kia K5 GT is compared against exactly 4 realistic sporty sedan alternatives using public official manufacturer pages, and the comparison covers all 5 cars total on price, horsepower, drivetrain, key comfort or tech features, and performance-oriented positioning.","description":"The Kia K5 GT is compared against exactly 4 realistic sporty sedan alternatives using public official manufacturer pages, and the comparison covers all 5 cars total on price, horsepower, drivetrain, key comfort or tech features, and performance-oriented positioning.\n\nHow a grader verifies this: Review the final comparison to confirm there are 5 total cars, 4 alternatives, and all requested fields are present from public official pages.","max_points":200},{"criterion":"Public safety and ownership-risk signals for the Kia K5 GT are checked using sources such as IIHS, NHTSA, recall pages, and owner-review or reliability pages if available, with meaningful concerns or reassuring signs noted and 'not shown' used for missing fields.","description":"Public safety and ownership-risk signals for the Kia K5 GT are checked using sources such as IIHS, NHTSA, recall pages, and owner-review or reliability pages if available, with meaningful concerns or reassuring signs noted and 'not shown' used for missing fields.\n\nHow a grader verifies this: Confirm the final output includes findings from the requested categories of public sources and notes concrete concerns, reassuring signs, or 'not shown' where appropriate.","max_points":140},{"criterion":"The final recommendation provides exactly 3 outcomes—best value pick, best performance-leaning pick, and best all-around daily-driver pick—and explicitly explains where the Kia K5 GT lands, what buyer it fits best, and whether it should be prioritized for a test drive now.","description":"The final recommendation provides 
exactly 3 outcomes—best value pick, best performance-leaning pick, and best all-around daily-driver pick—and explicitly explains where the Kia K5 GT lands, what buyer it fits best, and whether it should be prioritized for a test drive now.\n\nHow a grader verifies this: Check that the conclusion includes exactly the 3 requested outcome labels and directly addresses Kia K5 GT fit and test-drive priority.","max_points":100},{"criterion":"Key evidence pages are left open at the end, including useful review tabs plus official spec and safety/evidence pages that support the recommendation.","description":"Key evidence pages are left open at the end, including useful review tabs plus official spec and safety/evidence pages that support the recommendation.\n\nHow a grader verifies this: Inspect the final browser state to confirm that relevant YouTube, official Kia, and safety or ownership-risk pages remain open for later review.","max_points":80}]}} +{"task_id":"b6bbdc8e0388a5a1e256f561a8f0b7d92c6e772c","confirmed_task":"I want to make one genuinely good winter purchase instead of impulse-buying the first heated blanket I see. Please do a serious browser comparison of 12 current electric heated blanket options across major public retailer pages and manufacturer pages where available, focusing on products that are realistically usable for bed warmth in winter in the U.S. Include a mix of at least 4 full-size bed blankets, at least 4 throws, and at least 4 larger or premium options for shared use or colder sleepers. For each option, record the product name, retailer, current listed price, size, material, number of heat settings, auto shutoff details, machine-washability, controller style, safety certification or compliance language if shown, warranty length if shown, and shipping or delivery info if it is prominently visible; use 'not shown' when a field is missing. 
Open the actual listing pages for all candidates, and for at least 6 of them also open either the manufacturer product page, care/manual page, or another public page that helps verify the safety or care details, so the comparison is not based on retailer copy alone. While comparing, look through photos and review sections closely enough to capture recurring positives and negatives for each product, especially anything about uneven heating, reliability, shedding, thinness, controller annoyance, or washing problems. Then narrow the 12 down to a final shortlist of 5 with a clear recommendation for best overall, best value, best throw, best for couples or larger beds, and best premium pick. End with one concise buying memo that explains which single blanket I should buy and why, and keep the 5 finalist product tabs plus the 2 most useful verification tabs open so I can review them.","website":"https://www.amazon.com","level":"hard","reference_length":4,"precomputed_rubric":{"items":[{"criterion":"Exactly 12 current heated blanket candidates are compared from public pages, including at least 4 full-size bed blankets, at least 4 throws, and at least 4 larger or premium options for shared use or colder sleepers.","description":"Exactly 12 current heated blanket candidates are compared from public pages, including at least 4 full-size bed blankets, at least 4 throws, and at least 4 larger or premium options for shared use or colder sleepers.\n\nHow a grader verifies this: The final comparison includes 12 distinct products and clearly identifies their type/category so the required 4/4/4 coverage can be checked from the recorded entries and open product tabs.","max_points":180},{"criterion":"For each of the 12 options, the comparison records the product name, retailer, current listed price, size, material, number of heat settings, auto shutoff details, machine-washability, controller style, safety certification or compliance language if shown, warranty length if shown, and 
shipping or delivery info if prominently visible, using 'not shown' when needed.","description":"For each of the 12 options, the comparison records the product name, retailer, current listed price, size, material, number of heat settings, auto shutoff details, machine-washability, controller style, safety certification or compliance language if shown, warranty length if shown, and shipping or delivery info if prominently visible, using 'not shown' when needed.\n\nHow a grader verifies this: Each of the 12 products has all requested fields filled in or explicitly marked 'not shown' in the final comparison.","max_points":220},{"criterion":"All 12 candidates have their actual retailer listing pages opened, and at least 6 of them also have a manufacturer product page, care/manual page, or other public verification page opened to confirm safety or care details.","description":"All 12 candidates have their actual retailer listing pages opened, and at least 6 of them also have a manufacturer product page, care/manual page, or other public verification page opened to confirm safety or care details.\n\nHow a grader verifies this: Open tabs show the 12 retailer listings and at least 6 additional verification pages tied to specific candidates.","max_points":170},{"criterion":"The comparison captures recurring review and photo-based positives and negatives for each product, with attention to uneven heating, reliability, shedding, thinness, controller annoyance, and washing problems when those themes appear.","description":"The comparison captures recurring review and photo-based positives and negatives for each product, with attention to uneven heating, reliability, shedding, thinness, controller annoyance, and washing problems when those themes appear.\n\nHow a grader verifies this: Each candidate includes brief synthesized pros/cons or review-theme notes drawn from its public listing and supporting pages.","max_points":140},{"criterion":"The 12 products are narrowed to a 
final shortlist of exactly 5, and the shortlist includes a named recommendation for best overall, best value, best throw, best for couples or larger beds, and best premium pick.","description":"The 12 products are narrowed to a final shortlist of exactly 5, and the shortlist includes a named recommendation for best overall, best value, best throw, best for couples or larger beds, and best premium pick.\n\nHow a grader verifies this: The final output presents exactly 5 finalists and assigns each of the five requested recommendation labels.","max_points":140},{"criterion":"A concise final buying memo recommends one single blanket to buy and explains why it wins over the other finalists.","description":"A concise final buying memo recommends one single blanket to buy and explains why it wins over the other finalists.\n\nHow a grader verifies this: The final memo explicitly names one product as the buy recommendation and gives a comparative rationale grounded in the recorded features, verification details, and review themes.","max_points":90},{"criterion":"The 5 finalist product tabs and the 2 most useful verification tabs are left open at the end for review.","description":"The 5 finalist product tabs and the 2 most useful verification tabs are left open at the end for review.\n\nHow a grader verifies this: The final browser state shows 7 kept-open tabs: 5 finalist product pages and 2 verification pages.","max_points":60}]}} +{"task_id":"7c0efe6388e3b7745a4db7e755b4e9b0a7b23508","confirmed_task":"I want help making a real purchase decision on a warm, military-style winter jacket, not just grabbing the first tactical-looking one I see. Start with Viktos, but widen out to other credible brands that sell the same general kind of jacket on public product pages. Compare exactly 12 jackets total from at least 6 brands, with Viktos definitely included, and focus on options that visibly read as military or tactical rather than fashion-only parkas. 
For each jacket, capture the product name, listed price, insulation or warmth information if shown, shell or weather-protection details, available colors, whether common tactical colors like black, ranger green, coyote, olive, or gray are offered, and whether size Large is shown or not shown. Also check the sizing chart or fit notes where available, plus the return policy or warranty page for each brand, because I do not want to get stuck with an expensive bad fit. Keep the browsing practical and visual: open the strongest product pages in separate tabs, compare photos closely so the styling stays in the para-military lane, and leave the best 4 jacket pages open at the end along with 2 useful sizing or return-policy pages. Then give me a final ranked shortlist of 5 jackets with a clear winner, a best value pick, and a best severe-cold pick, explaining the tradeoffs in warmth, style, price, and purchase risk. Create a tracker of all the options and their prices, along with their links in CryptoPad Spreadsheets.","website":"https://viktos.com","level":"hard","reference_length":3,"precomputed_rubric":{"items":[{"criterion":"Exactly 12 jackets are compared from at least 6 brands, and Viktos is included among those brands.","description":"Exactly 12 jackets are compared from at least 6 brands, and Viktos is included among those brands.\n\nHow a grader verifies this: The final comparison explicitly lists 12 distinct jacket product pages and shows brand names demonstrating coverage of at least 6 brands including Viktos.","max_points":180},{"criterion":"Each of the 12 jackets includes the requested recorded details: product name, listed price, insulation or warmth information if shown, shell or weather-protection details, available colors, whether black/ranger green/coyote/olive/gray are offered, and whether size Large is shown or not shown.","description":"Each of the 12 jackets includes the requested recorded details: product name, listed price, insulation or warmth 
information if shown, shell or weather-protection details, available colors, whether black/ranger green/coyote/olive/gray are offered, and whether size Large is shown or not shown.\n\nHow a grader verifies this: The final output contains one entry per jacket with all requested fields populated or marked not shown where unavailable.","max_points":240},{"criterion":"Sizing and purchase-risk checks are completed by reviewing sizing chart or fit notes where available and the return policy or warranty page for each brand.","description":"Sizing and purchase-risk checks are completed by reviewing sizing chart or fit notes where available and the return policy or warranty page for each brand.\n\nHow a grader verifies this: The final output includes fit or sizing-chart notes for the compared jackets where available and a return-policy or warranty note for each represented brand.","max_points":160},{"criterion":"The comparison stays focused on military or tactical-looking winter jackets rather than drifting into generic fashion outerwear.","description":"The comparison stays focused on military or tactical-looking winter jackets rather than drifting into generic fashion outerwear.\n\nHow a grader verifies this: The selected jackets' product pages and the final reasoning clearly indicate tactical or military-style positioning and visual styling fit the requested para-military lane.","max_points":120},{"criterion":"The browsing keeps the work visual and evidence-based by opening the strongest product pages in separate tabs and comparing product photos closely.","description":"The browsing keeps the work visual and evidence-based by opening the strongest product pages in separate tabs and comparing product photos closely.\n\nHow a grader verifies this: At least the strongest candidate product pages remain open at the end, and the final notes reference visual/photo-based style comparisons drawn from those public pages.","max_points":100},{"criterion":"The best 4 jacket product 
pages are left open at the end, along with 2 useful sizing or return-policy pages.","description":"The best 4 jacket product pages are left open at the end, along with 2 useful sizing or return-policy pages.\n\nHow a grader verifies this: The final browser state includes 4 open product tabs for the top jackets and 2 open sizing or return-policy tabs relevant to the purchase decision.","max_points":100},{"criterion":"A final ranked shortlist of 5 jackets is produced, including a clear overall winner, a best value pick, and a best severe-cold pick, with tradeoffs explained in warmth, style, price, and purchase risk, and a CryptoPad Spreadsheet tracker of all options, prices, and links is also created.","description":"A final ranked shortlist of 5 jackets is produced, including a clear overall winner, a best value pick, and a best severe-cold pick, with tradeoffs explained in warmth, style, price, and purchase risk, and a CryptoPad Spreadsheet tracker of all options, prices, and links is also created.\n\nHow a grader verifies this: The final response contains a ranked top-5 shortlist and explicitly labels the overall winner, best value, and best severe-cold option with rationale covering the requested tradeoffs; an open CryptoPad Spreadsheet records the jacket options, prices, and links.","max_points":100}]}} +{"task_id":"4f1f73202b21d2f30cec9737538f1184f119d5ec","confirmed_task":"I already have a show on YouTube and want to turn it into a real podcast setup, with Spotify as a priority but not necessarily by blindly using Spotify’s own hosting if another path is better. Please use public pages only to compare exactly 8 realistic podcast hosting or distribution options for a YouTube-first creator, including Spotify for Creators and 7 other major platforms you judge to be good fits. 
For each option, check the official site and record whether I can create a new show or import an existing one, whether distribution to Spotify and other podcast apps is supported, whether video podcasts are supported, the publicly shown free tier or lowest paid price, what public analytics or monetization features are advertised, and any obvious limitation for repurposing an existing YouTube show into podcast episodes; if a field is not shown, say 'not shown.' While doing this, keep the official pricing or features pages open for the 4 strongest options so I can compare them later. Then go through Spotify for Creators on its public pages far enough to confirm the current onboarding path for adding a podcast, including the live entry point that corresponds to 'Add your podcast' or its current equivalent, and also find Spotify’s public guidance about creating a new show versus importing one by RSS. Finally, give me a clear recommendation for the best low-cost path if my goal is to repurpose an existing YouTube show into audio now without boxing myself in later, explain why the top 2 options beat the rest, and leave the key Spotify and top-alternative tabs open.","website":"https://creators.spotify.com","level":"hard","reference_length":18,"precomputed_rubric":{"items":[{"criterion":"Exactly 8 realistic podcast hosting or distribution options are compared, including Spotify for Creators and 7 other major platforms judged to fit a YouTube-first creator.","description":"Exactly 8 realistic podcast hosting or distribution options are compared, including Spotify for Creators and 7 other major platforms judged to fit a YouTube-first creator.\n\nHow a grader verifies this: The final comparison includes 8 distinct options total, one of which is Spotify for Creators, with no missing platform entries.","max_points":180},{"criterion":"For each of the 8 options, the comparison records whether a new show can be created or an existing one imported, whether distribution to Spotify 
and other podcast apps is supported, whether video podcasts are supported, the public free tier or lowest paid price, public analytics or monetization features, and any obvious limitation for repurposing an existing YouTube show, using 'not shown' where needed.","description":"For each of the 8 options, the comparison records whether a new show can be created or an existing one imported, whether distribution to Spotify and other podcast apps is supported, whether video podcasts are supported, the public free tier or lowest paid price, public analytics or monetization features, and any obvious limitation for repurposing an existing YouTube show, using 'not shown' where needed.\n\nHow a grader verifies this: Each platform entry contains all requested fields, and missing public information is explicitly marked as 'not shown' rather than omitted.","max_points":270},{"criterion":"The official pricing or features pages for the 4 strongest options are kept open so the user can compare them later.","description":"The official pricing or features pages for the 4 strongest options are kept open so the user can compare them later.\n\nHow a grader verifies this: There is browser evidence that 4 official pricing/features tabs corresponding to the selected strongest options remain open at the end.","max_points":120},{"criterion":"Spotify for Creators is checked on public pages far enough to confirm the current onboarding path for adding a podcast, including the live entry point matching 'Add your podcast' or its current equivalent.","description":"Spotify for Creators is checked on public pages far enough to confirm the current onboarding path for adding a podcast, including the live entry point matching 'Add your podcast' or its current equivalent.\n\nHow a grader verifies this: The final result describes the current Spotify onboarding path and shows browser evidence from the relevant public Spotify for Creators page(s) that the add-podcast entry point was reached or 
verified.","max_points":160},{"criterion":"Spotify’s public guidance about creating a new show versus importing one by RSS is found and incorporated.","description":"Spotify’s public guidance about creating a new show versus importing one by RSS is found and incorporated.\n\nHow a grader verifies this: The final answer includes both Spotify paths—new show creation and RSS import—based on public Spotify guidance, with the supporting Spotify guidance page(s) open or clearly referenced in the browsing evidence.","max_points":120},{"criterion":"A clear final recommendation is given for the best low-cost path to repurpose an existing YouTube show into audio without boxing the creator in later, and it explains why the top 2 options beat the rest.","description":"A clear final recommendation is given for the best low-cost path to repurpose an existing YouTube show into audio without boxing the creator in later, and it explains why the top 2 options beat the rest.\n\nHow a grader verifies this: The conclusion names one best option and one runner-up, ties both directly to the stated low-cost and future-flexibility goal, and compares them against the other options rather than giving a generic summary.","max_points":150}]}} +{"task_id":"6e19b9c15e44d2971216e6cd3eee212e33df6586","confirmed_task":"I’m putting together a serious background brief on ongoing U.S. health system problems, and I don’t want a quick two-paragraph summary. Please use public sources to identify 5 ongoing national health system problems that are still being actively discussed or measured—things like affordability, insurance access, maternal health, mental health access, chronic disease burden, provider shortages, or similar issues if the evidence is stronger. 
For each of the 5 problems, compare how it affects at least 4 demographic groups differently, choosing from groups such as race/ethnicity, income level, age, sex, rural versus urban residents, disability status, or insurance status depending on what the sources actually show. Use high-quality public sources like CDC, CMS, NIH, AHRQ, KFF, Commonwealth Fund, and major peer-reviewed or academic-health sources when needed, and prefer pages with charts, maps, tables, or survey results that make the differences visible. Open and keep the strongest evidence tabs available for each problem, including at least one chart- or data-heavy page for every problem if public evidence exists. Then create one organized briefing document in CryptoPad that has exactly 5 sections, one per problem, and for each section include: a plain-language description of the problem, why it is ongoing, at least 2 supporting sources, at least 1 concrete way the burden differs across at least 4 demographic groups, and a short note on any important caveat like outdated data, differing definitions, or missing subgroup detail. End the briefing with a cross-cutting comparison section that names which 2 problems seem most unequal across demographic lines based on the evidence you found and explains why. Leave the finished briefing open, and also leave open the most useful evidence tabs so I can review the charts and source pages myself. Generate a CryptoPad Presentation for this information as well, so I can present on it.","website":"https://www.google.com","level":"hard","reference_length":10,"precomputed_rubric":{"items":[{"criterion":"A finished briefing document is created and left open, with exactly 5 sections covering 5 ongoing national U.S. health system problems plus a final cross-cutting comparison section, and the finished briefing is created in CryptoPad and left open.","description":"A finished briefing document is created and left open, with exactly 5 sections covering 5 ongoing national U.S. 
health system problems plus a final cross-cutting comparison section, and the finished briefing is created in CryptoPad and left open.\n\nHow a grader verifies this: Confirm the open final document contains 5 distinct problem sections and one ending comparison section, with no extra or missing problem sections; the open final deliverable is a CryptoPad briefing document with the requested sections.","max_points":180},{"criterion":"Each of the 5 problem sections includes a plain-language description of the problem and an explanation of why it is ongoing.","description":"Each of the 5 problem sections includes a plain-language description of the problem and an explanation of why it is ongoing.\n\nHow a grader verifies this: Review each section of the briefing document for both elements: a description and a why-it-is-ongoing explanation.","max_points":140},{"criterion":"For each of the 5 problems, the briefing uses at least 2 supporting public sources from high-quality health or policy sources such as CDC, CMS, NIH, AHRQ, KFF, Commonwealth Fund, or comparable academic-health sources.","description":"For each of the 5 problems, the briefing uses at least 2 supporting public sources from high-quality health or policy sources such as CDC, CMS, NIH, AHRQ, KFF, Commonwealth Fund, or comparable academic-health sources.\n\nHow a grader verifies this: Check that each section cites at least 2 public sources and that the cited sources are from the requested source types.","max_points":180},{"criterion":"For each of the 5 problems, the briefing describes at least 1 concrete way the burden differs across at least 4 demographic groups, using groups actually supported by the source evidence.","description":"For each of the 5 problems, the briefing describes at least 1 concrete way the burden differs across at least 4 demographic groups, using groups actually supported by the source evidence.\n\nHow a grader verifies this: Inspect each section for a disparity comparison involving at 
least 4 demographic groups and confirm the comparison is tied to cited evidence.","max_points":200},{"criterion":"The browsing session keeps the strongest evidence tabs available for each problem, including at least 1 chart-, map-, table-, or data-heavy public page for every problem when such evidence exists.","description":"The browsing session keeps the strongest evidence tabs available for each problem, including at least 1 chart-, map-, table-, or data-heavy public page for every problem when such evidence exists.\n\nHow a grader verifies this: Check that useful evidence tabs remain open across the 5 problems and that each problem has an open visual or data-heavy source page when available.","max_points":140},{"criterion":"Each of the 5 problem sections includes a short caveat note covering an important limitation such as outdated data, differing definitions, or missing subgroup detail.","description":"Each of the 5 problem sections includes a short caveat note covering an important limitation such as outdated data, differing definitions, or missing subgroup detail.\n\nHow a grader verifies this: Review the final document and confirm that every problem section has a caveat note of the requested type.","max_points":80},{"criterion":"The final cross-cutting comparison section identifies which 2 of the 5 problems appear most unequal across demographic lines based on the gathered evidence and explains why, and a CryptoPad Presentation is also created for presenting the findings.","description":"The final cross-cutting comparison section identifies which 2 of the 5 problems appear most unequal across demographic lines based on the gathered evidence and explains why, and a CryptoPad Presentation is also created for presenting the findings.\n\nHow a grader verifies this: Check the ending section of the briefing document for a clear selection of 2 problems and an evidence-based explanation for that judgment; an open CryptoPad Presentation summarizing the five problems 
and cross-cutting conclusions is available.","max_points":80}]}} +{"task_id":"d17c0e2905a166df99fa492edc8f7cc5a641de4e","confirmed_task":"I’m helping shape a future women’s cancer-equity outreach project, and I don’t want a generic summary — I need a browser-based evidence sweep that helps me decide which inequities are most important to focus on first. Please use public pages from authoritative sources such as NCI, CDC, ACS, NIH, major cancer centers, and peer-reviewed or professional society sources, and build me one organized briefing document that I could actually use. I want you to identify at least 12 distinct drivers of cancer inequities affecting women, but also group them by where they show up in the care pathway: prevention or risk exposure, screening and early detection, diagnosis, treatment access or quality, and survivorship or follow-up. To keep this grounded, compare at least 4 cancer areas that are especially relevant for women — for example breast, cervical, ovarian, and endometrial or another clearly justified substitution if the evidence is stronger — and note which drivers appear across multiple cancers versus which seem more specific to one cancer type. I also want at least 6 concrete intervention or program examples from reputable public sources that aim to reduce these inequities, such as navigation programs, screening access efforts, trial-inclusion initiatives, rural access models, or culturally tailored outreach, and for each one note what inequity it is trying to address and whether any outcome or evaluation is publicly described. As you work, keep the most important evidence tabs open, including at least 6 key source pages that directly support the final priorities, and leave open at least 2 public data or dashboard pages that help show the burden or disparity patterns. 
In the final briefing, end with a ranked top-5 priority list for where a women’s cancer-equity project should focus first, and make each priority include the main driver, the affected care stage, the cancer areas it shows up in, the strongest supporting sources, and a short explanation of why it belongs near the top.","website":"https://www.google.com","level":"hard","reference_length":4,"precomputed_rubric":{"items":[{"criterion":"An organized briefing document is produced using public authoritative sources and is structured around the requested care-pathway stages: prevention or risk exposure, screening and early detection, diagnosis, treatment access or quality, and survivorship or follow-up.","description":"An organized briefing document is produced using public authoritative sources and is structured around the requested care-pathway stages: prevention or risk exposure, screening and early detection, diagnosis, treatment access or quality, and survivorship or follow-up.\n\nHow a grader verifies this: Check that the final document exists, is organized by the five requested stages, and cites public sources from authoritative organizations or peer-reviewed/professional sources.","max_points":160},{"criterion":"The briefing identifies at least 12 distinct drivers of cancer inequities affecting women and groups them into the requested care-pathway categories.","description":"The briefing identifies at least 12 distinct drivers of cancer inequities affecting women and groups them into the requested care-pathway categories.\n\nHow a grader verifies this: Count the listed drivers in the document and confirm they total 12 or more, with each assigned to one of the specified care-pathway stages.","max_points":180},{"criterion":"At least 4 cancer areas especially relevant for women are compared, including breast, cervical, ovarian, and endometrial unless a clearly justified substitution is explicitly explained, and the document distinguishes cross-cutting drivers from 
cancer-specific ones.","description":"At least 4 cancer areas especially relevant for women are compared, including breast, cervical, ovarian, and endometrial unless a clearly justified substitution is explicitly explained, and the document distinguishes cross-cutting drivers from cancer-specific ones.\n\nHow a grader verifies this: Check that four cancer areas are covered, that any substitution is justified in the document, and that the write-up explicitly notes which drivers recur across multiple cancers versus which are more specific.","max_points":180},{"criterion":"The briefing includes at least 6 concrete intervention or program examples from reputable public sources, and each example notes the inequity addressed plus whether any public outcome or evaluation is described.","description":"The briefing includes at least 6 concrete intervention or program examples from reputable public sources, and each example notes the inequity addressed plus whether any public outcome or evaluation is described.\n\nHow a grader verifies this: Count the intervention examples in the document and confirm there are at least six, each with the targeted inequity and an outcome/evaluation note or an explicit indication that it is not shown.","max_points":160},{"criterion":"The browser session keeps open at least 6 key source pages that directly support the final priorities and at least 2 public data or dashboard pages showing burden or disparity patterns.","description":"The browser session keeps open at least 6 key source pages that directly support the final priorities and at least 2 public data or dashboard pages showing burden or disparity patterns.\n\nHow a grader verifies this: Inspect the open tabs at the end and confirm there are at least six substantive evidence tabs plus two data/dashboard tabs relevant to the final briefing.","max_points":120},{"criterion":"The final briefing ends with a ranked top-5 priority list for where a women’s cancer-equity project should focus 
first, and each priority includes the main driver, affected care stage, cancer areas involved, strongest supporting sources, and a short why-it-ranks-high explanation.","description":"The final briefing ends with a ranked top-5 priority list for where a women’s cancer-equity project should focus first, and each priority includes the main driver, affected care stage, cancer areas involved, strongest supporting sources, and a short why-it-ranks-high explanation.\n\nHow a grader verifies this: Check that exactly five ranked priorities are present and that each contains all five requested elements.","max_points":200}]}} +{"task_id":"1f90184fb61690b3d2f0350196418d726a850a7d","confirmed_task":"I’m seriously considering becoming a personal trainer, but I don’t want a shallow summary of one NASM page. Please build me a real browser-based decision guide for choosing the best certification path to start this career. Start with NASM, ACE, ISSA, ACSM, and NSCA, and use each organization’s public official pages to capture the core CPT option, minimum eligibility requirements, study format, exam or test details, whether CPR/AED is required, renewal cycle, continuing-education expectations, and the current public price or ‘not shown’ if it isn’t listed. For NASM specifically, go deeper and compare at least 3 distinct CPT package options side by side so I can tell what extra support or materials I’d actually be paying for. Then pressure-test the market by checking at least 24 current public personal-trainer job listings across 6 major U.S. metro areas, and record which certifications employers explicitly accept or prefer, plus whether they mention CPR/AED, experience, or specialties. Keep the key official certification pages open for all 5 organizations, and also keep open at least 6 representative job-listing tabs that show the employer demand evidence. 
Finally, put everything into one organized comparison sheet or document with one section for the 5 certification bodies, one section for the NASM package comparison, one section summarizing the 24 job listings, and a final recommendation for the best option in each of these 3 scenarios: cheapest credible path, best-supported beginner path, and strongest choice for broad employer recognition. Leave the finished comparison open at the end along with the most useful evidence tabs.","website":"https://pages-delivery.nasm.org","level":"hard","reference_length":23,"precomputed_rubric":{"items":[{"criterion":"A finished comparison sheet or document is created and left open, with separate sections for the 5 certification bodies, the NASM package comparison, the 24 job listings, and the final 3-scenario recommendation.","description":"A finished comparison sheet or document is created and left open, with separate sections for the 5 certification bodies, the NASM package comparison, the 24 job listings, and the final 3-scenario recommendation.\n\nHow a grader verifies this: Check that the final artifact is open and visibly organized into the four requested sections, and that it contains entries for all required comparisons and recommendations.","max_points":180},{"criterion":"The task compares NASM, ACE, ISSA, ACSM, and NSCA using official public pages, capturing each organization’s CPT option, minimum eligibility requirements, study format, exam or test details, CPR/AED requirement status, renewal cycle, continuing-education expectations, and current public price or 'not shown.'","description":"The task compares NASM, ACE, ISSA, ACSM, and NSCA using official public pages, capturing each organization’s CPT option, minimum eligibility requirements, study format, exam or test details, CPR/AED requirement status, renewal cycle, continuing-education expectations, and current public price or 'not shown.'\n\nHow a grader verifies this: Review the final artifact and the open 
provider tabs to confirm all 5 organizations are covered and each requested field is present from official public pages.","max_points":220},{"criterion":"NASM is examined in extra depth with a side-by-side comparison of at least 3 distinct CPT package options, showing what added support or materials each package includes.","description":"NASM is examined in extra depth with a side-by-side comparison of at least 3 distinct CPT package options, showing what added support or materials each package includes.\n\nHow a grader verifies this: Confirm the artifact contains at least 3 NASM package entries with package-specific inclusions, and that relevant NASM tabs are open as evidence.","max_points":150},{"criterion":"At least 24 current public personal-trainer job listings across 6 major U.S. metro areas are checked, and the results record which certifications employers explicitly accept or prefer, plus whether CPR/AED, experience, or specialties are mentioned.","description":"At least 24 current public personal-trainer job listings across 6 major U.S. 
metro areas are checked, and the results record which certifications employers explicitly accept or prefer, plus whether CPR/AED, experience, or specialties are mentioned.\n\nHow a grader verifies this: Count the recorded job listings and metro areas in the final artifact, and confirm the requested employer-demand fields are captured for the listings reviewed.","max_points":200},{"criterion":"Key browser evidence is preserved by leaving open the official certification pages for all 5 organizations and at least 6 representative job-listing tabs showing employer demand.","description":"Key browser evidence is preserved by leaving open the official certification pages for all 5 organizations and at least 6 representative job-listing tabs showing employer demand.\n\nHow a grader verifies this: Inspect the open tabs at the end to confirm that all 5 official provider pages and at least 6 job-listing pages remain open.","max_points":100},{"criterion":"The final recommendation explicitly names the best option for each of these 3 scenarios: cheapest credible path, best-supported beginner path, and strongest choice for broad employer recognition.","description":"The final recommendation explicitly names the best option for each of these 3 scenarios: cheapest credible path, best-supported beginner path, and strongest choice for broad employer recognition.\n\nHow a grader verifies this: Check the recommendation section of the final artifact for 3 distinct scenario-based picks, each tied back to the comparison and job-listing evidence.","max_points":150}]}} +{"task_id":"d6007c19e6419c9eefdd57996fc151a2263b22fa","confirmed_task":"I want to choose an AI note-taking service I could actually subscribe to without regretting it, and I do not want ChatGPT or Grok. 
Please do a real browser-based comparison of 10 to 12 AI note-taking tools that are publicly available and plausibly suitable for a beginner, including meeting-note tools, voice-note tools, and general AI note apps if they clearly support note capture and summarization. For each one, use official public pages first to record the lowest paid plan price, whether there is a free tier or trial, what the cancellation flow seems to be from public billing/help pages, supported platforms, and the clearest beginner-onboarding evidence you can find from the product site or official app listing. Also check public founder or leadership pages and note whether the company appears to be woman-founded or woman-led; if that is not clearly shown, say not shown rather than guessing. Keep at least 6 of the strongest evidence tabs open at the end, including a mix of pricing, cancellation/help, and product/demo pages from the finalists. Put the results into one organized comparison sheet or document with one entry per tool, then rank your top 5 overall for a beginner on a budget, call out the best cheapest option, the best easiest-to-cancel option, and the best woman-founded or woman-led option if any qualifying candidate is clearly supported by public evidence. 
End with a short recommendation for which one I should try first and why, and a spreadsheet of the different options and their pricing.","website":"https://34.170.30.232","level":"hard","reference_length":4,"precomputed_rubric":{"items":[{"criterion":"An organized comparison sheet or document exists with one entry per tool for 10 to 12 publicly available AI note-taking tools, excluding ChatGPT and Grok.","description":"An organized comparison sheet or document exists with one entry per tool for 10 to 12 publicly available AI note-taking tools, excluding ChatGPT and Grok.\n\nHow a grader verifies this: Check that the final artifact is open and contains 10 to 12 distinct tools, each relevant to AI note-taking, with ChatGPT and Grok excluded.","max_points":180},{"criterion":"For each tool, the artifact records the lowest paid plan price, whether there is a free tier or trial, supported platforms, and the clearest beginner-onboarding evidence from an official product page or official app listing.","description":"For each tool, the artifact records the lowest paid plan price, whether there is a free tier or trial, supported platforms, and the clearest beginner-onboarding evidence from an official product page or official app listing.\n\nHow a grader verifies this: Inspect entries in the artifact for all required fields and confirm they are populated from official public sources or marked clearly when unavailable.","max_points":180},{"criterion":"For each tool, the artifact records what the cancellation flow seems to be from public billing, subscription, or help pages, using official sources where possible.","description":"For each tool, the artifact records what the cancellation flow seems to be from public billing, subscription, or help pages, using official sources where possible.\n\nHow a grader verifies this: Check that each tool has a cancellation or subscription-management note tied to public policy/help content, with unclear cases marked clearly instead of 
inferred.","max_points":160},{"criterion":"For each tool, the artifact notes whether the company appears to be woman-founded or woman-led from public founder or leadership pages, and uses 'not shown' when that is not clearly supported.","description":"For each tool, the artifact notes whether the company appears to be woman-founded or woman-led from public founder or leadership pages, and uses 'not shown' when that is not clearly supported.\n\nHow a grader verifies this: Review the ownership or leadership field for every tool and confirm it is either supported by public evidence or explicitly marked 'not shown' without speculation.","max_points":140},{"criterion":"At least 6 strong evidence tabs are left open at the end, including a mix of pricing pages, cancellation/help pages, and product/demo or official app-listing pages from the finalists.","description":"At least 6 strong evidence tabs are left open at the end, including a mix of pricing pages, cancellation/help pages, and product/demo or official app-listing pages from the finalists.\n\nHow a grader verifies this: Confirm that at least 6 relevant tabs remain open and that they visibly cover the requested evidence types across finalist products.","max_points":140},{"criterion":"The final output ranks the top 5 overall tools for a beginner on a budget and explicitly identifies the best cheapest option, the best easiest-to-cancel option, and the best woman-founded or woman-led option if any qualifying candidate is clearly supported, and a spreadsheet of the compared options and their pricing is also produced.","description":"The final output ranks the top 5 overall tools for a beginner on a budget and explicitly identifies the best cheapest option, the best easiest-to-cancel option, and the best woman-founded or woman-led option if any qualifying candidate is clearly supported, and a spreadsheet of the compared options and their pricing is also produced.\n\nHow a grader verifies this: Check the final ranked 
section for exactly these recommendation categories and ensure each is directly traceable to the comparison findings; the deliverables include a spreadsheet listing the tools and their pricing details.","max_points":120},{"criterion":"The task ends with a short recommendation of which tool to try first and why.","description":"The task ends with a short recommendation of which tool to try first and why.\n\nHow a grader verifies this: Confirm there is a concise final recommendation naming one tool to try first with a reason grounded in the comparison.","max_points":80}]}} +{"task_id":"8e358ff165d3ade4cabe8a64cc95b6058a9aa107","confirmed_task":"I’m trying to decide whether the adidas Everyset Training Shoes for Men on Sierra are actually the best buy for me in men’s size 12, or just one decent option in a crowded field. Please start on Sierra, confirm the live product page for the adidas Everyset, and note the current price, any strike-through/original price if shown, available colors, and whether size 12 is in stock. Then build me a serious comparison set of exactly 10 men’s training or cross-training shoes in size 12 total, including the Sierra adidas pair plus 9 comparable options from public product pages on Sierra and other major retailers or brand sites. For each shoe, capture the product name, retailer, current price, original price if shown, whether size 12 is available, intended use cues from the page, and one or two practical notes from product details or review summaries such as stability, cushioning, versatility, or durability; if any field is missing, write not shown. I also want you to sanity-check the buying experience, so for each retailer represented in your comparison, open the public return-policy or shipping-information page and note the key basics that would matter to a normal buyer, like return window, obvious return fees if stated, and any free-shipping threshold if publicly shown. 
After that, compare the 10 shoes and narrow them to a final shortlist of 4 that cover different priorities: best overall gym shoe, best value, best for heavier lifting/stability, and best for mixed cardio-and-weights use. Keep the most useful product tabs open for those 4 finalists, plus the Sierra adidas Everyset page and the key policy pages you used, so I can review the evidence myself. End with a concise recommendation telling me whether I should buy the Sierra adidas Everyset now or pick one of the alternatives instead, and why.","website":"https://www.sierra.com","level":"hard","reference_length":7,"precomputed_rubric":{"items":[{"criterion":"The browsing session confirms the live Sierra product page for the adidas Everyset Training Shoes for Men and records its current price, original/strike-through price if shown, available colors, and whether men's size 12 is in stock.","description":"The browsing session confirms the live Sierra product page for the adidas Everyset Training Shoes for Men and records its current price, original/strike-through price if shown, available colors, and whether men's size 12 is in stock.\n\nHow a grader verifies this: Check that the final output includes these Sierra-specific details and that the Sierra adidas product page is left open as requested.","max_points":160},{"criterion":"The comparison set contains exactly 10 men's training or cross-training shoes in size 12 total, including the Sierra adidas pair plus 9 comparable options from public product pages on Sierra and/or other major retailers or brand sites.","description":"The comparison set contains exactly 10 men's training or cross-training shoes in size 12 total, including the Sierra adidas pair plus 9 comparable options from public product pages on Sierra and/or other major retailers or brand sites.\n\nHow a grader verifies this: Count the shoes in the final comparison and confirm there are exactly 10 total with the Sierra adidas included and all sourced from public 
product pages.","max_points":220},{"criterion":"For each of the 10 shoes, the final comparison records product name, retailer, current price, original price if shown, whether size 12 is available, intended-use cues from the page, and one or two practical notes from product details or review summaries, using 'not shown' where needed.","description":"For each of the 10 shoes, the final comparison records product name, retailer, current price, original price if shown, whether size 12 is available, intended-use cues from the page, and one or two practical notes from product details or review summaries, using 'not shown' where needed.\n\nHow a grader verifies this: Review all 10 entries and confirm each required field is present or explicitly marked 'not shown' when unavailable.","max_points":220},{"criterion":"For each retailer represented in the comparison, a public return-policy or shipping-information page is opened and the final output notes the buyer-relevant basics: return window, obvious return fees if stated, and any free-shipping threshold if publicly shown.","description":"For each retailer represented in the comparison, a public return-policy or shipping-information page is opened and the final output notes the buyer-relevant basics: return window, obvious return fees if stated, and any free-shipping threshold if publicly shown.\n\nHow a grader verifies this: Match the retailers in the shoe comparison to corresponding policy pages and confirm the requested policy basics are summarized for each represented retailer.","max_points":140},{"criterion":"The final output narrows the 10 shoes to a shortlist of exactly 4 finalists covering the four requested priorities: best overall gym shoe, best value, best for heavier lifting/stability, and best for mixed cardio-and-weights use.","description":"The final output narrows the 10 shoes to a shortlist of exactly 4 finalists covering the four requested priorities: best overall gym shoe, best value, best for heavier 
lifting/stability, and best for mixed cardio-and-weights use.\n\nHow a grader verifies this: Check that there are exactly 4 finalists and that each of the four named priority categories is filled by one finalist.","max_points":140},{"criterion":"The most useful product tabs are left open for the 4 finalists, along with the Sierra adidas Everyset page and the key policy pages used, and the session ends with a concise buy-now recommendation on whether to choose the Sierra adidas Everyset or an alternative.","description":"The most useful product tabs are left open for the 4 finalists, along with the Sierra adidas Everyset page and the key policy pages used, and the session ends with a concise buy-now recommendation on whether to choose the Sierra adidas Everyset or an alternative.\n\nHow a grader verifies this: Confirm the requested evidence tabs remain open and that the final recommendation explicitly answers whether to buy the Sierra adidas Everyset now or choose another option instead, with a brief why.","max_points":120}]}} +{"task_id":"ddfec1f140fd9fb3611e241c87c220ab77985b01","confirmed_task":"I’m planning a future climate-controlled shed that I might use as a backyard office or workshop, and I don’t want a shallow 'watch two videos and guess' answer. Please do a real browser-based comparison of insulation approaches and help me choose one assembly I could actually build. Start by reviewing at least 8 strong public sources total: at least 3 YouTube videos that show or explain shed or small outbuilding insulation, at least 3 non-video sources from building-science, manufacturer, or energy-efficiency guidance pages, and at least 2 retail product pages with current specs or pricing. Use those sources to compare exactly 3 full insulation strategies for a climate-controlled shed—for example some mix of fiberglass or mineral wool batts, rigid foam board, and spray foam—covering walls, roof/ceiling, and floor, not just one cavity type. 
For each strategy, note the stated or implied R-value approach, how moisture or vapor control is handled, how much interior space it consumes, the likely DIY difficulty, and the main failure risks or watch-outs; if a source does not show something, write 'not shown.' Then build one practical recommendation for a common small shed size like 10x12, including a suggested wall assembly, roof/ceiling assembly, and floor assembly that work together for a climate-controlled setup. After that, use public retail pages to price out a rough materials basket for each of the 3 strategies as realistically as you can from visible package sizes or unit pricing, and compare the total estimated material cost with 'not shown' where needed. Keep the most convincing evidence tabs open for the final winner, including at least one video tab, one technical/spec page, and one or two retail product pages, so I can sanity-check the recommendation visually afterward. Finish with a concise decision memo in Cryptopad Documents that names the best overall option, the best budget option, and the option you would avoid unless a special condition makes it worthwhile.","website":"https://www.youtube.com","level":"hard","reference_length":17,"precomputed_rubric":{"items":[{"criterion":"The browsing session uses at least 8 public sources total, including at least 3 YouTube videos, at least 3 non-video building-science/manufacturer/energy-efficiency pages, and at least 2 retail product pages.","description":"The browsing session uses at least 8 public sources total, including at least 3 YouTube videos, at least 3 non-video building-science/manufacturer/energy-efficiency pages, and at least 2 retail product pages.\n\nHow a grader verifies this: Count the sources actually used in the final synthesis and confirm they meet the requested source-type minimums.","max_points":160},{"criterion":"Exactly 3 full insulation strategies are compared, and each strategy covers walls, roof/ceiling, and floor rather 
than discussing only one part of the shed.","description":"Exactly 3 full insulation strategies are compared, and each strategy covers walls, roof/ceiling, and floor rather than discussing only one part of the shed.\n\nHow a grader verifies this: Check that the final comparison has three and only three strategies, with wall, roof/ceiling, and floor treatment described for each one.","max_points":180},{"criterion":"For each of the 3 strategies, the comparison records the R-value approach, moisture or vapor-control approach, interior space impact, DIY difficulty, and main failure risks or watch-outs, using 'not shown' where a source does not provide a field.","description":"For each of the 3 strategies, the comparison records the R-value approach, moisture or vapor-control approach, interior space impact, DIY difficulty, and main failure risks or watch-outs, using 'not shown' where a source does not provide a field.\n\nHow a grader verifies this: Review each strategy entry and confirm all requested fields are present, with 'not shown' used when information is missing.","max_points":180},{"criterion":"A single recommended climate-controlled shed assembly for a 10x12 shed is produced, including a wall assembly, roof/ceiling assembly, and floor assembly that are presented as one coherent system.","description":"A single recommended climate-controlled shed assembly for a 10x12 shed is produced, including a wall assembly, roof/ceiling assembly, and floor assembly that are presented as one coherent system.\n\nHow a grader verifies this: Confirm the final recommendation names one overall winner and specifies all three assembly parts for the 10x12 shed.","max_points":180},{"criterion":"A rough materials basket and estimated material cost is produced for each of the 3 strategies using visible public retail specs or pricing, with 'not shown' for anything unavailable.","description":"A rough materials basket and estimated material cost is produced for each of the 3 strategies 
using visible public retail specs or pricing, with 'not shown' for anything unavailable.\n\nHow a grader verifies this: Check that each strategy has a materials/pricing section grounded in retail product pages and that missing values are marked 'not shown' rather than invented.","max_points":140},{"criterion":"The final output includes a concise decision memo naming the best overall option, the best budget option, and the option to avoid unless a special condition makes it worthwhile, and the concise decision memo is written in CryptoPad Documents.","description":"The final output includes a concise decision memo naming the best overall option, the best budget option, and the option to avoid unless a special condition makes it worthwhile, and the concise decision memo is written in CryptoPad Documents.\n\nHow a grader verifies this: Confirm the memo includes all three requested judgments and that each is directly tied to the comparison findings; the final decision memo is present as an open CryptoPad Document.","max_points":80},{"criterion":"Useful browser evidence is left open for the winning approach, including at least one video tab, one technical/spec page, and one or two retail product pages.","description":"Useful browser evidence is left open for the winning approach, including at least one video tab, one technical/spec page, and one or two retail product pages.\n\nHow a grader verifies this: Inspect the remaining open tabs and confirm they include the requested mix of evidence pages relevant to the recommended winner.","max_points":80}]}} +{"task_id":"fb2f8bea3fa9528a581ce9e46bcc552c93e186a6","confirmed_task":"I’m trying to choose a bar-catering company for a future private event in the Minneapolis area, and Steady Pour is one of the vendors I want seriously evaluated rather than just looked up in isolation. 
Please start with Steady Pour’s public site, pull every bar-catering or home-bar service option you can find, and record the listed pricing if it’s published; if a price is missing, mark it clearly as not shown instead of guessing. Then broaden this into a real comparison by finding 7 other Minneapolis-area mobile bar or bar-catering companies with public pages, so we end up with exactly 8 total vendors including Steady Pour. For each vendor, use only public pages to capture the core offering types, whether alcohol is included or BYO, staffing or bartender details if shown, minimum guest count or event minimum if shown, service area, pricing or starting price if shown, and any obvious extras like mixers, glassware, mocktails, coffee cart, trailer bar, or dry hire options; use not shown wherever the site does not publish something. As you work, keep the strongest evidence visible by opening the main service or pricing page for all 8 vendors, and also open photo, gallery, or social-proof pages for at least 4 of them so I can visually compare presentation quality. Compare the vendors for two realistic use cases: a budget-conscious casual party and a more polished wedding-style event, and tell me which vendors look strongest for each case based only on what their public pages support. 
At the end, give me one organized comparison with all 8 vendors, call out where Steady Pour stands on pricing transparency and service fit versus the others, recommend a top 3 shortlist, and leave the most useful service/pricing tabs open for Steady Pour plus the two best alternatives.","website":"https://www.steadypour.com","level":"hard","reference_length":7,"precomputed_rubric":{"items":[{"criterion":"The final comparison covers exactly 8 total Minneapolis-area bar-catering vendors, including Steady Pour plus 7 other vendors with public pages.","description":"The final comparison covers exactly 8 total Minneapolis-area bar-catering vendors, including Steady Pour plus 7 other vendors with public pages.\n\nHow a grader verifies this: Check the final output for exactly 8 distinct vendors and confirm Steady Pour is included alongside 7 additional Minneapolis-area mobile bar or bar-catering companies.","max_points":160},{"criterion":"Steady Pour’s public site is used to capture every bar-catering or home-bar service option found, with listed pricing recorded when published and 'not shown' used when pricing is missing.","description":"Steady Pour’s public site is used to capture every bar-catering or home-bar service option found, with listed pricing recorded when published and 'not shown' used when pricing is missing.\n\nHow a grader verifies this: Review the Steady Pour entry and supporting open tab(s) to confirm all relevant service options found on the site are summarized and each has either listed pricing or an explicit 'not shown' note.","max_points":170},{"criterion":"For each of the 8 vendors, the comparison records the requested public-page fields: core offering types, whether alcohol is included or BYO, staffing or bartender details if shown, minimum guest count or event minimum if shown, service area, pricing or starting price if shown, and notable extras, with 'not shown' used for missing fields.","description":"For each of the 8 vendors, the comparison 
records the requested public-page fields: core offering types, whether alcohol is included or BYO, staffing or bartender details if shown, minimum guest count or event minimum if shown, service area, pricing or starting price if shown, and notable extras, with 'not shown' used for missing fields.\n\nHow a grader verifies this: Inspect the final comparison row-by-row or vendor-by-vendor to confirm every requested field is present for all 8 vendors and that missing information is marked 'not shown' rather than inferred.","max_points":240},{"criterion":"The browser session keeps open the main service or pricing page for all 8 vendors, plus photo, gallery, or social-proof pages for at least 4 of them so presentation quality can be visually compared.","description":"The browser session keeps open the main service or pricing page for all 8 vendors, plus photo, gallery, or social-proof pages for at least 4 of them so presentation quality can be visually compared.\n\nHow a grader verifies this: Confirm that open tabs include one primary service/pricing page for each of the 8 vendors and at least 4 additional tabs showing gallery, photos, reviews, or similar visual/social-proof pages.","max_points":140},{"criterion":"The final synthesis explicitly compares the 8 vendors for two use cases: a budget-conscious casual party and a polished wedding-style event.","description":"The final synthesis explicitly compares the 8 vendors for two use cases: a budget-conscious casual party and a polished wedding-style event.\n\nHow a grader verifies this: Check that both use cases are addressed separately and that the comparison explains which vendors fit each scenario based on evidence from public pages.","max_points":110},{"criterion":"The final recommendation identifies a top 3 shortlist and explicitly states where Steady Pour stands on pricing transparency and service fit versus the other vendors.","description":"The final recommendation identifies a top 3 shortlist and explicitly 
states where Steady Pour stands on pricing transparency and service fit versus the other vendors.\n\nHow a grader verifies this: Review the conclusion to confirm a ranked or clearly named top 3 exists and that Steady Pour is directly positioned against competitors on transparency and fit.","max_points":100},{"criterion":"The most useful tabs are left open for Steady Pour and the two best alternative vendors, centered on their service or pricing pages.","description":"The most useful tabs are left open for Steady Pour and the two best alternative vendors, centered on their service or pricing pages.\n\nHow a grader verifies this: Confirm that the ending browser state retains the key service/pricing tabs for Steady Pour and the two recommended alternative vendors.","max_points":80}]}} +{"task_id":"7bfd58a150de3345ae03d53d828424d95a91d301","confirmed_task":"I’m trying to figure out whether Netflix alone can carry a really good mystery-TV run for me over the next few months, not just hand me a random list of 5 shows. Please start on Netflix and identify at least 20 mystery series that are actually available there, using Netflix’s browse/search/title pages as the backbone of the sweep. Then compare them on the public web so I can make a real decision: for each title, capture the basic hook, whether it’s more detective / crime mystery / thriller / supernatural mystery / puzzle-box, whether it looks finished or still ongoing if that is publicly shown, and the rough viewing commitment in episodes or seasons when available. I also want quality and fit, so cross-check each title with public review or metadata pages and note an IMDb score, a Rotten Tomatoes score, or ‘not shown’ if one of those isn’t available. From that larger pool, narrow it to exactly 10 strongest options for me, making sure the final 10 are not all the same kind of mystery and include at least 2 non-English series if Netflix has enough good candidates. 
For those final 10, open and keep the key evidence tabs available so I can inspect them later: keep the Netflix title pages open for the final 10, plus at least 5 outside pages that were especially useful for comparing reception or episode counts. End with a clear ranked watchlist from 1 to 10 that tells me which series to start with first, which ones are best if I want something short and high-quality, which ones are best for a long binge, and whether Netflix’s mystery lineup actually looks deep enough that I would not need another streaming service right now.","website":"https://www.netflix.com","level":"hard","reference_length":6,"precomputed_rubric":{"items":[{"criterion":"At least 20 mystery TV series available on Netflix are identified from Netflix browse/search/title pages as the comparison pool.","description":"At least 20 mystery TV series available on Netflix are identified from Netflix browse/search/title pages as the comparison pool.\n\nHow a grader verifies this: The final output lists 20 or more distinct mystery series and shows they were sourced from Netflix pages rather than only outside summaries.","max_points":180},{"criterion":"Each title in the comparison pool includes the requested decision details: basic hook, subgenre classification, finished/ongoing status when publicly shown, and rough viewing commitment in episodes or seasons when available.","description":"Each title in the comparison pool includes the requested decision details: basic hook, subgenre classification, finished/ongoing status when publicly shown, and rough viewing commitment in episodes or seasons when available.\n\nHow a grader verifies this: For each of the 20+ titles, the recorded entry contains those fields or explicitly says 'not shown' where the public pages do not provide them.","max_points":180},{"criterion":"Each title is cross-checked on the public web for quality signals, with an IMDb score, a Rotten Tomatoes score, or 'not shown' when 
unavailable.","description":"Each title is cross-checked on the public web for quality signals, with an IMDb score, a Rotten Tomatoes score, or 'not shown' when unavailable.\n\nHow a grader verifies this: The comparison pool includes public-source quality metadata for every title, and missing values are marked 'not shown' rather than omitted.","max_points":160},{"criterion":"The larger pool is narrowed to exactly 10 strongest options, and the final 10 are meaningfully varied rather than all being the same kind of mystery, including at least 2 non-English series if enough good candidates were found.","description":"The larger pool is narrowed to exactly 10 strongest options, and the final 10 are meaningfully varied rather than all being the same kind of mystery, including at least 2 non-English series if enough good candidates were found.\n\nHow a grader verifies this: The final shortlist contains exactly 10 titles, reflects multiple mystery subtypes, and includes 2 or more non-English entries unless the output explicitly explains that Netflix did not provide enough strong candidates.","max_points":170},{"criterion":"Key browser evidence is kept open: the Netflix title pages for the final 10, plus at least 5 outside pages that were especially useful for comparing reception or episode counts.","description":"Key browser evidence is kept open: the Netflix title pages for the final 10, plus at least 5 outside pages that were especially useful for comparing reception or episode counts.\n\nHow a grader verifies this: The session ends with those evidence tabs still open and aligned with the titles used in the final shortlist.","max_points":130},{"criterion":"A clear ranked watchlist from 1 to 10 is produced, including which series to start first, which are best for a short high-quality watch, and which are best for a long binge.","description":"A clear ranked watchlist from 1 to 10 is produced, including which series to start first, which are best for a short high-quality 
watch, and which are best for a long binge.\n\nHow a grader verifies this: The final synthesis contains a numbered ranking of all 10 shortlisted series and explicitly identifies the best starting pick, short-watch picks, and long-binge picks.","max_points":100},{"criterion":"The final recommendation explicitly answers whether Netflix’s mystery lineup looks deep enough that the user would not need another streaming service right now.","description":"The final recommendation explicitly answers whether Netflix’s mystery lineup looks deep enough that the user would not need another streaming service right now.\n\nHow a grader verifies this: The conclusion makes a direct yes/no or nuanced recommendation about Netflix-only viability and ties it to the evidence gathered from the 20-title sweep and 10-title shortlist.","max_points":80}]}} +{"task_id":"ad918b380d5fb668438fada2d0ca9cad172f759c","confirmed_task":"I’m planning a special-occasion future trip to New York City and I don’t want just three fancy hotel names — I want a real luxury-hotel decision I could actually book from. Start with Google Hotels or Google Travel and build me a comparison set of exactly 12 NYC luxury hotels that are clearly positioned at the top end of the market, ideally true 5-star properties when that’s shown. Use the same future 3-night stay window for all 12 so the prices are comparable, and record the nightly rate or average nightly rate shown there, plus any resort, destination, or mandatory fees if they’re publicly shown. Make sure the 12 hotels cover at least 4 distinct NYC luxury-stay areas such as Central Park South, Midtown/Fifth Avenue, Downtown/Tribeca/SoHo, and one other area if relevant. Then open the official property pages for the 8 strongest contenders and verify the core details that matter for an actual decision: room style and size cues if shown, standout amenities like spa/pool, dining on-site, view/location advantages, and cancellation flexibility if publicly listed. 
For those 8 contenders, also use maps and photos to sanity-check what the surrounding area feels like and how convenient each one is for a luxury leisure trip. After that, narrow the list to a final 6-hotel shortlist and give me a structured recommendation: best overall luxury stay, best classic NYC splurge, best quieter or more private option, and best value-for-luxury among the finalists. Please keep the most useful tabs open at the end: the Google hotel results page for the search, the official pages for the final 6 hotels, and at least 2 map or photo views that help show the neighborhood tradeoffs.","website":"https://www.google.com","level":"hard","reference_length":3,"precomputed_rubric":{"items":[{"criterion":"A comparison set of exactly 12 NYC luxury hotels is created from Google Hotels or Google Travel using the same future 3-night stay window for all 12, and each entry includes the nightly rate or average nightly rate shown there.","description":"A comparison set of exactly 12 NYC luxury hotels is created from Google Hotels or Google Travel using the same future 3-night stay window for all 12, and each entry includes the nightly rate or average nightly rate shown there.\n\nHow a grader verifies this: Check that the final output lists exactly 12 hotels, all tied to one consistent 3-night future stay window, with a displayed nightly or average nightly rate recorded for each from the Google hotel search workflow.","max_points":200},{"criterion":"The 12 hotels span at least 4 distinct NYC luxury-stay areas, including Central Park South, Midtown/Fifth Avenue, Downtown/Tribeca/SoHo, and one additional relevant area if used.","description":"The 12 hotels span at least 4 distinct NYC luxury-stay areas, including Central Park South, Midtown/Fifth Avenue, Downtown/Tribeca/SoHo, and one additional relevant area if used.\n\nHow a grader verifies this: Check the final comparison for neighborhood labeling or grouping that covers at least 4 distinct areas and includes 
the specifically requested core areas.","max_points":140},{"criterion":"For all 12 hotels, any resort, destination, or mandatory fees are recorded when publicly shown.","description":"For all 12 hotels, any resort, destination, or mandatory fees are recorded when publicly shown.\n\nHow a grader verifies this: Inspect the comparison and confirm fee information is included wherever publicly shown, with missing cases left absent or clearly noted rather than invented.","max_points":120},{"criterion":"The official property pages for the 8 strongest contenders are opened and used to verify decision-critical details: room style and size cues if shown, standout amenities like spa/pool, on-site dining, location or view advantages, and cancellation flexibility if publicly listed.","description":"The official property pages for the 8 strongest contenders are opened and used to verify decision-critical details: room style and size cues if shown, standout amenities like spa/pool, on-site dining, location or view advantages, and cancellation flexibility if publicly listed.\n\nHow a grader verifies this: Confirm that 8 contenders were advanced to official-page review and that the requested categories of details were captured from those official pages for each contender where publicly available.","max_points":200},{"criterion":"For those 8 contenders, maps and photos are used to sanity-check the surrounding area and leisure-trip convenience.","description":"For those 8 contenders, maps and photos are used to sanity-check the surrounding area and leisure-trip convenience.\n\nHow a grader verifies this: Look for evidence that map/photo views were consulted for the 8 contenders and that neighborhood feel or convenience observations were incorporated into the comparison.","max_points":120},{"criterion":"A final 6-hotel shortlist is produced with four explicit recommendations: best overall luxury stay, best classic NYC splurge, best quieter or more private option, and best 
value-for-luxury among the finalists.","description":"A final 6-hotel shortlist is produced with four explicit recommendations: best overall luxury stay, best classic NYC splurge, best quieter or more private option, and best value-for-luxury among the finalists.\n\nHow a grader verifies this: Check that the output narrows to exactly 6 finalists and includes all four named recommendation categories with a clearly chosen hotel for each.","max_points":140},{"criterion":"The most useful evidence tabs are left open at the end: the Google hotel results page, the official pages for the final 6 hotels, and at least 2 map or photo views showing neighborhood tradeoffs.","description":"The most useful evidence tabs are left open at the end: the Google hotel results page, the official pages for the final 6 hotels, and at least 2 map or photo views showing neighborhood tradeoffs.\n\nHow a grader verifies this: Confirm the retained browser state includes the Google hotel search/results page, 6 official hotel pages corresponding to the finalists, and at least 2 map/photo tabs relevant to comparing neighborhoods.","max_points":80}]}} +{"task_id":"17f506970491ac59150ad919a8cc9fbefce52ff4","confirmed_task":"I’ve been wanting to rebuild my social life and meet new people around Glendale, California, but I don’t want just a couple of random Meetup links. Please do a real browser-based sweep of recurring social options that someone could actually use over the next month or two, centered on Glendale and extending to nearby areas that are still practical like Burbank, Pasadena, Eagle Rock, Atwater Village, Silver Lake, or Los Feliz if needed. Find at least 12 viable ways to meet people, and make sure they come from at least 4 different public sources or categories such as Meetup groups, volunteer opportunities, community classes, hobby clubs, running groups, language exchanges, book clubs, board-game nights, or public event series. 
For each option, capture the name, what kind of people it seems aimed at, the neighborhood, whether it looks recurring or one-off, the next visible date or schedule if shown, the price or 'not shown,' and one short note on why it seems good for meeting new people rather than just attending passively. Open and compare the actual public pages, and also use maps/photos when helpful so you can sanity-check whether the locations feel practical and active. Then narrow it to the 6 strongest options for someone new in the area who wants genuine repeat social contact, variety, and a reasonable budget, and build a first-month plan using exactly 4 picks that together give a good mix of social styles and neighborhoods without requiring a ridiculous amount of driving. Keep the most useful evidence tabs open, including at least 6 final option pages and at least 2 map or venue pages, and finish with a concise recommendation on which single option looks best for easiest first-time socializing versus which one looks best for building longer-term community.","website":"https://www.meetup.com","level":"hard","reference_length":18,"precomputed_rubric":{"items":[{"criterion":"At least 12 viable ways to meet people are identified, centered on Glendale and only extending to nearby practical areas named in the prompt when needed.","description":"At least 12 viable ways to meet people are identified, centered on Glendale and only extending to nearby practical areas named in the prompt when needed.\n\nHow a grader verifies this: Count the final options listed and confirm they are social options in Glendale or the specified nearby neighborhoods.","max_points":180},{"criterion":"The 12 options come from at least 4 different public sources or categories named in the prompt, such as Meetup groups, volunteer opportunities, community classes, hobby clubs, running groups, language exchanges, book clubs, board-game nights, or public event series.","description":"The 12 options come from at least 4 
different public sources or categories named in the prompt, such as Meetup groups, volunteer opportunities, community classes, hobby clubs, running groups, language exchanges, book clubs, board-game nights, or public event series.\n\nHow a grader verifies this: Check that the final set spans at least 4 distinct source/category types explicitly represented in the results.","max_points":160},{"criterion":"For each option, the output includes the name, audience or social fit, neighborhood, whether it appears recurring or one-off, the next visible date or schedule if shown, the price or 'not shown,' and a note on why it seems good for meeting new people.","description":"For each option, the output includes the name, audience or social fit, neighborhood, whether it appears recurring or one-off, the next visible date or schedule if shown, the price or 'not shown,' and a note on why it seems good for meeting new people.\n\nHow a grader verifies this: Inspect each listed option for all required fields and allow 'not shown' only where the public page does not display the information.","max_points":200},{"criterion":"The browsing session uses actual public pages and map/photo sanity checks where helpful to judge practicality and activity.","description":"The browsing session uses actual public pages and map/photo sanity checks where helpful to judge practicality and activity.\n\nHow a grader verifies this: Confirm that evidence tabs/pages were opened for the options and that at least some location judgments are supported by map or venue/photo pages rather than unsupported claims.","max_points":140},{"criterion":"The final synthesis narrows the research to the 6 strongest options for someone new in the area who wants repeat social contact, variety, and a reasonable budget.","description":"The final synthesis narrows the research to the 6 strongest options for someone new in the area who wants repeat social contact, variety, and a reasonable budget.\n\nHow a grader verifies 
this: Check that exactly 6 finalists are selected and that the reasoning clearly uses the stated criteria of repeat contact, variety, and budget.","max_points":120},{"criterion":"A first-month plan is built using exactly 4 picks that together provide a mix of social styles and neighborhoods without excessive driving.","description":"A first-month plan is built using exactly 4 picks that together provide a mix of social styles and neighborhoods without excessive driving.\n\nHow a grader verifies this: Confirm that exactly 4 options are chosen for the plan and that the writeup explains the mix of styles/neighborhoods and driving practicality.","max_points":100},{"criterion":"The most useful evidence tabs are left open, including at least 6 final option pages and at least 2 map or venue pages, and the final recommendation distinguishes the best option for easiest first-time socializing from the best option for longer-term community.","description":"The most useful evidence tabs are left open, including at least 6 final option pages and at least 2 map or venue pages, and the final recommendation distinguishes the best option for easiest first-time socializing from the best option for longer-term community.\n\nHow a grader verifies this: Check the open tabs against the minimum counts and confirm the final recommendation explicitly names both the easiest first-time option and the best longer-term community option.","max_points":100}]}} +{"task_id":"437482860f2bd9d99a668952d4516a9d2c2c5878","confirmed_task":"I want to find a real board game community in Los Angeles that I could actually start showing up to regularly, not just grab two random Meetup links. 
Please use public pages to build me a serious shortlist of at least 12 recurring board game groups, events, or game-night venues across the broader Los Angeles area, and pull them from at least 3 different source types where possible, such as Meetup, board game cafes or store calendars, Eventbrite, library or community-center listings, or other public event pages. For each option, record the name, the group or event page, the neighborhood or city area, what kind of play it seems to focus on, whether it looks beginner-friendly or more hardcore, whether there is any obvious cost or purchase requirement, and whether there is visible evidence that it is still active or recently updated; if a field is not shown, say not shown. I also want you to compare the options by region so I can tell whether the best fits are on the Westside, Eastside, central LA/Hollywood, South Bay, San Gabriel Valley, or the Valley, and open map or venue pages for the strongest candidates so I can visually sanity-check the locations. Then narrow the list to the 6 best choices for someone trying to meet people and actually play regularly, making sure the final 6 are not all the same type of scene. For those 6, keep the most useful group or event pages open, plus maps or venue pages where helpful, and tell me why each one made the cut. 
Finally, give me a practical starter plan: recommend the best 3 options for a newcomer, the best 2 for more serious hobby gaming, and the best 1 low-pressure social option, with a suggested order to try them over a future month so I can test different scenes without committing too quickly.","website":"https://www.meetup.com","level":"hard","reference_length":14,"precomputed_rubric":{"items":[{"criterion":"At least 12 recurring Los Angeles-area board game groups, events, or game-night venues are identified from public pages, using at least 3 different source types where possible.","description":"At least 12 recurring Los Angeles-area board game groups, events, or game-night venues are identified from public pages, using at least 3 different source types where possible.\n\nHow a grader verifies this: Count the final shortlist entries and confirm source diversity from the recorded pages.","max_points":180},{"criterion":"Each of the at least 12 options includes the requested recorded details: name, group or event page, neighborhood or city area, apparent play style, beginner-friendly versus more hardcore impression, any obvious cost or purchase requirement, and visible evidence of recent activity; missing fields are marked not shown.","description":"Each of the at least 12 options includes the requested recorded details: name, group or event page, neighborhood or city area, apparent play style, beginner-friendly versus more hardcore impression, any obvious cost or purchase requirement, and visible evidence of recent activity; missing fields are marked not shown.\n\nHow a grader verifies this: Check each option entry for all required fields and that missing information is explicitly labeled not shown.","max_points":220},{"criterion":"The options are compared by region so the user can distinguish candidates across the Westside, Eastside, central LA/Hollywood, South Bay, San Gabriel Valley, and the Valley.","description":"The options are compared by region so the user can 
distinguish candidates across the Westside, Eastside, central LA/Hollywood, South Bay, San Gabriel Valley, and the Valley.\n\nHow a grader verifies this: Review the synthesis and confirm that regional grouping or comparison is present and tied to the identified options.","max_points":130},{"criterion":"Map or venue pages are opened for the strongest candidates so the user can visually sanity-check locations, and the most useful candidate pages remain open.","description":"Map or venue pages are opened for the strongest candidates so the user can visually sanity-check locations, and the most useful candidate pages remain open.\n\nHow a grader verifies this: Inspect the open tabs to confirm candidate group/event pages plus supporting map or venue pages are left open for the strongest options.","max_points":140},{"criterion":"The list is narrowed to exactly 6 best choices for someone trying to meet people and play regularly, and those 6 are not all the same type of scene.","description":"The list is narrowed to exactly 6 best choices for someone trying to meet people and play regularly, and those 6 are not all the same type of scene.\n\nHow a grader verifies this: Confirm there are exactly 6 finalists and that the final explanations reflect scene diversity rather than one repetitive category.","max_points":140},{"criterion":"For the final 6, the response explains why each one made the cut and keeps the most useful group or event pages open, plus maps or venue pages where helpful.","description":"For the final 6, the response explains why each one made the cut and keeps the most useful group or event pages open, plus maps or venue pages where helpful.\n\nHow a grader verifies this: Check that each finalist has an explicit rationale and corresponding useful evidence tabs remain open.","max_points":90},{"criterion":"A practical starter plan is provided with the best 3 newcomer options, the best 2 more serious hobby-gaming options, the best 1 low-pressure social option, 
and a suggested order to try them over a future month.","description":"A practical starter plan is provided with the best 3 newcomer options, the best 2 more serious hobby-gaming options, the best 1 low-pressure social option, and a suggested order to try them over a future month.\n\nHow a grader verifies this: Review the final recommendation section for the exact 3-2-1 breakdown and the month-long suggested trial order.","max_points":100}]}} +{"task_id":"545a1b36d5ada637dea9e0213f0a8c54c31c29dd","confirmed_task":"I’m trying to buy a pair of black calf-high dress boots that actually look sleek on the leg instead of loose or slouchy, and I wear women’s size 5.5, so I want a real browser-based shopping sweep rather than one lucky guess. Please start with DSW since that’s where I began looking, but expand across other major public retail sites if needed and build me a shortlist of exactly 12 viable options. Every candidate should be black, calf-high or very close to calf-high, have a thick or block-style heel, and show women’s size 5.5 or a clear equivalent; if shaft height, calf circumference, or width details are missing, just write “not shown” instead of guessing. As you review each product page, use the photos, description, measurements, and customer-review evidence to judge whether the boot looks form-fitting through the calf rather than wide, slouchy, or obviously gappy. For all 12 options, capture the product name, retailer, current price, heel type and height if shown, shaft height if shown, calf circumference if shown, material, size-5.5 availability status, and a short fit-confidence note. Then compare the retailer’s return-policy basics and any obvious shipping constraints because fit is the biggest risk here. After that, narrow the 12 down to the best 4 finalists: one best overall, one best value, one most form-fitting-looking, and one safest-to-try because of returns. 
Keep the product tabs open for those final 4, and also keep open the most relevant return-policy pages for the retailers behind them so I can sanity-check the fit risk later. Finish with a concise decision memo in the chat that lists all 12 candidates and clearly tells me which of the 4 I should try first and why.","website":"https://www.dsw.com","level":"hard","reference_length":4,"precomputed_rubric":{"items":[{"criterion":"A shortlist of exactly 12 viable boots is produced, and each candidate matches the requested style constraints: black, calf-high or very close to calf-high, thick or block-style heel, and women’s size 5.5 or a clear equivalent.","description":"A shortlist of exactly 12 viable boots is produced, and each candidate matches the requested style constraints: black, calf-high or very close to calf-high, thick or block-style heel, and women’s size 5.5 or a clear equivalent.\n\nHow a grader verifies this: Check the final comparison for 12 total entries and confirm each entry explicitly records the requested style/size fit rather than drifting into unrelated boot types.","max_points":200},{"criterion":"Each of the 12 candidates includes the requested product details: product name, retailer, current price, heel type and height if shown, shaft height if shown, calf circumference if shown, material, size-5.5 availability status, and a short fit-confidence note, using 'not shown' where needed.","description":"Each of the 12 candidates includes the requested product details: product name, retailer, current price, heel type and height if shown, shaft height if shown, calf circumference if shown, material, size-5.5 availability status, and a short fit-confidence note, using 'not shown' where needed.\n\nHow a grader verifies this: Review the final memo and confirm all listed fields appear for every one of the 12 options, with missing measurements marked as 'not shown' instead of omitted or guessed.","max_points":200},{"criterion":"The comparison uses 
product-page evidence to judge whether each boot looks form-fitting through the calf rather than wide, slouchy, or obviously gappy.","description":"The comparison uses product-page evidence to judge whether each boot looks form-fitting through the calf rather than wide, slouchy, or obviously gappy.\n\nHow a grader verifies this: Check that each candidate has a fit-confidence note grounded in photos, descriptions, measurements, or customer-review evidence from the product pages.","max_points":160},{"criterion":"Return-policy basics and any obvious shipping constraints are compared for the retailers involved, since fit risk is part of the decision.","description":"Return-policy basics and any obvious shipping constraints are compared for the retailers involved, since fit risk is part of the decision.\n\nHow a grader verifies this: Confirm the final memo includes return-policy basics and shipping notes for the relevant retailers, rather than only product-level attributes.","max_points":140},{"criterion":"The 12 options are narrowed to exactly 4 finalists labeled as best overall, best value, most form-fitting-looking, and safest-to-try because of returns.","description":"The 12 options are narrowed to exactly 4 finalists labeled as best overall, best value, most form-fitting-looking, and safest-to-try because of returns.\n\nHow a grader verifies this: Check that the final recommendation section names exactly 4 finalists and assigns each one to the explicit category requested in the prompt.","max_points":160},{"criterion":"Browser evidence is preserved by keeping open the product tabs for the final 4 boots and the most relevant retailer return-policy pages tied to those finalists.","description":"Browser evidence is preserved by keeping open the product tabs for the final 4 boots and the most relevant retailer return-policy pages tied to those finalists.\n\nHow a grader verifies this: Inspect the open tabs at the end to confirm the 4 finalist product pages remain open 
along with the corresponding return-policy pages the prompt asked to keep visible.","max_points":140}]}} +{"task_id":"59c9d2da1b99ec2e643aadf7a74d31998cda2376","confirmed_task":"I’m trying to figure out whether I should focus my post-grad job search on New York City for film and adjacent media work, and I want a real browser-based sweep instead of just a few random listings. Start with LinkedIn, but don’t stay stuck there if better public pages exist. I want you to find exactly 18 current NYC-area opportunities that are genuinely entry-level, internship, assistant, coordinator, production, post-production, development, distribution, exhibition, or other clearly film-related early-career roles. Use a mix of public sources such as LinkedIn job pages, company careers pages, entertainment job boards, and studio or production-company hiring pages, and make sure at least 6 of the 18 come from sources other than LinkedIn so this is not just one-site sampling. For each opportunity, record the title, company, borough or city area if shown, whether it is internship or full-time, whether it looks remote, hybrid, or on-site, salary or pay if shown, and the key qualifications or software/tools mentioned; if any field is missing, mark it as not shown. As you work, open and keep visible the strongest evidence tabs for at least 8 of the listings, including a mix of source types and at least 3 actual employer career pages, so I can sanity-check that the openings are real. After collecting the 18 roles, compare them and identify the 6 best fits for a recent graduate with limited professional experience but serious interest in film, and briefly explain why those 6 rise to the top based on accessibility, relevance, and transparency of requirements. 
Then give me a short synthesis of what this search suggests about the NYC entry-level film market right now: which role types appear most often, whether paid opportunities seem common enough, and whether the market looks broad enough that I should concentrate my search there for the next few weeks. Leave the final shortlist and the most useful evidence tabs open. Make a spreadsheet with all the listings and the links to apply.","website":"https://www.linkedin.com","level":"hard","reference_length":4,"precomputed_rubric":{"items":[{"criterion":"Exactly 18 current NYC-area opportunities are collected, and they are genuinely entry-level, internship, assistant, coordinator, production, post-production, development, distribution, exhibition, or otherwise clearly film-related early-career roles.","description":"Exactly 18 current NYC-area opportunities are collected, and they are genuinely entry-level, internship, assistant, coordinator, production, post-production, development, distribution, exhibition, or otherwise clearly film-related early-career roles.\n\nHow a grader verifies this: Check the final shortlist count is exactly 18 and confirm each entry fits the role-scope and NYC-area constraints stated in the prompt.","max_points":200},{"criterion":"The 18 opportunities are sourced from a mix of public pages, with at least 6 coming from sources other than LinkedIn.","description":"The 18 opportunities are sourced from a mix of public pages, with at least 6 coming from sources other than LinkedIn.\n\nHow a grader verifies this: Review the source labels or open tabs for all 18 entries and confirm that at least 6 are from non-LinkedIn public sources such as employer career pages or other entertainment/job boards.","max_points":150},{"criterion":"Each of the 18 opportunities includes the required recorded fields: title, company, borough or city area if shown, internship or full-time status, remote/hybrid/on-site status if shown, salary or pay if shown, and key 
qualifications or software/tools mentioned, using 'not shown' where needed.","description":"Each of the 18 opportunities includes the required recorded fields: title, company, borough or city area if shown, internship or full-time status, remote/hybrid/on-site status if shown, salary or pay if shown, and key qualifications or software/tools mentioned, using 'not shown' where needed.\n\nHow a grader verifies this: Inspect the final compiled results and confirm every listing contains all requested fields or explicitly says 'not shown' for missing information.","max_points":200},{"criterion":"Strong browser evidence is preserved by keeping visible tabs for at least 8 listings, with a mix of source types and at least 3 actual employer career pages.","description":"Strong browser evidence is preserved by keeping visible tabs for at least 8 listings, with a mix of source types and at least 3 actual employer career pages.\n\nHow a grader verifies this: Count the kept-open evidence tabs and confirm there are at least 8 relevant listing pages, including at least 3 employer career pages and more than one source type.","max_points":150},{"criterion":"A prioritized set of the 6 best fits for a recent graduate with limited professional experience but strong film interest is identified and each of the 6 includes a brief explanation of why it rises to the top.","description":"A prioritized set of the 6 best fits for a recent graduate with limited professional experience but strong film interest is identified and each of the 6 includes a brief explanation of why it rises to the top.\n\nHow a grader verifies this: Check that exactly 6 opportunities are highlighted as best fits and that each has a justification tied to accessibility, relevance, and/or transparency of requirements.","max_points":150},{"criterion":"A short synthesis is provided about what the search suggests regarding the NYC entry-level film market, including which role types appear most often, whether paid 
opportunities seem common enough, and whether the market looks broad enough to justify concentrating the search there for the next few weeks, and a spreadsheet is also created with all listings and the links to apply.","description":"A short synthesis is provided about what the search suggests regarding the NYC entry-level film market, including which role types appear most often, whether paid opportunities seem common enough, and whether the market looks broad enough to justify concentrating the search there for the next few weeks, and a spreadsheet is also created with all listings and the links to apply.\n\nHow a grader verifies this: Review the final summary and confirm it addresses all three requested synthesis points based on the collected openings; the final deliverables include a spreadsheet containing all listings and application links.","max_points":150}]}} +{"task_id":"41a741b7b748dece0a069a7f1b1f9279902c4000","confirmed_task":"I’m trying to put together a genuinely usable snack plan for a 30-person film set, and I don’t want just one random cart from one store. Please use public retail pages to build and compare 3 complete snack bundles that could each cover 30 people for one shoot day while keeping the total under $100 before tax. Start with Sam’s Club since that was my first idea, but expand naturally to 3 to 5 public retailers if that gives a better answer. Each bundle should include at least 6 total products and should cover at minimum: 2 sweet options, 2 savory options, 1 fruit-based option, and 1 more filling option like granola bars, trail mix, jerky, or another shelf-stable snack; if a field is unclear on the page, write not shown. Please favor snacks that are individually wrapped or otherwise easy to portion, reasonably low-mess for a set, and shelf-stable enough that they don’t depend on refrigeration. 
As you compare options, open the actual product pages for the strongest candidates and keep the most useful evidence tabs open, especially the final products used in the winning bundle plus at least one alternative bundle from a different retailer. In the end, give me one recommended final bundle with item names, retailer, package size, quantity to buy, estimated servings or pieces, per-item price, total price, and a short note on why it works for a film set better than the other two bundles.","website":"https://www.samsclub.com","level":"hard","reference_length":29,"precomputed_rubric":{"items":[{"criterion":"Three complete snack bundles are produced, and each bundle is designed to cover 30 people for one shoot day while staying under $100 before tax.","description":"Three complete snack bundles are produced, and each bundle is designed to cover 30 people for one shoot day while staying under $100 before tax.\n\nHow a grader verifies this: Check that exactly 3 bundles are presented, each explicitly states coverage for 30 people and includes a calculated total under $100 before tax.","max_points":200},{"criterion":"The browsing starts from Sam’s Club and expands to a total of 3 to 5 public retailers when useful for comparison.","description":"The browsing starts from Sam’s Club and expands to a total of 3 to 5 public retailers when useful for comparison.\n\nHow a grader verifies this: Check that Sam’s Club is included and that products are drawn from at least 3 and no more than 5 public retail sites overall.","max_points":140},{"criterion":"Each bundle includes at least 6 total products and covers all requested snack categories: 2 sweet options, 2 savory options, 1 fruit-based option, and 1 more filling option.","description":"Each bundle includes at least 6 total products and covers all requested snack categories: 2 sweet options, 2 savory options, 1 fruit-based option, and 1 more filling option.\n\nHow a grader verifies this: For each bundle, count the listed 
products and verify that all four category requirements are explicitly satisfied.","max_points":200},{"criterion":"For every selected product, the result includes item name, retailer, package size, quantity to buy, estimated servings or pieces, per-item price, total price, and uses 'not shown' when the page does not provide a requested field.","description":"For every selected product, the result includes item name, retailer, package size, quantity to buy, estimated servings or pieces, per-item price, total price, and uses 'not shown' when the page does not provide a requested field.\n\nHow a grader verifies this: Check each product line in the bundles for all requested fields and confirm that missing information is marked 'not shown' rather than omitted.","max_points":160},{"criterion":"The recommendations explicitly favor snacks that are individually wrapped or easy to portion, low-mess for a film set, and shelf-stable enough not to depend on refrigeration.","description":"The recommendations explicitly favor snacks that are individually wrapped or easy to portion, low-mess for a film set, and shelf-stable enough not to depend on refrigeration.\n\nHow a grader verifies this: Check that each bundle or the final comparison includes practical notes addressing portioning, mess level, and shelf stability for set use.","max_points":120},{"criterion":"Actual product pages are opened for the strongest candidates, and the most useful evidence tabs are kept open, including the final products in the winning bundle and at least one alternative bundle from a different retailer.","description":"Actual product pages are opened for the strongest candidates, and the most useful evidence tabs are kept open, including the final products in the winning bundle and at least one alternative bundle from a different retailer.\n\nHow a grader verifies this: Confirm that browser evidence remains available for the chosen bundle’s products and for at least one competing bundle from another 
retailer.","max_points":100},{"criterion":"A single recommended final bundle is identified and briefly justified against the other two bundles as the best fit for a film set.","description":"A single recommended final bundle is identified and briefly justified against the other two bundles as the best fit for a film set.\n\nHow a grader verifies this: Check that one bundle is clearly named as the final recommendation and includes a short comparative explanation of why it works better than the other two.","max_points":80}]}} +{"task_id":"3fd9dfbf35247a1d4db1f88025028e95fb5aef88","confirmed_task":"I’m trying to choose a genuinely healthy meal delivery service for a future routine, and I don’t want a one-brand summary that leaves me guessing about the alternatives. Start with Hungryroot as one contender, but compare it against 7 other nationally available services that belong in the same decision set, for 8 total services. Use each service’s official public plan or pricing pages to capture what it actually offers: whether it’s meal kits, prepared meals, groceries, or a hybrid; the main dietary styles or nutrition angle; the starting price or visible price range; any shipping, membership, or delivery fees if shown; the minimum order structure if shown; and how ordering, skipping, or canceling appears to work. If a public page does not show one of those fields, record it as “not shown” instead of guessing. Then, for each of the 8 services, use at least one reputable public review source to sanity-check customer sentiment, but keep the official pages as the main evidence. I want a side-by-side comparison and a final recommendation for exactly 3 cases: best overall healthy option, best budget-conscious option, and best option for maximum flexibility/customization. 
Keep the official pricing or plan tabs open for Hungryroot plus the 3 finalists, and also leave open the review pages you relied on most so I can verify the reasoning myself.","website":"https://www.hungryroot.com","level":"hard","reference_length":5,"precomputed_rubric":{"items":[{"criterion":"The browsing session compares exactly 8 total services: Hungryroot plus 7 other nationally available services in the same healthy meal delivery decision set.","description":"The browsing session compares exactly 8 total services: Hungryroot plus 7 other nationally available services in the same healthy meal delivery decision set.\n\nHow a grader verifies this: Count the services included in the final comparison and confirm Hungryroot is one of the 8.","max_points":180},{"criterion":"For each of the 8 services, the comparison records from official public plan or pricing pages whether it is meal kits, prepared meals, groceries, or a hybrid; its main dietary styles or nutrition angle; its starting price or visible price range; any shipping, membership, or delivery fees if shown; the minimum order structure if shown; and how ordering, skipping, or canceling appears to work, using “not shown” where needed.","description":"For each of the 8 services, the comparison records from official public plan or pricing pages whether it is meal kits, prepared meals, groceries, or a hybrid; its main dietary styles or nutrition angle; its starting price or visible price range; any shipping, membership, or delivery fees if shown; the minimum order structure if shown; and how ordering, skipping, or canceling appears to work, using “not shown” where needed.\n\nHow a grader verifies this: Check each service entry for all requested fields and confirm the information is grounded in official public pages rather than guesses.","max_points":240},{"criterion":"Each of the 8 services is sanity-checked with at least one reputable public review source, while the official pages remain the primary 
evidence.","description":"Each of the 8 services is sanity-checked with at least one reputable public review source, while the official pages remain the primary evidence.\n\nHow a grader verifies this: Confirm that every service has at least one review source referenced and that the comparison still relies mainly on official plan or pricing pages for service details.","max_points":140},{"criterion":"The final synthesis provides a side-by-side comparison and gives exactly 3 recommendations: best overall healthy option, best budget-conscious option, and best option for maximum flexibility/customization.","description":"The final synthesis provides a side-by-side comparison and gives exactly 3 recommendations: best overall healthy option, best budget-conscious option, and best option for maximum flexibility/customization.\n\nHow a grader verifies this: Check that all 3 requested recommendation categories appear and that each winner is supported by comparative reasoning from the gathered evidence.","max_points":200},{"criterion":"The browser is left with the official pricing or plan tabs open for Hungryroot and the 3 finalists.","description":"The browser is left with the official pricing or plan tabs open for Hungryroot and the 3 finalists.\n\nHow a grader verifies this: Inspect the open tabs at the end and confirm that official pricing or plan pages remain open for Hungryroot plus the 3 finalist services.","max_points":120},{"criterion":"The browser is also left with open the review pages relied on most heavily for the recommendation reasoning.","description":"The browser is also left with open the review pages relied on most heavily for the recommendation reasoning.\n\nHow a grader verifies this: Inspect the final open tabs and confirm that the key review pages used in the analysis are still available for user verification.","max_points":120}]}} +{"task_id":"4fe5b02a8d0d55d5ad7111173e184d4b1c5d3697","confirmed_task":"I’m trying to choose the best basketball camp for 
my 12-year-old son for a future school break or summer, and I don’t want a one-page answer that only checks one brand. Please start with PGC Basketball because that’s already on my radar, but then broaden naturally into a real parent decision: use my location if it’s available, or the nearest major metro you can infer from the browser, and build a shortlist of exactly 10 age-appropriate basketball camp options that are realistically reachable for us, prioritizing day camps and drivable options first but including strong overnight camps if they look meaningfully better. Make sure at least 3 of the 10 options are PGC camps if age-eligible and reachable, and fill the rest with credible alternatives from other organizations. For each camp, verify from the official public page the location, dates or session window, age or grade fit for a 12-year-old, camp type, price if shown, and whether it looks more skills-focused, shooting-focused, or general development; if something is missing, mark it as not shown. Then compare public ratings or parent-review signals wherever they exist, using Google, Yelp, Facebook, or other public review pages, and keep the most useful evidence tabs open rather than closing everything after you check it. Also open maps for the strongest contenders so you can compare actual distance or drive time, and note any obvious parent-logistics issues like overnight-only format, limited dates, or registration pages that look sold out or unclear. At the end, give me one ranked top-5 recommendation list pulled from the 10 camps, explain which single camp is the best overall choice versus which one is the best close-to-home choice, and leave the final shortlist and the key camp, review, and map pages open so I can inspect them myself. 
Put all the options in a CryptoPad Document with detailed analysis and comparisons.","website":"https://pgcbasketball.com","level":"hard","reference_length":4,"precomputed_rubric":{"items":[{"criterion":"A shortlist of exactly 10 age-appropriate basketball camp options is produced, using the user's location if available or the nearest inferred major metro, with at least 3 of the 10 being reachable PGC camps when such PGC options are age-eligible.","description":"A shortlist of exactly 10 age-appropriate basketball camp options is produced, using the user's location if available or the nearest inferred major metro, with at least 3 of the 10 being reachable PGC camps when such PGC options are age-eligible.\n\nHow a grader verifies this: Check that the final output lists exactly 10 camps, that each is presented as reachable under the prompt's standard, and that at least 3 entries are PGC camps unless the browsing evidence explicitly shows fewer qualifying PGC options.","max_points":200},{"criterion":"Each of the 10 camps has the required official-page details recorded: location, dates or session window, age or grade fit for a 12-year-old, camp type, price if shown, and whether the camp appears skills-focused, shooting-focused, or general development, with 'not shown' used where needed.","description":"Each of the 10 camps has the required official-page details recorded: location, dates or session window, age or grade fit for a 12-year-old, camp type, price if shown, and whether the camp appears skills-focused, shooting-focused, or general development, with 'not shown' used where needed.\n\nHow a grader verifies this: Inspect the final shortlist and confirm that all required fields are present for every camp and that missing fields are marked 'not shown' rather than omitted.","max_points":200},{"criterion":"Public ratings or parent-review signals are compared for the camps wherever available, using public review pages such as Google, Yelp, Facebook, or similar 
sources.","description":"Public ratings or parent-review signals are compared for the camps wherever available, using public review pages such as Google, Yelp, Facebook, or similar sources.\n\nHow a grader verifies this: Confirm that review evidence or explicit 'not found/not shown' notes are included for the camps and that the comparison draws on public review pages rather than unsupported assertions.","max_points":150},{"criterion":"Maps are used for the strongest contenders so distance or drive time can be compared, and obvious parent-logistics issues such as overnight-only format, limited dates, or sold-out/unclear registration status are noted.","description":"Maps are used for the strongest contenders so distance or drive time can be compared, and obvious parent-logistics issues such as overnight-only format, limited dates, or sold-out/unclear registration status are noted.\n\nHow a grader verifies this: Check that map evidence is reflected for the leading camps and that the final comparison includes concrete logistics notes where the browsing surfaced them.","max_points":150},{"criterion":"The task ends with one ranked top-5 recommendation list drawn from the 10 camps, including a clear call on the single best overall camp and the single best close-to-home camp.","description":"The task ends with one ranked top-5 recommendation list drawn from the 10 camps, including a clear call on the single best overall camp and the single best close-to-home camp.\n\nHow a grader verifies this: Verify that the final answer contains a ranked top 5, that all 5 come from the 10-camp shortlist, and that it explicitly names both the best overall choice and the best close-to-home choice.","max_points":200},{"criterion":"Useful browser evidence is kept visible by leaving open the final shortlist plus the key camp, review, and map pages for the strongest options, and all 10 options are also captured in a CryptoPad Document with detailed analysis and 
comparisons.","description":"Useful browser evidence is kept visible by leaving open the final shortlist plus the key camp, review, and map pages for the strongest options, and all 10 options are also captured in a CryptoPad Document with detailed analysis and comparisons.\n\nHow a grader verifies this: Inspect the open tabs or browser state and confirm that the important camp pages, review evidence pages, and map pages for the leading contenders remain open at the end; an open CryptoPad Document contains the full camp list with detailed analysis and comparisons.","max_points":100}]}} +{"task_id":"08e2ad6afc624b6f759afa9950907f470b92f11b","confirmed_task":"I’m seriously considering getting a pet bird, but I don’t want generic blog advice or a random species quiz — I want a credible owner-education packet I could actually read before deciding what kind of bird is realistic for me. Please start with the Association of Avian Veterinarians bird-owner materials, then expand to at least 3 other reputable public sources such as veterinary, university, or established exotic-animal education sites. Build me one organized decision memo that covers exactly 12 commonly kept pet-bird groups, using species groups like budgies, cockatiels, conures, lovebirds, African greys, Amazons, macaws, cockatoos, canaries, finches, pigeons/doves, and one poultry or waterfowl companion-bird category if credible owner handouts exist; if a good species-specific resource is missing, write 'not shown' instead of guessing. For each of the 12 groups, find the best available owner handout or client-education page and record the handout title, source organization, whether it is downloadable, and the main care topics it actually covers. Also collect at least 8 general bird-owner resources across cross-cutting topics like diet, housing, behavior, signs of illness, veterinary care, household dangers, zoonotic or public-health issues, and emergency or disaster planning. 
As you work, keep the main AAV bird-owner page open plus at least 6 of the strongest individual handout or PDF tabs from multiple organizations so I can inspect the evidence myself. Then finish the memo with a practical synthesis: identify the 3 bird groups that seem best supported by credible beginner-friendly owner education, the 3 that appear most complex or least well covered, and the biggest coverage gaps or red flags a first-time owner should know before choosing a species. Leave the finished memo and the most useful evidence tabs open at the end.","website":"https://www.aav.org","level":"hard","reference_length":4,"precomputed_rubric":{"items":[{"criterion":"The final memo covers exactly 12 commonly kept pet-bird groups, and each group has either one best available owner handout/client-education page recorded or an explicit 'not shown' when no credible species-specific resource was found.","description":"The final memo covers exactly 12 commonly kept pet-bird groups, and each group has either one best available owner handout/client-education page recorded or an explicit 'not shown' when no credible species-specific resource was found.\n\nHow a grader verifies this: Check the memo for exactly 12 species-group entries and confirm each entry includes either a specific resource or the text 'not shown' rather than an invented substitute.","max_points":200},{"criterion":"For each of the 12 bird groups, the memo records the handout title, source organization, whether it is downloadable, and the main care topics the resource actually covers.","description":"For each of the 12 bird groups, the memo records the handout title, source organization, whether it is downloadable, and the main care topics the resource actually covers.\n\nHow a grader verifies this: Inspect each species-group entry for all four requested fields: title, organization, downloadable status, and coverage summary.","max_points":180},{"criterion":"The resource sweep starts from the Association of 
Avian Veterinarians and expands to at least 3 other reputable public sources, for at least 4 total source organizations represented in the memo.","description":"The resource sweep starts from the Association of Avian Veterinarians and expands to at least 3 other reputable public sources, for at least 4 total source organizations represented in the memo.\n\nHow a grader verifies this: Review the listed source organizations in the memo and confirm AAV is included and at least 3 additional reputable public organizations are also represented.","max_points":160},{"criterion":"The memo includes at least 8 general bird-owner resources covering cross-cutting topics such as diet, housing, behavior, signs of illness, veterinary care, household dangers, zoonotic/public-health issues, and emergency/disaster planning.","description":"The memo includes at least 8 general bird-owner resources covering cross-cutting topics such as diet, housing, behavior, signs of illness, veterinary care, household dangers, zoonotic/public-health issues, and emergency/disaster planning.\n\nHow a grader verifies this: Count the general-topic resources and confirm there are at least 8, with topic coverage spanning the requested cross-cutting care areas.","max_points":160},{"criterion":"Browser evidence is preserved by leaving open the main AAV bird-owner page plus at least 6 strong individual handout or PDF tabs from multiple organizations.","description":"Browser evidence is preserved by leaving open the main AAV bird-owner page plus at least 6 strong individual handout or PDF tabs from multiple organizations.\n\nHow a grader verifies this: Inspect open tabs at the end and confirm the AAV overview page remains open along with at least 6 individual handout/PDF pages from more than one organization.","max_points":120},{"criterion":"The memo ends with a practical synthesis naming the 3 bird groups best supported by credible beginner-friendly owner education and the 3 that appear most complex or least 
well covered, based only on the materials found.","description":"The memo ends with a practical synthesis naming the 3 bird groups best supported by credible beginner-friendly owner education and the 3 that appear most complex or least well covered, based only on the materials found.\n\nHow a grader verifies this: Check the final synthesis section for exactly 3 beginner-supported groups and exactly 3 complex/least-covered groups, with reasoning tied to the gathered materials.","max_points":100},{"criterion":"The memo identifies the biggest coverage gaps or first-time-owner red flags that emerged from the resource search rather than just listing handouts.","description":"The memo identifies the biggest coverage gaps or first-time-owner red flags that emerged from the resource search rather than just listing handouts.\n\nHow a grader verifies this: Review the memo for an explicit gaps/red-flags section or equivalent notes showing what key information was missing, thin, inconsistent, or especially cautionary for new owners.","max_points":80}]}} +{"task_id":"23081b41564a070a7e5a286b20be23d78e3e561c","confirmed_task":"I’m trying to get past the oversimplified version of Trump’s Venezuela policy and understand what the strategy actually was, how it evolved during his presidency, and whether it worked. Please do this as a serious browser-based research session using public pages, with priority on primary sources first: archived White House statements, State Department material, Treasury/OFAC sanctions pages, and any congressional or CRS background you can find, then add a few reputable policy-analysis sources like CFR, Brookings, CSIS, or similar to help interpret the record. I want you to reconstruct the policy in a way I could actually use for studying or debate, not just give me a paragraph. 
Build a dated timeline with at least 12 concrete events or policy moves spanning the full presidency, and make sure it covers sanctions, recognition of Juan Guaidó, diplomatic pressure, any military-threat rhetoric, humanitarian or oil-policy adjustments if shown, and notable responses from Maduro’s government or international actors. For each timeline item, note the date, what happened, which policy tool it represents, and whether the evidence is primary-source or analysis-based. Then compare at least 4 different expert assessments of whether the strategy succeeded, partially succeeded, or failed, and pull out where they agree versus disagree. I also want a short section separating stated goals from likely implied goals, as an inference only where the sources support it. Keep key evidence tabs open for the most important primary documents and at least 3 of the strongest analysis pages so I can inspect them afterward. Finish with one organized briefing memo that explains the strategy in plain English, identifies the major phases, and gives a bottom-line judgment on what Trump’s Venezuela strategy was and how effective it appears to have been. 
Write this memo in a digestible manner in CryptoPad Documents.","website":"https://chatgpt.com","level":"hard","reference_length":3,"precomputed_rubric":{"items":[{"criterion":"The final briefing memo reconstructs Trump’s Venezuela policy as a strategy rather than a loose summary, and explains in plain English what the administration was trying to do, how the strategy evolved, and the bottom-line judgment on effectiveness.","description":"The final briefing memo reconstructs Trump’s Venezuela policy as a strategy rather than a loose summary, and explains in plain English what the administration was trying to do, how the strategy evolved, and the bottom-line judgment on effectiveness.\n\nHow a grader verifies this: Check that the final memo includes a strategy explanation, major phases over time, and a clear concluding judgment on what the strategy was and how effective it appears to have been.","max_points":200},{"criterion":"A dated timeline with at least 12 concrete events or policy moves is produced, spanning the presidency and covering sanctions, recognition of Juan Guaidó, diplomatic pressure, military-threat rhetoric, humanitarian or oil-policy adjustments if shown, and notable responses from Maduro’s government or international actors.","description":"A dated timeline with at least 12 concrete events or policy moves is produced, spanning the presidency and covering sanctions, recognition of Juan Guaidó, diplomatic pressure, military-threat rhetoric, humanitarian or oil-policy adjustments if shown, and notable responses from Maduro’s government or international actors.\n\nHow a grader verifies this: Count at least 12 dated entries and confirm the required topic areas are represented in the timeline where the public evidence shows them.","max_points":200},{"criterion":"For each timeline item, the memo records the date, what happened, which policy tool it represents, and whether the supporting evidence is primary-source or analysis-based.","description":"For
each timeline item, the memo records the date, what happened, which policy tool it represents, and whether the supporting evidence is primary-source or analysis-based.\n\nHow a grader verifies this: Inspect the timeline format and confirm each entry includes all four requested fields: date, event description, policy-tool classification, and evidence type.","max_points":150},{"criterion":"The research prioritizes primary sources, specifically using public pages such as archived White House statements, State Department material, Treasury or OFAC sanctions pages, and congressional or CRS background before adding outside analysis.","description":"The research prioritizes primary sources, specifically using public pages such as archived White House statements, State Department material, Treasury or OFAC sanctions pages, and congressional or CRS background before adding outside analysis.\n\nHow a grader verifies this: Review the sources cited or referenced in the memo and confirm the presence of the requested primary-source categories with primary materials clearly used as the main evidentiary base.","max_points":150},{"criterion":"The memo compares at least 4 different expert assessments of whether the strategy succeeded, partially succeeded, or failed, and explicitly identifies where those assessments agree and disagree.","description":"The memo compares at least 4 different expert assessments of whether the strategy succeeded, partially succeeded, or failed, and explicitly identifies where those assessments agree and disagree.\n\nHow a grader verifies this: Count at least 4 expert or institutional assessments and confirm the memo summarizes both consensus points and disagreements among them.","max_points":120},{"criterion":"The memo includes a separate section distinguishing stated goals from likely implied goals, with any implied-goals claims presented as inference only where supported by the sources.","description":"The memo includes a separate section 
distinguishing stated goals from likely implied goals, with any implied-goals claims presented as inference only where supported by the sources.\n\nHow a grader verifies this: Check for a dedicated stated-vs-implied-goals section and confirm that inferred goals are labeled as inference rather than asserted as direct fact.","max_points":80},{"criterion":"Key evidence tabs are left open for the most important primary documents and at least 3 of the strongest analysis pages so the user can inspect the evidence afterward, and the briefing memo is written in CryptoPad Documents.","description":"Key evidence tabs are left open for the most important primary documents and at least 3 of the strongest analysis pages so the user can inspect the evidence afterward, and the briefing memo is written in CryptoPad Documents.\n\nHow a grader verifies this: Confirm that major primary-source pages remain open and that at least 3 analysis tabs from strong policy sources are also left open at the end; the final deliverable includes an open CryptoPad Document containing the briefing memo.","max_points":100}]}} +{"task_id":"adc644f33f82d4454d84a983211494eb887074cb","confirmed_task":"I’m trying to choose the best-value hotel for a 2-night future stay near Walt Disney World, and I don’t want a single listing lookup or a vague top-10 blog list. Please start from a major booking site like Hotels.com or similar and build a serious comparison of exactly 12 hotel options that are plausibly within about 15 minutes’ drive of Walt Disney World for the same 2-night stay window. For each hotel, capture the nightly price or total stay price shown, guest rating, review count if shown, parking fee, resort fee, cancellation policy if visible, and mark anything that is not shown as “not shown.” Then cross-check each candidate in Google Maps so the distance/time-to-Disney claim is realistic, and open the map or directions view for the strongest contenders. 
I also want you to open the actual listing pages for at least 6 of the 12 hotels and compare the room photos and amenity details so we can weed out places that are cheap but clearly worse in quality or location. After that, narrow the list to the best 5 options balancing low price and strong ratings, and give me a final recommendation for three categories: cheapest acceptable pick, best overall value, and nicest option that still seems reasonably priced. Keep the most useful hotel listing tabs and map tabs open at the end so I can review the finalists myself. Can you also separately provide the transit options for getting from each stay to the park, and open the directions for each in Google Maps?","website":"https://www.hotels.com","level":"hard","reference_length":3,"precomputed_rubric":{"items":[{"criterion":"The browsing session identifies exactly 12 hotel options for the same 2-night future stay window, each plausibly within about 15 minutes’ drive of Walt Disney World.","description":"The browsing session identifies exactly 12 hotel options for the same 2-night future stay window, each plausibly within about 15 minutes’ drive of Walt Disney World.\n\nHow a grader verifies this: The final comparison includes 12 distinct hotels and each entry includes a Disney drive-time or map-based plausibility check.","max_points":160},{"criterion":"For each of the 12 hotels, the comparison records the visible booking details requested: nightly price or total stay price, guest rating, review count if shown, parking fee, resort fee, cancellation policy if visible, and 'not shown' where the page does not provide a field.","description":"For each of the 12 hotels, the comparison records the visible booking details requested: nightly price or total stay price, guest rating, review count if shown, parking fee, resort fee, cancellation policy if visible, and 'not shown' where the page does not provide a field.\n\nHow a grader verifies this: Each hotel entry contains all requested 
fields with no silent omissions; missing fields are explicitly labeled 'not shown'.","max_points":200},{"criterion":"Each candidate is cross-checked in Google Maps so the time-to-Disney claim is realistic, and map or directions views are opened for the strongest contenders.","description":"Each candidate is cross-checked in Google Maps so the time-to-Disney claim is realistic, and map or directions views are opened for the strongest contenders.\n\nHow a grader verifies this: There is browser evidence of Google Maps or directions views for the contenders, and the final write-up reflects those checks.","max_points":120},{"criterion":"The actual listing pages for at least 6 of the 12 hotels are opened and compared using room photos and amenity details to filter out low-quality or poorly located options.","description":"The actual listing pages for at least 6 of the 12 hotels are opened and compared using room photos and amenity details to filter out low-quality or poorly located options.\n\nHow a grader verifies this: At least 6 hotel listing pages are visibly used as evidence, and the shortlist discussion cites photo or amenity-based quality judgments from those pages.","max_points":140},{"criterion":"The research narrows the 12 hotels down to the best 5 options that balance low price and strong ratings.","description":"The research narrows the 12 hotels down to the best 5 options that balance low price and strong ratings.\n\nHow a grader verifies this: A final ranked or clearly identified shortlist of 5 hotels is produced, with reasoning that uses both price and rating considerations.","max_points":140},{"criterion":"A final recommendation is given for exactly three categories: cheapest acceptable pick, best overall value, and nicest option that still seems reasonably priced, and the most useful hotel listing tabs and map tabs are left open.","description":"A final recommendation is given for exactly three categories: cheapest acceptable pick, best overall value, 
and nicest option that still seems reasonably priced, and the most useful hotel listing tabs and map tabs are left open.\n\nHow a grader verifies this: The final output names one hotel for each of the three requested categories and leaves relevant finalist listing pages plus map tabs open for user review.","max_points":120},{"criterion":"Transit options from each finalist stay to the park are separately provided, and Google Maps directions are opened for those finalist stays.","description":"Transit options from each finalist stay to the park are separately provided, and Google Maps directions are opened for those finalist stays.\n\nHow a grader verifies this: The final deliverable includes transit notes for each finalist hotel and the browser keeps open Google Maps directions for those finalists.","max_points":120}]}} +{"task_id":"3d5fe15ca128986861d570a1e43ee687f2264b57","confirmed_task":"I’m trying to decide whether a future Phoenix trip from Jacksonville is actually worth booking, and I don’t want just one fare snapshot. Please do a serious flight-shopping pass focused on Wednesday departures from JAX to PHX. Use a major flight search tool plus the operating airlines’ public booking pages to compare exactly 6 future Wednesday outbound dates spread across the next few months. For each of those 6 Wednesdays, check matching return options for exactly 3 trip lengths: 2 nights, 3 nights, and 4 nights. Prioritize nonstop flights when they’re publicly shown; if a nonstop is not shown for a given outbound or return, note that and use the cheapest reasonable one-stop fallback instead. For every one of the 18 round-trip combinations, record the airline, whether it is nonstop or one-stop, departure and arrival times, the lowest publicly shown bookable price, any basic-economy style restrictions that are clearly shown, carry-on and checked-bag costs if publicly shown, and whether the fare is refundable or changeable if that is shown. 
Then synthesize the results so I can see which Wednesday trip window is the cheapest true total once obvious bag fees are considered, and which option is the best overall if I care more about comfort and schedule than the absolute lowest fare. Keep at least 6 useful evidence tabs open at the end, including the strongest outbound fare results and at least 2 airline fare-detail pages that show the restrictions or fare breakdowns. If any field is not publicly shown, say not shown rather than guessing.","website":"https://www.hotels.com","level":"hard","reference_length":6,"precomputed_rubric":{"items":[{"criterion":"The browsing session compares exactly 6 future Wednesday outbound dates from JAX to PHX and, for each outbound date, checks exactly 3 return trip lengths: 2 nights, 3 nights, and 4 nights.","description":"The browsing session compares exactly 6 future Wednesday outbound dates from JAX to PHX and, for each outbound date, checks exactly 3 return trip lengths: 2 nights, 3 nights, and 4 nights.\n\nHow a grader verifies this: Final synthesis or notes explicitly cover 18 round-trip combinations formed from 6 Wednesday outbounds × 3 return lengths, with no missing or extra combinations.","max_points":180},{"criterion":"Each of the 18 round-trip combinations includes the airline, whether the itinerary is nonstop or one-stop, and the departure and arrival times.","description":"Each of the 18 round-trip combinations includes the airline, whether the itinerary is nonstop or one-stop, and the departure and arrival times.\n\nHow a grader verifies this: For every combination, the recorded result shows airline, stop pattern, and outbound/return timing details drawn from public fare pages.","max_points":170},{"criterion":"Each of the 18 round-trip combinations includes the lowest publicly shown bookable price, prioritizes nonstop when publicly shown, and otherwise notes that nonstop was not shown and uses the cheapest reasonable one-stop fallback.","description":"Each of 
the 18 round-trip combinations includes the lowest publicly shown bookable price, prioritizes nonstop when publicly shown, and otherwise notes that nonstop was not shown and uses the cheapest reasonable one-stop fallback.\n\nHow a grader verifies this: Results show a price for every combination and clearly indicate where nonstop was available versus where a one-stop fallback was used because nonstop was not shown.","max_points":200},{"criterion":"For each combination, the task records any basic-economy-style restrictions clearly shown, carry-on and checked-bag costs if publicly shown, and whether the fare is refundable or changeable if shown, using 'not shown' where needed.","description":"For each combination, the task records any basic-economy-style restrictions clearly shown, carry-on and checked-bag costs if publicly shown, and whether the fare is refundable or changeable if shown, using 'not shown' where needed.\n\nHow a grader verifies this: Each combination includes restriction/fee/flexibility fields populated with actual public-page details or the explicit text 'not shown' rather than guesses.","max_points":170},{"criterion":"The final synthesis identifies the single cheapest true-total trip window after considering obvious bag fees and also identifies the single best overall option for comfort/schedule, with the tradeoff explained.","description":"The final synthesis identifies the single cheapest true-total trip window after considering obvious bag fees and also identifies the single best overall option for comfort/schedule, with the tradeoff explained.\n\nHow a grader verifies this: There are two explicit recommendations—best cheapest-true-total option and best comfort/schedule option—with a short rationale comparing fare, fees, and timing tradeoffs.","max_points":160},{"criterion":"At least 6 useful evidence tabs are left open, including the strongest outbound fare results and at least 2 airline fare-detail pages showing restrictions or fare 
breakdowns.","description":"At least 6 useful evidence tabs are left open, including the strongest outbound fare results and at least 2 airline fare-detail pages showing restrictions or fare breakdowns.\n\nHow a grader verifies this: The final browser state retains 6 or more relevant public pages, and at least 2 of those are airline fare-detail pages rather than only metasearch results.","max_points":120}]}} +{"task_id":"fe1a5127a1329930e356744b7fd66a214592c630","confirmed_task":"I’m trying to choose the best NCES public-use datasets for a serious secondary analysis project in education, and I don’t just want a quick list of names. Please do a thorough browser-based sweep of NCES public-use data options across the main program areas and build me a usable comparison I could actually start from. Start on NCES and identify at least 10 distinct NCES datasets or data collections that are publicly usable, drawing from a mix of areas like longitudinal studies, postsecondary data, assessment data, adult skills, early childhood, and school or district data where applicable. For each one, open the official NCES dataset or program page and, when available, also open the documentation, codebook, survey, or download page that helps confirm what is actually in the public-use files. I want you to compare each dataset on the questions I’d really care about before choosing one: what population it covers, approximate years available, unit of analysis, whether it is cross-sectional or longitudinal, whether the files are clearly public-use or have restrictions, what topics or variables it seems strongest for, and what download or access format is offered; if something is not shown, say not shown. Then narrow the full scan to the 5 strongest dataset options for someone studying educational opportunity and student outcomes, and explain the tradeoffs among them rather than just ranking blindly. 
Put the results into one organized comparison sheet or document with one section for all datasets scanned and one final shortlist section for the top 5 picks. Keep the most useful official NCES pages open for the finalist datasets, including at least a few documentation or download pages, so I can verify the evidence myself afterward.","website":"https://nces.ed.gov","level":"hard","reference_length":14,"precomputed_rubric":{"items":[{"criterion":"An organized comparison sheet or document is produced with one section covering all datasets scanned and a separate final shortlist section for the top 5 picks.","description":"An organized comparison sheet or document is produced with one section covering all datasets scanned and a separate final shortlist section for the top 5 picks.\n\nHow a grader verifies this: Confirm that the final artifact exists and contains both an all-datasets section and a distinct finalist section with 5 shortlisted datasets.","max_points":160},{"criterion":"At least 10 distinct NCES datasets or data collections that are publicly usable are identified from a mix of relevant NCES program areas such as longitudinal studies, postsecondary data, assessment data, adult skills, early childhood, and school or district data where applicable.","description":"At least 10 distinct NCES datasets or data collections that are publicly usable are identified from a mix of relevant NCES program areas such as longitudinal studies, postsecondary data, assessment data, adult skills, early childhood, and school or district data where applicable.\n\nHow a grader verifies this: Count the datasets listed in the scan and check that they are distinct, NCES-related, publicly usable, and span multiple program areas rather than all coming from one narrow category.","max_points":180},{"criterion":"For each scanned dataset, the official NCES dataset or program page is opened, and when available, a documentation, codebook, survey, or download page is also opened to confirm 
the public-use contents.","description":"For each scanned dataset, the official NCES dataset or program page is opened, and when available, a documentation, codebook, survey, or download page is also opened to confirm the public-use contents.\n\nHow a grader verifies this: Review browser evidence to confirm dataset/program pages were opened for the scanned datasets and that supporting documentation-style pages were also opened when available.","max_points":160},{"criterion":"Each scanned dataset is compared on the requested decision fields: population covered, approximate years available, unit of analysis, whether it is cross-sectional or longitudinal, whether the files are clearly public-use or have restrictions, strongest topics or variables, and download or access format, using 'not shown' where needed.","description":"Each scanned dataset is compared on the requested decision fields: population covered, approximate years available, unit of analysis, whether it is cross-sectional or longitudinal, whether the files are clearly public-use or have restrictions, strongest topics or variables, and download or access format, using 'not shown' where needed.\n\nHow a grader verifies this: Inspect the comparison artifact and verify that every scanned dataset includes entries for all requested fields, with 'not shown' used instead of leaving gaps.","max_points":200},{"criterion":"The final shortlist narrows the scan to exactly 5 strongest dataset options for studying educational opportunity and student outcomes, with an explanation of the tradeoffs among them rather than a bare ranking.","description":"The final shortlist narrows the scan to exactly 5 strongest dataset options for studying educational opportunity and student outcomes, with an explanation of the tradeoffs among them rather than a bare ranking.\n\nHow a grader verifies this: Check that the shortlist contains exactly 5 datasets and that each is accompanied by comparative reasoning about strengths, 
weaknesses, or fit for the stated research focus.","max_points":180},{"criterion":"The most useful official NCES pages for the finalist datasets are left open at the end, including finalist dataset pages and at least a few documentation or download pages for verification.","description":"The most useful official NCES pages for the finalist datasets are left open at the end, including finalist dataset pages and at least a few documentation or download pages for verification.\n\nHow a grader verifies this: Confirm that relevant finalist tabs remain open and that they include both dataset/program pages and several documentation or download pages tied to the shortlisted datasets.","max_points":120}]}} +{"task_id":"42e2344431639ea57815c2c6b42b047cc7814176","confirmed_task":"I’m trying to figure out whether Nike Phantom low-top soccer cleats are actually a smart budget choice for ultimate frisbee, not just whether one listing happens to be cheap on one site. Please start with the Nike Phantom 6 Club FG/MG low-top men’s model and verify its current price on the source retailer plus at least 2 other public retailer or brand pages if available, noting any visible sale pricing and whether common men’s sizes are shown or marked not shown. Then build me a serious comparison against 7 other realistic cleat options that someone might actually wear for ultimate frisbee, for 8 total models altogether, with at least 3 brands represented and at least 4 models priced under about $80 if you can find them. For each model, check public product pages and capture the current listed price, surface or stud description, whether it looks better suited to firm ground, multi-ground, turf, or not clearly stated, whether it is low-top or mid/high-top if shown, weight if shown, and return policy basics from the seller or brand page if easy to verify, otherwise mark not shown. 
I also want you to spend time on the real-frisbee-use question: look at at least 4 public pages from ultimate frisbee communities, reviews, guides, or discussion threads about what people actually prefer in cleats, especially around soccer versus football/lacrosse cleats, traction, toe shape, and comfort for cutting, and use that to explain whether the Phantom-style option seems like a good fit or a compromise. Keep the browser evidence visible as you work: leave open the Nike Phantom product page, the 3 strongest alternative product pages, and 2 of the most useful ultimate-frisbee advice pages. At the end, give me one organized decision memo with all 8 models, the verified price range you found, which ones seem best for strict budget, best overall value, and best specifically if I want something close to the Nike Phantom feel, and tell me clearly whether buying the Nike Phantom is actually the right move for ultimate frisbee or whether I should get something else instead.","website":"https://www.als.com","level":"hard","reference_length":3,"precomputed_rubric":{"items":[{"criterion":"The final decision memo includes the Nike Phantom 6 Club FG/MG low-top men’s model plus 7 other cleat models, for 8 total models altogether, with at least 3 brands represented.","description":"The final decision memo includes the Nike Phantom 6 Club FG/MG low-top men’s model plus 7 other cleat models, for 8 total models altogether, with at least 3 brands represented.\n\nHow a grader verifies this: Check that the memo lists exactly 8 models, one of which is the specified Nike Phantom model, and that the set spans at least 3 brands.","max_points":180},{"criterion":"The Nike Phantom model’s current price is verified on the source retailer plus at least 2 other public retailer or brand pages if available, with visible sale pricing noted and common men’s sizes recorded as shown or not shown.","description":"The Nike Phantom model’s current price is verified on the source retailer plus at 
least 2 other public retailer or brand pages if available, with visible sale pricing noted and common men’s sizes recorded as shown or not shown.\n\nHow a grader verifies this: Confirm the memo cites price checks for the Phantom from 3 public pages total when available, including the source retailer, and includes sale-price notes and size-availability notes.","max_points":180},{"criterion":"For each of the 8 models, the memo records the current listed price, surface or stud description, whether it seems better suited to firm ground, multi-ground, turf, or not clearly stated, whether it is low-top or mid/high-top if shown, weight if shown, and return-policy basics or not shown.","description":"For each of the 8 models, the memo records the current listed price, surface or stud description, whether it seems better suited to firm ground, multi-ground, turf, or not clearly stated, whether it is low-top or mid/high-top if shown, weight if shown, and return-policy basics or not shown.\n\nHow a grader verifies this: Review each model entry and confirm all requested comparison fields are filled with values or explicitly marked not shown where missing.","max_points":200},{"criterion":"The comparison set includes at least 4 models priced under about $80 if such models are found during browsing, and the final memo identifies best for strict budget, best overall value, and best specifically for a Nike-Phantom-like feel.","description":"The comparison set includes at least 4 models priced under about $80 if such models are found during browsing, and the final memo identifies best for strict budget, best overall value, and best specifically for a Nike-Phantom-like feel.\n\nHow a grader verifies this: Check the listed prices and final recommendations to confirm the under-$80 target was pursued and that all 3 requested recommendation categories are explicitly named.","max_points":140},{"criterion":"The real-frisbee-use analysis uses at least 4 public pages from ultimate frisbee 
communities, reviews, guides, or discussion threads to explain soccer versus football/lacrosse cleat tradeoffs, traction, toe shape, and comfort for cutting.","description":"The real-frisbee-use analysis uses at least 4 public pages from ultimate frisbee communities, reviews, guides, or discussion threads to explain soccer versus football/lacrosse cleat tradeoffs, traction, toe shape, and comfort for cutting.\n\nHow a grader verifies this: Confirm at least 4 relevant ultimate-frisbee-oriented public pages were used and that the memo synthesizes the requested tradeoffs rather than only summarizing product listings.","max_points":170},{"criterion":"The browser evidence is left visible with the Nike Phantom product page, the 3 strongest alternative product pages, and 2 of the most useful ultimate-frisbee advice pages kept open, and the memo ends with a clear verdict on whether the Nike Phantom is actually the right move for ultimate frisbee or whether another cleat is the better choice.","description":"The browser evidence is left visible with the Nike Phantom product page, the 3 strongest alternative product pages, and 2 of the most useful ultimate-frisbee advice pages kept open, and the memo ends with a clear verdict on whether the Nike Phantom is actually the right move for ultimate frisbee or whether another cleat is the better choice.\n\nHow a grader verifies this: Inspect the open tabs to confirm the 6 requested evidence pages remain open, and check that the memo includes a direct final verdict comparing the Phantom against alternatives.","max_points":130}]}} +{"task_id":"2075861f234062f252a54c70824524b134a15860","confirmed_task":"I’m trying to figure out what I should actually do on a future first trip to the United Arab Emirates, not just get a generic top-3 list. 
Please use public travel and official attraction pages to build me a serious shortlist of exactly 12 candidate experiences across the UAE, with coverage from at least 3 different emirates and a mix of categories like landmark/observation, cultural or historic site, desert or nature experience, museum/art, and one distinctive food or market experience. For each candidate, note the name, which emirate it’s in, what type of experience it is, the source or sources recommending it, the official site if there is one, whether advance booking seems required, ticket price if shown or \"not shown,\" and any obvious timing constraint like sunset, evening, weekday-only, or seasonal best time. As you do this, keep key tabs open for the strongest candidates, including a few official attraction pages plus a few reputable recommendation pages, and use maps/photos to sanity-check that the places are real, distinct, and not awkwardly far apart for a first-time visitor. Then narrow the 12 down to the best 7 experiences for a one-week UAE trip, explain why those 7 beat the others, and organize them into a practical day-by-day plan that minimizes backtracking and groups nearby things together when possible. 
End with a concise recommendation on the top 3 must-do experiences overall, but only after showing the fuller comparison and leaving the most useful evidence tabs open.","website":"https://www.google.com","level":"hard","reference_length":3,"precomputed_rubric":{"items":[{"criterion":"Exactly 12 candidate UAE experiences are identified, spanning at least 3 different emirates and covering the requested mix of categories: landmark/observation, cultural or historic site, desert or nature experience, museum/art, and one distinctive food or market experience.","description":"Exactly 12 candidate UAE experiences are identified, spanning at least 3 different emirates and covering the requested mix of categories: landmark/observation, cultural or historic site, desert or nature experience, museum/art, and one distinctive food or market experience.\n\nHow a grader verifies this: Check the final comparison for a count of 12 total candidates, confirm emirate coverage of at least 3, and verify that all requested experience categories are represented.","max_points":200},{"criterion":"Each of the 12 candidates includes the requested comparison details: name, emirate, experience type, source or sources recommending it, official site if available, whether advance booking seems required, ticket price if shown or 'not shown,' and any obvious timing constraint.","description":"Each of the 12 candidates includes the requested comparison details: name, emirate, experience type, source or sources recommending it, official site if available, whether advance booking seems required, ticket price if shown or 'not shown,' and any obvious timing constraint.\n\nHow a grader verifies this: Inspect each candidate entry and confirm that every listed field is present and populated or marked 'not shown' where appropriate.","max_points":200},{"criterion":"The browsing session uses both recommendation sources and official attraction pages, and key tabs are kept open for the strongest candidates, 
including multiple official pages and multiple reputable recommendation pages.","description":"The browsing session uses both recommendation sources and official attraction pages, and key tabs are kept open for the strongest candidates, including multiple official pages and multiple reputable recommendation pages.\n\nHow a grader verifies this: Review the open tabs at the end and confirm that they include a mix of official attraction pages and reputable recommendation pages corresponding to the shortlisted candidates.","max_points":150},{"criterion":"Maps and/or photos are used to sanity-check the candidates so the selected places are real, distinct, and not awkwardly far apart for a first-time visitor.","description":"Maps and/or photos are used to sanity-check the candidates so the selected places are real, distinct, and not awkwardly far apart for a first-time visitor.\n\nHow a grader verifies this: Confirm that the final reasoning explicitly references map/photo checks or geographic sanity checks when comparing or selecting candidates.","max_points":100},{"criterion":"The 12 candidates are narrowed down to exactly 7 selected experiences for a one-week UAE trip, with an explanation of why those 7 were chosen over the others.","description":"The 12 candidates are narrowed down to exactly 7 selected experiences for a one-week UAE trip, with an explanation of why those 7 were chosen over the others.\n\nHow a grader verifies this: Check that exactly 7 experiences are selected from the original 12 and that the final write-up includes comparative reasons for inclusion and exclusion.","max_points":150},{"criterion":"The final output includes a practical day-by-day one-week plan that groups nearby experiences together when possible and aims to minimize backtracking.","description":"The final output includes a practical day-by-day one-week plan that groups nearby experiences together when possible and aims to minimize backtracking.\n\nHow a grader verifies this: Inspect 
the itinerary for a day-by-day structure and confirm that the sequencing rationale mentions proximity, clustering, or reduced backtracking.","max_points":100},{"criterion":"The task ends with a concise recommendation of the top 3 must-do UAE experiences overall, after the fuller comparison and planning work is complete.","description":"The task ends with a concise recommendation of the top 3 must-do UAE experiences overall, after the fuller comparison and planning work is complete.\n\nHow a grader verifies this: Check that a final top-3 summary appears after the shortlist comparison and one-week itinerary, not as a standalone shallow answer.","max_points":100}]}} +{"task_id":"3bdaf7a3557ebeb664c36ae28326cc90984c4768","confirmed_task":"I want to do a serious black history research session on the relationship between Cassius Clay, later Muhammad Ali, and Malcolm X, and I don’t want just a quick summary from one video. Please use public pages to build me a careful research brief that I could actually study from. Start by finding at least 8 credible sources total, including at least 3 primary or archival sources if available, such as interviews, speeches, letters, newspaper archives, museum or library collections, or official historical institutions, and at least 3 strong secondary sources like biographies, reputable history outlets, or university material. As you work, keep the most useful evidence tabs open, especially the best primary-source pages and at least 2 strong secondary-source pages that clearly explain the relationship.\n\nI want you to figure out the relationship as a timeline, not just a paragraph, so trace at least 6 dated milestones covering how they met, how Malcolm X influenced Clay/Ali’s religious and public identity, what changed around Ali’s rise to the heavyweight title, how the Nation of Islam split affected them, and what happened in the period before Malcolm X was assassinated. 
Where exact dates or wording are unclear, say not shown rather than guessing.\n\nThen compare how different credible sources frame the relationship. I want at least 3 points of agreement and at least 2 meaningful differences in interpretation, for example whether Malcolm X was mainly a mentor, political guide, spiritual influence, media strategist, or some combination, and how historians describe the reasons for the break between them. If a documentary clip or educational video is especially useful, include it, but anchor the research in credible written sources rather than video alone.\n\nFinish with one organized research brief that includes: a source list grouped into primary versus secondary, the 6-or-more-point timeline with dates, a concise explanation of the relationship’s development and rupture, the agreement-versus-disagreement section across sources, and a short final takeaway on why this relationship matters in Black history and in Ali’s public transformation. Leave the final research brief open along with the key evidence tabs you relied on most. 
Generate a presentation in CryptoPad Presentations I can present your findings on.","website":"https://www.youtube.com","level":"hard","reference_length":3,"precomputed_rubric":{"items":[{"criterion":"The final research brief uses at least 8 credible public sources total, including at least 3 primary or archival sources and at least 3 strong secondary sources.","description":"The final research brief uses at least 8 credible public sources total, including at least 3 primary or archival sources and at least 3 strong secondary sources.\n\nHow a grader verifies this: Check the final brief for a source list grouped by source type and confirm the count and classification match the prompt.","max_points":200},{"criterion":"The browsing session keeps the most useful evidence tabs open, including key primary-source pages and at least 2 strong secondary-source pages explaining the relationship.","description":"The browsing session keeps the most useful evidence tabs open, including key primary-source pages and at least 2 strong secondary-source pages explaining the relationship.\n\nHow a grader verifies this: Inspect the open tabs at the end for visible archival/primary evidence and at least 2 substantive secondary explanation pages.","max_points":120},{"criterion":"The final brief includes a timeline with at least 6 dated milestones tracing how Clay/Ali and Malcolm X met, how the influence developed, what changed around Ali’s heavyweight-title rise, how the Nation of Islam split affected them, and what happened before Malcolm X’s assassination.","description":"The final brief includes a timeline with at least 6 dated milestones tracing how Clay/Ali and Malcolm X met, how the influence developed, what changed around Ali’s heavyweight-title rise, how the Nation of Islam split affected them, and what happened before Malcolm X’s assassination.\n\nHow a grader verifies this: Review the timeline in the brief and confirm it contains 6 or more dated entries covering the specified 
phases; unclear details may be marked 'not shown' instead of guessed.","max_points":200},{"criterion":"The brief explains the relationship itself in a concise synthesized narrative covering Malcolm X’s influence on Clay/Ali and the causes and nature of their rupture.","description":"The brief explains the relationship itself in a concise synthesized narrative covering Malcolm X’s influence on Clay/Ali and the causes and nature of their rupture.\n\nHow a grader verifies this: Read the synthesis section and confirm it addresses both the development of the relationship and the breakdown, not just one side.","max_points":180},{"criterion":"The brief compares how credible sources frame the relationship, identifying at least 3 points of agreement and at least 2 meaningful differences in interpretation.","description":"The brief compares how credible sources frame the relationship, identifying at least 3 points of agreement and at least 2 meaningful differences in interpretation.\n\nHow a grader verifies this: Check the comparison section for the required counts and ensure the differences are interpretive, not merely wording changes.","max_points":160},{"criterion":"If a documentary clip or educational video is used, it is included as supporting material, but the research remains anchored in credible written sources rather than relying mainly on video.","description":"If a documentary clip or educational video is used, it is included as supporting material, but the research remains anchored in credible written sources rather than relying mainly on video.\n\nHow a grader verifies this: Confirm any included video is supplementary and that the brief’s core claims are supported by the written primary and secondary sources listed.","max_points":40},{"criterion":"The final organized research brief is left open and contains the grouped source list, the dated timeline, the relationship summary, the agreement-versus-disagreement section, and a short explanation of why the 
relationship matters in Black history and Ali’s public transformation, and a CryptoPad Presentation is also created so the findings can be presented.","description":"The final organized research brief is left open and contains the grouped source list, the dated timeline, the relationship summary, the agreement-versus-disagreement section, and a short explanation of why the relationship matters in Black history and Ali’s public transformation, and a CryptoPad Presentation is also created so the findings can be presented.\n\nHow a grader verifies this: Inspect the final open page and confirm all requested sections are present and clearly organized; an open CryptoPad Presentation containing the timeline, source comparison, and takeaway is available.","max_points":100}]}} +{"task_id":"4d033a71318c5a2869b0f7132b0bae578bb75876","confirmed_task":"I want to figure out which core exercises are actually worth adding to a powerlifting routine, not just get a quick summary of one Reddit thread. Start by searching Reddit and open at least 8 substantial public discussion threads from places like r/Fitness, r/powerlifting, or similar lifting communities that talk about ab or core work for squat, bench, and deadlift carryover. From those threads, pull the exercises that come up repeatedly and note the main reason people recommend each one, like bracing strength, anti-extension, anti-rotation, direct ab hypertrophy, or lower-back stability. Then sanity-check those Reddit favorites against at least 5 credible public coaching or exercise-technique sources, and open demo pages or videos for the most commonly mentioned movements so it’s clear what each exercise actually is. 
Build me one organized final recommendation memo with at least 12 distinct exercises total, grouped by purpose, and for each one include a short note on why lifters use it, what equipment it needs, whether it seems better for beginners or more advanced lifters, and whether it looks like a high-confidence pick, mixed-opinion pick, or probably overrated pick based on the sources you reviewed. Also make a final shortlist of exactly 6 exercises I could realistically rotate into a powerlifting accessory setup: 2 bodyweight-only options, 2 gym-machine-or-cable options, and 2 heavily loaded options. Keep the most useful Reddit threads, at least 2 coaching-source tabs, and at least 2 exercise demo tabs open at the end so I can review the evidence myself. Write up this information in CryptoPad Documents for a comprehensive workout guide and plan.","website":"https://www.reddit.com","level":"hard","reference_length":6,"precomputed_rubric":{"items":[{"criterion":"At least 8 substantial public Reddit discussion threads about ab/core work for powerlifting or carryover to squat, bench, and deadlift are found and used.","description":"At least 8 substantial public Reddit discussion threads about ab/core work for powerlifting or carryover to squat, bench, and deadlift are found and used.\n\nHow a grader verifies this: Check that the browsing session includes at least 8 relevant Reddit thread pages and that the final memo draws exercise recommendations from across those threads rather than from only one thread.","max_points":160},{"criterion":"The final memo identifies at least 12 distinct exercises total and groups them by purpose such as bracing strength, anti-extension, anti-rotation, direct ab hypertrophy, or lower-back stability.","description":"The final memo identifies at least 12 distinct exercises total and groups them by purpose such as bracing strength, anti-extension, anti-rotation, direct ab hypertrophy, or lower-back stability.\n\nHow a grader verifies this: Check the 
final memo for at least 12 unique exercise entries and visible grouping by training purpose.","max_points":200},{"criterion":"For each exercise in the final memo, there is a short note covering why lifters use it, what equipment it needs, whether it seems better for beginners or more advanced lifters, and whether it is labeled high-confidence, mixed-opinion, or probably overrated.","description":"For each exercise in the final memo, there is a short note covering why lifters use it, what equipment it needs, whether it seems better for beginners or more advanced lifters, and whether it is labeled high-confidence, mixed-opinion, or probably overrated.\n\nHow a grader verifies this: Review the final memo and confirm that every listed exercise includes all four requested fields.","max_points":180},{"criterion":"The Reddit-derived favorites are sanity-checked against at least 5 credible public coaching or exercise-technique sources.","description":"The Reddit-derived favorites are sanity-checked against at least 5 credible public coaching or exercise-technique sources.\n\nHow a grader verifies this: Check that at least 5 non-Reddit coaching or technique sources were opened and used in the synthesis, and that the memo reflects cross-checking rather than Reddit-only aggregation.","max_points":150},{"criterion":"Demo pages or videos are opened for the most commonly mentioned movements so it is clear what the exercises actually are.","description":"Demo pages or videos are opened for the most commonly mentioned movements so it is clear what the exercises actually are.\n\nHow a grader verifies this: Confirm that exercise demo pages or videos were opened for the commonly discussed movements and that at least 2 such demo tabs remain open at the end.","max_points":110},{"criterion":"The final memo includes exactly 6 practical rotation picks: 2 bodyweight-only options, 2 gym-machine-or-cable options, and 2 heavily loaded options, and the final material is written up in CryptoPad 
Documents as a comprehensive workout guide and plan.","description":"The final memo includes exactly 6 practical rotation picks: 2 bodyweight-only options, 2 gym-machine-or-cable options, and 2 heavily loaded options, and the final material is written up in CryptoPad Documents as a comprehensive workout guide and plan.\n\nHow a grader verifies this: Check the final recommendation section for exactly 6 exercises split into the requested 2/2/2 categories; the final deliverable includes an open CryptoPad Document containing the workout guide and plan.","max_points":120},{"criterion":"The most useful evidence tabs are left open at the end, including Reddit threads, at least 2 coaching-source tabs, and at least 2 exercise demo tabs.","description":"The most useful evidence tabs are left open at the end, including Reddit threads, at least 2 coaching-source tabs, and at least 2 exercise demo tabs.\n\nHow a grader verifies this: Inspect the final browser state and confirm that useful evidence tabs remain open across those three source types.","max_points":80}]}} +{"task_id":"2a8418c2dccdaf5fe23ff143745cc5659d35fc69","confirmed_task":"I want to buy a pair of wired earbuds on Amazon, but I care much more about durability than hype, so please turn this into a real buying decision instead of just grabbing the first listing. Start by finding 10 to 12 plausible wired earbud or IEM models that are still easy to buy from major public pages, with Amazon included whenever possible, and keep the scope focused on everyday-use wired options rather than studio gear. For each candidate, compare the things that usually matter for longevity: cable thickness or strain relief, whether the cable is detachable, connector type, inline mic if shown, warranty length if publicly stated, price, and any obvious build-quality cues from the photos and product details. 
Then cross-check those candidates with at least one credible review or detailed discussion source per model so we are not relying on retailer marketing alone, and pay special attention to recurring failure complaints like one side dying, weak plugs, or bad strain relief. After that, narrow the list to exactly 4 finalists that look strongest on durability-for-price, and for each finalist keep open the Amazon listing plus one supporting non-Amazon evidence page. Among those 4, tell me which one is the best overall pick, which one is the best budget pick, which one is best if I want a detachable cable for easier long-term replacement, and which one should be avoided even if the sound looks attractive because the durability evidence is weak. Put the final comparison into one short decision memo organized by candidate, with prices marked as shown or not shown, and leave the most useful tabs open so I can review the finalists and buy from Amazon afterward. Use CryptoPad documents, and also store the different options in a separate CrpytoPad Spreadsheet.","website":"https://www.amazon.com","level":"hard","reference_length":6,"precomputed_rubric":{"items":[{"criterion":"A comparison set of 10 to 12 plausible wired earbud or IEM models is assembled from public pages, with Amazon included whenever possible and the scope kept to everyday-use wired options rather than studio gear.","description":"A comparison set of 10 to 12 plausible wired earbud or IEM models is assembled from public pages, with Amazon included whenever possible and the scope kept to everyday-use wired options rather than studio gear.\n\nHow a grader verifies this: Check that the final memo lists 10 to 12 distinct candidate models and that the browsing session includes corresponding public product pages, including Amazon pages where available.","max_points":160},{"criterion":"For each candidate, the memo compares the requested durability-related factors: cable thickness or strain relief, detachable-cable 
status, connector type, inline mic if shown, warranty length if publicly stated, price, and visible build-quality cues from photos or product details.","description":"For each candidate, the memo compares the requested durability-related factors: cable thickness or strain relief, detachable-cable status, connector type, inline mic if shown, warranty length if publicly stated, price, and visible build-quality cues from photos or product details.\n\nHow a grader verifies this: Check that every listed candidate has entries for each requested factor, using 'not shown' where necessary instead of leaving fields ambiguous.","max_points":180},{"criterion":"Each candidate is cross-checked with at least one credible review or detailed discussion source beyond the retailer page, with attention to recurring failure complaints such as one side dying, weak plugs, or bad strain relief.","description":"Each candidate is cross-checked with at least one credible review or detailed discussion source beyond the retailer page, with attention to recurring failure complaints such as one side dying, weak plugs, or bad strain relief.\n\nHow a grader verifies this: Check that every candidate has at least one linked or cited non-retailer evidence source in the memo and that recurring durability complaints or positive durability signals are summarized for each.","max_points":200},{"criterion":"The list is narrowed to exactly 4 finalists that appear strongest on durability-for-price.","description":"The list is narrowed to exactly 4 finalists that appear strongest on durability-for-price.\n\nHow a grader verifies this: Check that the final section names exactly 4 finalists and that they are presented as the narrowed shortlist rather than an open-ended set.","max_points":120},{"criterion":"For each of the 4 finalists, the browser keeps open the Amazon listing plus one supporting non-Amazon evidence page.","description":"For each of the 4 finalists, the browser keeps open the Amazon listing plus 
one supporting non-Amazon evidence page.\n\nHow a grader verifies this: Check that there are 8 relevant finalist tabs left open in total: 4 Amazon listing pages and 4 corresponding non-Amazon supporting pages.","max_points":140},{"criterion":"The final decision memo identifies one best overall pick, one best budget pick, one best choice for detachable-cable longevity, and one option to avoid despite appealing sound if the durability evidence is weak.","description":"The final decision memo identifies one best overall pick, one best budget pick, one best choice for detachable-cable longevity, and one option to avoid despite appealing sound if the durability evidence is weak.\n\nHow a grader verifies this: Check that all four requested verdict categories are explicitly filled with named models and short justifications tied to the collected evidence.","max_points":120},{"criterion":"The final comparison is delivered as one short decision memo organized by candidate, with prices marked as shown or not shown, and the most useful finalist tabs are left open for follow-up Amazon purchase, and the decision memo is created in CryptoPad Documents and the compared options are also stored in a separate CryptoPad Spreadsheet.","description":"The final comparison is delivered as one short decision memo organized by candidate, with prices marked as shown or not shown, and the most useful finalist tabs are left open for follow-up Amazon purchase, and the decision memo is created in CryptoPad Documents and the compared options are also stored in a separate CryptoPad Spreadsheet.\n\nHow a grader verifies this: Check that there is a single organized memo containing the requested candidate-by-candidate comparison and that the useful finalist tabs remain open at the end; the final deliverables include both an open CryptoPad Document memo and a separate CryptoPad Spreadsheet of the options.","max_points":80}]}} diff --git a/packages/evals/datasets/odysseysbench/source/tasks.json 
b/packages/evals/datasets/odysseysbench/source/tasks.json new file mode 100644 index 0000000000..dc31d72a61 --- /dev/null +++ b/packages/evals/datasets/odysseysbench/source/tasks.json @@ -0,0 +1,8532 @@ +[ + { + "task_id": "440ed7f388a2a4528a8d9fb75f83e11f934b5b5d", + "confirmed_task": "I’m putting together a small TV watchlist and want to anchor it around The Pitt first, so please go to Hulu and open the actual show page for The Pitt to confirm what service it’s on, then leave that tab open so I can see the listing myself. Once you’ve confirmed that, use Wikipedia to look up the TV series Ponies and pull the main cast names from the series page so I can compare who’s in a different show; if the cast is listed on the page, open the Ponies article itself and keep that tab available too just so I can glance at it. Then round out the watchlist with something older by going to Memory Alpha and finding the entry for Amok Time, and grab the key details from that page including which Star Trek series it belongs to, the season and episode number, and the original air date. 
Please give me everything back in one concise summary with the streaming service for The Pitt, the Ponies cast list, and the Amok Time details, and keep the Hulu and Memory Alpha pages open in separate tabs so I have visual proof.", + "website": "https://www.google.com", + "reference_length": 4, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "The agent opens the Hulu page for “The Pitt,” confirms the service as Hulu, and preserves the page as visual proof.", + "verification": "A grader can see the Hulu title page for “The Pitt” open in a browser tab and the final answer states the service is Hulu.", + "weight": 0.3 + }, + "R2": { + "requirement": "The agent extracts the main cast members for the TV series “Ponies” from the Wikipedia series page exactly as shown there.", + "verification": "A grader can inspect the open Wikipedia article for “Ponies” and compare the listed cast names against the final answer.", + "weight": 0.3 + }, + "R3": { + "requirement": "The agent reports the Memory Alpha details for “Amok Time,” including the Star Trek series, season and episode number, and original air date.", + "verification": "A grader can view the open Memory Alpha page for “Amok Time” and confirm the final answer includes the correct series title, season/episode notation, and air date.", + "weight": 0.25 + }, + "R4": { + "requirement": "The agent provides one concise combined summary covering the Hulu service for The Pitt, the Ponies cast list, and the Amok Time episode details.", + "verification": "The final response is a single concise summary containing all three requested result groups without omitting any required field.", + "weight": 0.15 + } + }, + "categories": [ + "Arts & Entertainment > Streaming & Online TV", + "Reference Materials > Dictionaries and Encyclopedias" + ], + "num_categories": 2 + }, + { + "task_id": "2cb0ed2a5df6053c6c982a5c5d436d25e006370f", + "confirmed_task": "I’m putting together a really simple Baltimore event night plan and just 
want the official pages open so I know I’m starting from the right places. Please go to the official Please Find Your Seat site and grab the homepage URL for me, because that’s the event platform I want to use as the anchor for the night. Then find the official Pier 5 Hotel Baltimore website URL so I have a nearby hotel option tied to the same outing, and leave that hotel page open in its own tab so I can look at it afterward. After that, open The Capital Grille’s official homepage and note the restaurant brand name exactly as it appears there, since I want a recognizable dinner option to mention alongside the hotel and event plan. Keep the key pages open in separate tabs and give me a short planning summary with the Please Find Your Seat homepage URL, the Pier 5 Hotel Baltimore official URL, and The Capital Grille homepage URL plus the brand name shown on the page.", + "website": "https://www.google.com", + "reference_length": 4, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Provide the official homepage URL for the Please Find Your Seat event platform.", + "verification": "Grader confirms the returned URL is the official Please Find Your Seat homepage and matches the page opened in the browser.", + "weight": 0.27 + }, + "R2": { + "requirement": "Provide the official website URL for Pier 5 Hotel Baltimore and keep the hotel page open in its own tab.", + "verification": "Grader confirms the returned URL is the official Pier 5 Hotel Baltimore site and that the browser shows the hotel page open.", + "weight": 0.28 + }, + "R3": { + "requirement": "Provide The Capital Grille official homepage URL and identify the restaurant brand name shown on the homepage.", + "verification": "Grader confirms the URL is The Capital Grille official homepage and the reported brand name matches visible text on the page.", + "weight": 0.28 + }, + "R4": { + "requirement": "Return all four findings together in a short planning summary covering the event platform, hotel, 
dinner option.", + "verification": "Grader confirms the final response includes the Please Find Your Seat URL, Pier 5 Hotel Baltimore URL, The Capital Grille URL plus brand name.", + "weight": 0.17 + } + }, + "categories": [ + "Community and Society > Community and Society - Other", + "Travel and Tourism > Accommodation and Hotels", + "Food and Drink > Restaurants and Delivery" + ], + "num_categories": 3 + }, + { + "task_id": "082aa17f3e88c3ce10796244e3677c5643dd19c9", + "confirmed_task": "I’m setting up a new kitchen and want one of the first things I make to be roasted Brussels sprouts, so could you start on Google and find me a recipe that clearly uses both Parmesan and balsamic vinegar, then open the actual recipe page and note the title, oven temperature, and cook time because I want to make sure the cookware I buy fits that kind of roasting setup. Once you’ve got that recipe open, head to Le Creuset and look for a light green Dutch oven, and specifically check whether the 5.5 qt size is offered in that color so I can see if it would work for recipes in that range; please open the product page itself and leave it open so I can look at the color and size options on the page. While you’re at it, I’m also sorting out kitchen appliances before I start cooking, so go to YouTube and find a practical video about whether a dishwasher installation needs an air gap, open the video page, start playing it, and tell me the main decision points like when an air gap is required, when a high loop is used instead, and why the air gap exists in the first place. 
Please keep the recipe tab, the Le Creuset product tab, and the YouTube video tab open in separate tabs so I can compare everything visually afterward.", + "website": "https://www.google.com", + "reference_length": 4, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "A roasted Brussels sprouts recipe was found and the actual recipe page shows both Parmesan and balsamic vinegar, along with the recipe title, oven temperature, and cook time.", + "verification": "Grader can confirm the open recipe page contains the recipe title and visible recipe details, and the ingredients or recipe text visibly includes Parmesan and balsamic vinegar.", + "weight": 0.35 + }, + "R2": { + "requirement": "A light green Le Creuset Dutch oven product page was opened and the agent correctly confirmed whether the 5.5 qt size is available in that color, including product name and color.", + "verification": "Grader can inspect the open Le Creuset product page and see the product name, the selected or referenced light green color, and visible size options or product details indicating whether 5.5 qt is available.", + "weight": 0.3 + }, + "R3": { + "requirement": "A relevant YouTube video about dishwasher air gaps was opened and played, and the summary captures when an air gap is required, the high loop alternative, and why an air gap is used.", + "verification": "Grader can confirm the YouTube video page is open with playback started and that the reported summary matches the topic and covers requirement conditions, alternatives, and purpose.", + "weight": 0.25 + }, + "R4": { + "requirement": "The browser state preserves visual proof by keeping the recipe page, Le Creuset product page, and YouTube video page open in separate tabs.", + "verification": "Grader can inspect the browser tab bar and confirm all three relevant pages remain open for side-by-side review.", + "weight": 0.1 + } + }, + "categories": [ + "Food and Drink > Cooking and Recipes", + "Ecommerce & Shopping > Ecommerce 
and Shopping - Other", + "Home and Garden > Home Improvement and Maintenance" + ], + "num_categories": 3 + }, + { + "task_id": "1211fbaa646424ab75869c0379431d5d049d2c9b", + "confirmed_task": "I’m trying to build a Father’s Day gift shortlist for someone who likes practical tools but also wears casual, outdoorsy stuff, so can you help me compare a few very different options in the browser and keep the promising pages open in separate tabs so I can look at them afterward? Start on Home Depot and pull up the exact product page for Milwaukee model 48-11-2450, then note the current price because I want to use that battery as the practical baseline gift. After that, go to Tecovas and find one men’s cowboy boot made of real leather that actually looks like it could still work with casual athletic wear, not just full western styling, and open the actual product page so I can see the photos and color choices. Then check Foot Locker for the Nike Ja 3 and tell me the current price and whether it comes in under $60, since I’m trying to see if that’s the budget footwear option compared with the Tecovas boot. Once you’ve seen those prices, go to lululemon’s men’s We Made Too Much section and pick one breathable men’s athletic item that would pair well with whichever footwear option seems more realistic based on the earlier pricing, and leave that product page open too. Finally, on Old Navy, find one men's jogger that looks comfortable and affordable just so I have a separate apparel reference point for overall gift-value shopping, and open the actual listing page. 
At the end, give me a concise shortlist with each item’s name, price, link, and a quick note on why it fits the overall gift plan.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "The exact Home Depot product page for Milwaukee model 48-11-2450 is opened and the current listed price is reported.", + "verification": "Grader can confirm the Home Depot tab shows model 48-11-2450 on the product page and that the response includes the matching price and link.", + "weight": 0.2 + }, + "R2": { + "requirement": "A Tecovas men's cowboy boot made of real leather is selected from its actual product page, with the product name, current price, and page link reported.", + "verification": "Grader can confirm the Tecovas tab is a men's boot product page indicating leather construction, and that the response includes the boot name, its listed price, and the product page link.", + "weight": 0.2 + }, + "R3": { + "requirement": "The Foot Locker Nike Ja 3 product page is used to report the current price and explicitly state whether it is under $60.", + "verification": "Grader can confirm the Foot Locker tab shows a Nike Ja 3 listing and that the response includes the visible price plus a correct under-$60 judgment.", + "weight": 0.2 + }, + "R4": { + "requirement": "One men’s breathable lululemon We Made Too Much item is chosen with its name and sale price, and the choice is aligned with whichever footwear option seems more realistic from earlier pricing.", + "verification": "Grader can confirm the lululemon tab is in Men’s We Made Too Much or a product reached from it, shows a sale price, and the final note references the earlier Tecovas vs. 
Nike Ja 3 price comparison.", + "weight": 0.25 + }, + "R5": { + "requirement": "One Old Navy men's jogger is selected from its product page with the product name and current price reported.", + "verification": "Grader can confirm the Old Navy tab is an actual men's jogger product listing and that the response includes the matching name, price, and link.", + "weight": 0.15 + } + }, + "categories": [ + "Ecommerce & Shopping > Ecommerce and Shopping - Other", + "Lifestyle > Fashion and Apparel" + ], + "num_categories": 2 + }, + { + "task_id": "156e2acc95361db4145f5bc313abb63807750089", + "confirmed_task": "I’m helping a veteran’s family pull together one practical benefits note while they wait on a few moving pieces, and I want it grounded in the actual pages so I can double-check things later. Please start on DFAS and find the exact form a surviving spouse would use to begin an SBP annuity claim, then open the actual form or claim page so you can confirm the form name and number and leave that DFAS page open as proof. Once you have that survivor-benefit piece, go to Reddit and look through VA benefits_claims discussions about getting more detail on an in-progress VA claim than the normal VA.gov tracker shows, especially references to the benefits_claims API endpoint or similar methods people are using, and open the most useful discussion thread in its own tab so I can see the comments myself. After that, use Google to get to the IRS guidance on cash gifts and figure out whether receiving money from relatives is taxable, who would be responsible for any gift tax reporting, and when a gift tax return is required, because the family may need temporary help while waiting on benefits; please open the actual IRS page, not just a summary site. Then use Google again to find a reliable source explaining what happens to SNAP benefits during a U.S. 
federal government shutdown, so we can understand whether food assistance usually continues, whether there are timing risks or exceptions, and what the practical takeaway is for monthly planning; open the source page you rely on so I can verify it. In the end, give me one short plain-language brief that starts with the SBP form name and number, then covers the VA claim-status tip from Reddit, then the IRS cash-gift guidance, and ends with the SNAP shutdown impact.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Identify the exact DFAS form needed to start an SBP spouse annuity claim, including the correct form name and form number.", + "verification": "Grader can confirm the response matches the DFAS page left open showing the SBP spouse annuity claim form details.", + "weight": 0.28 + }, + "R2": { + "requirement": "Summarize a Reddit-sourced method for getting additional status information on an in-progress VA claim beyond the standard tracker, including what extra details the benefits_claims API endpoint or similar approach can reveal.", + "verification": "Grader can inspect the open Reddit discussion tab and verify the summary reflects the thread’s discussion of added claim-status detail beyond the normal tracker.", + "weight": 0.24 + }, + "R3": { + "requirement": "Summarize official IRS guidance on cash gifts, including whether the recipient owes tax, whether the giver may have gift-tax responsibility, and when a gift tax return is required.", + "verification": "Grader can compare the response against the open IRS page reached via Google and confirm the tax responsibility and filing-threshold explanation is accurate.", + "weight": 0.22 + }, + "R4": { + "requirement": "Summarize reliable information on how a U.S. 
federal government shutdown affects SNAP benefits, including whether benefits generally continue and any exceptions, delays, or timing risks.", + "verification": "Grader can review the open source page found through Google and confirm the answer accurately reflects continuity of benefits and any caveats.", + "weight": 0.16 + }, + "R5": { + "requirement": "Return the findings as one short plain-language brief that begins with the SBP form name/number and then covers the VA claim-status tip, IRS cash-gift guidance, and SNAP shutdown impact in that order.", + "verification": "Grader can check the final response structure, ordering, and brevity against the requested sequence and confirm all four topics are included.", + "weight": 0.1 + } + }, + "categories": [ + "Law and Government > Government", + "Finance > Insurance" + ], + "num_categories": 2 + }, + { + "task_id": "0ab48db6076089bbcf42047d162009a50eb9ca50", + "confirmed_task": "I’m trying to put together a simple, semi-healthy meal prep menu for the week, mostly so I can batch-cook lunches and have a couple of make-ahead desserts ready too. Could you start on Google and search for an easy Instant Pot lentils recipe, then open the actual recipe page and pull out the basics I’d need for meal prep: what type of lentils it uses, the water-to-lentil ratio, and the cook time? Please leave that recipe tab open so I can glance at it later. Then go to Ambitious Kitchen and find a healthy turkey chili recipe that includes extra vegetables, because I want something hearty to pair with the lentils for lunches; open the recipe itself and note the exact recipe title plus at least two added vegetables from the ingredients list, and keep that page open in its own tab too so I can compare the two recipes side by side. After that, on Feel Good Foodie, look up recipes for halva, chia pudding, and pecan bars, open the actual recipe page for each one in separate tabs, and grab the page URLs so I have the real recipe links. 
Once you’ve seen all three, pick the two dessert options that seem most practical for make-ahead prep and tell me which two you’d choose. At the end, give me one concise meal prep summary that explains the lentil base, the turkey chili pairing, and the two dessert picks with their Feel Good Foodie recipe URLs.", + "website": "https://www.google.com", + "reference_length": 4, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Identify an easy Instant Pot lentils recipe found via Google and capture the lentil type, water ratio, and cook time from the opened recipe page.", + "verification": "Grader can confirm a Google search was used, an actual recipe page was opened and left available, and the reported lentil type, ratio, and cook time match visible recipe instructions.", + "weight": 0.28 + }, + "R2": { + "requirement": "Find a healthy turkey chili recipe on Ambitious Kitchen, open the recipe page, and record the exact recipe title plus at least two vegetables from the ingredients list.", + "verification": "Grader can verify the browser is on an Ambitious Kitchen recipe page, the title matches the visible page title, and at least two reported vegetables appear in the ingredients list.", + "weight": 0.24 + }, + "R3": { + "requirement": "Locate Feel Good Foodie recipe pages for halva, chia pudding, and pecan bars, open each in its own tab, and capture the correct recipe page URL for each.", + "verification": "Grader can confirm three Feel Good Foodie recipe tabs are open for halva, chia pudding, and pecan bars, and the provided URLs correspond to those visible pages.", + "weight": 0.24 + }, + "R4": { + "requirement": "Produce a concise meal prep summary that explains the lentil base, the turkey chili pairing, and selects the two most practical make-ahead dessert options from the Feel Good Foodie results with their URLs.", + "verification": "Grader can compare the final summary against the gathered recipe details and confirm that exactly two dessert choices 
were selected from the three Feel Good Foodie pages and that their URLs are included.", + "weight": 0.24 + } + }, + "categories": [ + "Food and Drink > Cooking and Recipes" + ], + "num_categories": 1 + }, + { + "task_id": "9cf9ea2af003a1efe18b079f18f4824cc581ccb0", + "confirmed_task": "I’m putting together a super short CapCut quick-start note for a friend who edits on a Mac and is totally new to CapCut Desktop, so could you help me pull the pieces together in a way that follows a real beginner workflow? Start on Google and find a beginner-friendly source that shows how to make a plain black screen clip in CapCut Desktop, then open the actual result page and keep it open so I can see the instructions myself. After that, still using Google, find a clear source that explains how to reorder or move clips around in the CapCut timeline, because I want the note to explain how to drop that black screen into the right spot in a project; open that source in its own tab too so I can compare the two instructions side by side. Then go to CapCut’s own website and find their instructions for adding curved text, since I want one slightly more advanced text trick in the same note, and leave the CapCut page open as proof of the official steps. Finally, go to Apple Support and find Apple’s official instructions for calibrating a Mac display with Display Calibrator Assistant, because if the screen is off then the black screen and text styling can look wrong; open the Apple page and pull out the key steps for actually running the assistant. 
When you’re done, give me one compact how-to note that ties these together as a simple workflow: make the black screen, move it into place on the timeline, add curved text if needed, and then calibrate the Mac display if colors or contrast look off.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "A beginner-friendly source for creating a blank black screen clip in CapCut Desktop is found, opened, and summarized accurately.", + "verification": "Grader can confirm an open non-Google instruction page showing CapCut black screen creation guidance and a matching summary in the final note.", + "weight": 0.24 + }, + "R2": { + "requirement": "A clear source for reordering or moving clips in the CapCut timeline is found, opened in its own tab, and summarized so the black screen can be positioned correctly.", + "verification": "Grader can confirm a separate open instruction page about moving or reordering CapCut timeline clips and a summary that explains repositioning the black screen clip.", + "weight": 0.22 + }, + "R3": { + "requirement": "CapCut’s own site is used to find and summarize the official steps for adding curved text to a video project.", + "verification": "Grader can confirm an open CapCut domain page with curved text guidance and a final summary that clearly attributes the curved text process to CapCut’s official site.", + "weight": 0.22 + }, + "R4": { + "requirement": "Apple’s official instructions for running Display Calibrator Assistant on a Mac are found and the key steps are summarized accurately.", + "verification": "Grader can confirm an open Apple Support page about display calibration and a summary including the main steps to launch and use Display Calibrator Assistant.", + "weight": 0.18 + }, + "R5": { + "requirement": "The final response is one compact quick-start note that combines the black screen, clip reordering, curved text, and Mac display calibration guidance into 
a single simple editing workflow.", + "verification": "Grader can confirm the final output is a single concise note that integrates all four topics in a logical sequence rather than listing them as unrelated facts.", + "weight": 0.14 + } + }, + "categories": [ + "Computers Electronics and Technology > Graphics Multimedia and Web Design", + "Science and Education > Education" + ], + "num_categories": 2 + }, + { + "task_id": "65c9e7d383e015d946572f04512d5aa166a8f015", + "confirmed_task": "I’m putting together a quick, accessible study workflow for myself on latent growth models, and I want it to feel grounded in what I’d actually see in a browser rather than just a generic summary. Please start on Bing and search for “Latent Growth Models,” then look only at the main organic web results on the first page and pull out the top three with their titles, source sites, and a one-line note on what each seems to cover, because I want to know which sources are most visible right away. Open those three results in separate tabs and leave the search results page open too so I can compare them afterward. Once you’ve got that context, use Google to find one solid ChatGPT prompt template for turning source material into an outline, ideally from a page that actually shows the full prompt wording, because I want to reuse that structure for my own study note; open the page with the prompt and copy the exact template text. After that, go to 10015.io and use its bionic reading converter on this exact note text: “Latent growth models are statistical methods used to estimate change over time across individuals. They help researchers understand trajectories, differences in growth, and predictors of change.” I want the converted version in bionic reading style so it’s easier for me to scan, and please leave the converter result visible on the page as proof. 
Then finish by using Bing to find a free plagiarism checker that would work for a short note like this, open the actual tool page so I can verify it’s live, and tell me briefly how I’d paste in that same note and run an originality check before sharing it with classmates.", + "website": "https://www.google.com", + "reference_length": 4, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Identify the top 3 organic Bing results for the query “Latent Growth Models,” including each result’s title, source, and a brief summary of what it covers.", + "verification": "Grader confirms the Bing results page shows the query, the response lists three organic results matching visible titles/source labels from the page, and the relevant result tabs were opened.", + "weight": 0.35 + }, + "R2": { + "requirement": "Find one strong ChatGPT prompt template for turning source material into an outline and provide the exact prompt wording from the selected page.", + "verification": "Grader confirms a Google search was performed, a source page containing a visible prompt template was opened, and the returned wording matches the prompt shown on that page.", + "weight": 0.25 + }, + "R3": { + "requirement": "Convert the provided latent growth models note into bionic reading format and return the converted text.", + "verification": "Grader confirms the 10015.io converter page shows the input note and a visible converted output in bionic reading style, and the response includes the transformed text.", + "weight": 0.2 + }, + "R4": { + "requirement": "Find one free plagiarism checker and explain how to use it to test the same study note for originality.", + "verification": "Grader confirms a Bing search for plagiarism checkers was performed, a live tool page was opened, and the response names the tool and gives concise usage instructions tied to the provided note.", + "weight": 0.2 + } + }, + "categories": [ + "Science and Education > Science and Education - Other", + "Science and 
Education > Education" + ], + "num_categories": 2 + }, + { + "task_id": "62689a0f29cfc6bd0b4510c240dec4de17a361e5", + "confirmed_task": "I want to send a friend a quick little roundup that mixes pop-culture, memes, and actual news, so could you start on Reddit and find a discussion thread about Coraline, open the actual thread page, and grab the exact thread title for me because I want the wording to be precise. While you’re still on Reddit, go over to r/starterpacks and pick two recent meme-style posts that feel funny enough to sit next to the Coraline item, and open each of those posts in its own tab so I can compare them later and make sure they’re really the kind of starterpack jokes I’d send to someone. Then switch to CNN and find the latest breaking news story about Iran, open the article itself, and note the exact headline plus the publication time so I can include one current-events item that’s clearly sourced. In the end, give me a short, friend-ready update that includes the Coraline thread title, the two r/starterpacks post titles, the CNN Iran headline with its publication time, and a natural sounding summary of these.", + "website": "https://www.google.com", + "reference_length": 4, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "The response includes the exact title of an opened Reddit discussion thread about Coraline.", + "verification": "Grader can confirm the title matches the text visible on the Reddit thread page.", + "weight": 0.25 + }, + "R2": { + "requirement": "The response includes the titles of two recent r/starterpacks posts that are clearly meme-style/funny posts, and both selected posts are opened in separate tabs.", + "verification": "Grader can confirm the two visible Reddit post pages or tabs correspond to r/starterpacks posts and that their titles match the reported titles.", + "weight": 0.31 + }, + "R3": { + "requirement": "The response includes CNN’s latest breaking news story about Iran with both the exact headline 
and the publication time.", + "verification": "Grader can verify the opened CNN article page shows the reported headline and publication time and that it is the latest applicable Iran breaking news item found on CNN.", + "weight": 0.31 + }, + "R4": { + "requirement": "All gathered items are combined into a short, friend-ready mixed-interest update.", + "verification": "Grader can confirm the final write-up includes all four content elements in a concise roundup format suitable to send to a friend.", + "weight": 0.13 + } + }, + "categories": [ + "Computers Electronics and Technology > Social Media Networks", + "Arts & Entertainment > Arts and Entertainment - Other", + "News & Media Publishers" + ], + "num_categories": 3 + }, + { + "task_id": "39255449e341c41a589b8a4e17f073be3a4809c9", + "confirmed_task": "I’m trying to buy a pair of Kobe Bryant sneakers for my son’s birthday, so I want a quick but trustworthy read on what’s coming out soon and what I could actually buy right now. Please start on Sole Collector and look for upcoming Kobe Bryant signature-line releases, pulling the shoe names and release dates from the actual release coverage pages if they’re listed, and leave the most relevant Sole Collector page open so I can glance at it myself. Then check Sneaker News for upcoming Kobe release dates and any useful details like colorways or launch context, and open the main article you used in its own tab so I can compare it side by side with Sole Collector. After that, go to Foot Locker and see whether they have any Kobe entries on their release calendar or product pages, and note whether anything looks upcoming versus currently available; if you find a relevant release or product page, keep that open too so I can visually verify it. 
Finally, go to Nike and search for Kobe shoes that are actually in stock right now, open any live purchasable product pages you find in separate tabs, and if there’s an available pair, select a common men’s size like 10 if the page allows it just so I can see that it’s really buyable. At the end, give me a concise summary that combines the upcoming release names and dates from Sole Collector, Sneaker News, and Foot Locker with source attribution, plus a note on any in-stock Kobe pairs you found on Nike and which product tabs you left open for me.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Identify upcoming Kobe Bryant signature sneaker releases from Sole Collector with release names and dates when listed.", + "verification": "Grader can confirm the open Sole Collector page shows Kobe release coverage and that the reported summary includes matching shoe names and dates from that page.", + "weight": 0.22 + }, + "R2": { + "requirement": "Identify upcoming Kobe release dates and relevant details from Sneaker News and keep the source page open in its own tab.", + "verification": "Grader can verify a Sneaker News tab is open with Kobe release information and that the summary includes dates and details visible on that page.", + "weight": 0.22 + }, + "R3": { + "requirement": "Check Foot Locker for Kobe release calendar entries or related product pages and determine whether items are upcoming, available, or unavailable.", + "verification": "Grader can confirm an open Foot Locker page relevant to Kobe releases or products and that the reported availability status matches what is visible on the page.", + "weight": 0.18 + }, + "R4": { + "requirement": "Find any currently in-stock Kobe shoes on Nike, open the live product pages in separate tabs, and attempt to select a common men's size like 10 when possible.", + "verification": "Grader can verify one or more Nike product tabs are open for Kobe 
shoes and that the page shows in-stock/purchasable state, including visible size selection if available.", + "weight": 0.23 + }, + "R5": { + "requirement": "Provide a concise consolidated summary of upcoming Kobe release names and dates from Sole Collector, Sneaker News, and Foot Locker with source attribution, plus note any in-stock Nike pairs found and which tabs were left open.", + "verification": "Grader can compare the final written summary against the open source pages and confirm source attribution, release-date consolidation, and mention of the Nike product pages left open.", + "weight": 0.15 + } + }, + "categories": [ + "Ecommerce & Shopping > Ecommerce and Shopping - Other", + "Sports > Sports - Other", + "Lifestyle > Fashion and Apparel" + ], + "num_categories": 3 + }, + { + "task_id": "0ce94d4e773eff1042a6920232f929a1da98c44d", + "confirmed_task": "I’m trying to put together an all-black going-out outfit and want you to build it around a dress first so I can see whether the whole look feels cohesive. On Princess Polly, find a black short party dress in size Medium that’s actually available for delivery, open the product page so you can verify the size and delivery status, and leave that tab open because I want the dress to be the anchor piece. Then go to DemoniaCult and find a black Mary Jane shoe that would work with that dress, making sure size 9 or 9.5 is available on the actual product page, and open the best option in its own tab so I can compare the vibe side by side with the dress; please note exactly which of those sizes you found in stock. After that, go to Edikted and pick a dark brown leather oversized jacket without a hood to use as the outer layer, and open the actual product page so I can visually confirm it fits the look. 
At the end, send me a short outfit summary with the product names and links for all three items, include the dress and shoe prices, and mention the shoe size availability you found on DemoniaCult.", + "website": "https://www.google.com", + "reference_length": 6, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Identify one black short party dress on Princess Polly in size Medium that is available for delivery, and capture its product name, price, and link.", + "verification": "Grader can confirm a Princess Polly product page is open showing a black short party dress with Medium selected or available and delivery availability visible, along with the recorded name, price, and URL.", + "weight": 0.24 + }, + "R2": { + "requirement": "Use the Princess Polly dress as the anchor item for the outfit and keep its product page open for reference.", + "verification": "Grader can confirm the dress page remains open in a tab and that later selections are described as pairing with that dress.", + "weight": 0.11 + }, + "R3": { + "requirement": "Identify at least one black Mary Jane shoe on DemoniaCult that is available in size 9 or 9.5, and capture its product name, price, link, and which size is available.", + "verification": "Grader can confirm a DemoniaCult product page is open showing a black Mary Jane shoe with size 9 or 9.5 available, plus the recorded name, price, URL, and size availability.", + "weight": 0.24 + }, + "R4": { + "requirement": "Keep the chosen DemoniaCult shoe open in its own tab as a pairing for the dress.", + "verification": "Grader can confirm the shoe product page remains open in a separate tab and is referenced as the selected pairing with the dress.", + "weight": 0.09 + }, + "R5": { + "requirement": "Identify one dark brown leather oversized jacket without a hood on Edikted, and capture its product name and link from the actual product page.", + "verification": "Grader can confirm an Edikted product page is open showing a dark brown 
leather oversized jacket with no hood indicated visually or in the product details, along with the recorded name and URL.", + "weight": 0.17 + }, + "R6": { + "requirement": "Provide a final outfit summary that includes all three selected items with product names and links, includes the dress and shoe prices, and explicitly states the shoe size availability found.", + "verification": "Grader can confirm the final response lists the Princess Polly dress, DemoniaCult shoe, and Edikted jacket with names and links, includes the dress and shoe prices, and clearly notes whether size 9 or 9.5 was available.", + "weight": 0.15 + } + }, + "categories": [ + "Ecommerce & Shopping > Ecommerce and Shopping - Other", + "Lifestyle > Fashion and Apparel" + ], + "num_categories": 2 + }, + { + "task_id": "3e106825aab3db868d3b94eb1bd594f9fd4a89be", + "confirmed_task": "I’m thinking about switching my work setup to a Chromebook, and before I do that I want to sanity-check whether my main tools will actually work. First, please go to Hubstaff’s official site and find their guidance about using the desktop app on a Chromebook or ChromeOS, because I need to know whether this would be a normal install or whether I’d have to use some Chrome extension or browser-based workaround instead. Open the actual Hubstaff help or support page that answers this and leave it open so I can look at the wording myself. If it turns out Chromebook use depends more on browser tools, then go to Brave’s official site and find the real Chromebook install page or instructions page for Brave so I have the official setup link ready; open that in its own tab too so I can compare both pages side by side. After that, head to Chrome’s official developer documentation and look up DevTools AI assistance, then summarize how I would get started with it and what data it uses, since that may matter more if I’m working mostly in the browser on a Chromebook. Please keep the official Chrome docs page open as proof. 
Finally, use Google to find a clear troubleshooting page for the annoying issue where Chrome keeps opening in Guest mode instead of my normal profile, and click through to the actual help page or forum post that gives step-by-step fixes so I have a recovery reference if this Chromebook browser setup gets weird. Leave that troubleshooting page open too, and then give me a clean summary of the Hubstaff Chromebook compatibility conclusion, the supported ChromeOS option Hubstaff mentions, the official Brave Chromebook install link, the DevTools AI getting-started and data-use summary, and the Guest mode fix steps you found.", + "website": "https://www.google.com", + "reference_length": 7, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Correctly state Hubstaff’s official Chromebook compatibility conclusion for the desktop app.", + "verification": "Grader can confirm the final answer matches the wording or meaning shown on an open Hubstaff help/support page about Chromebook or ChromeOS support.", + "weight": 0.24 + }, + "R2": { + "requirement": "Identify the supported Chrome OS option Hubstaff describes for Chromebook users.", + "verification": "Grader verifies the supported option is visible on the Hubstaff page left open, such as extension-based or browser-based tracking guidance for ChromeOS.", + "weight": 0.16 + }, + "R3": { + "requirement": "Provide the official Brave Chromebook install or instructions page URL from brave.com.", + "verification": "Grader confirms a Brave-owned page is open in its own tab and the returned URL points to the official Brave Chromebook installation/download instructions.", + "weight": 0.14 + }, + "R4": { + "requirement": "Summarize how to get started with Chrome DevTools AI assistance using official Chrome documentation.", + "verification": "Grader checks the open developer.chrome.com page and confirms the summary includes setup or enablement steps described there.", + "weight": 0.16 + }, + "R5": { + "requirement": 
"Explain what data DevTools AI assistance uses according to official Chrome documentation.", + "verification": "Grader confirms the answer’s data-use description matches the official developer.chrome.com documentation left open.", + "weight": 0.12 + }, + "R6": { + "requirement": "Provide a clear ordered troubleshooting sequence for fixing Chrome opening in Guest mode and restoring the normal profile, based on a source found through Google.", + "verification": "Grader confirms the final troubleshooting page is open from a Google result and that the returned steps reflect the source’s actionable sequence.", + "weight": 0.12 + }, + "R7": { + "requirement": "Return all requested outputs together in one final response: Hubstaff conclusion, supported ChromeOS option, Brave link, DevTools AI summary, and Guest mode fix steps.", + "verification": "Grader checks that the final response includes every requested component and that the referenced pages remain open as browser proof.", + "weight": 0.06 + } + }, + "categories": [ + "Computers Electronics and Technology > Computers Electronics and Technology - Other", + "Computers Electronics and Technology > Programming and Developer Software" + ], + "num_categories": 2 + }, + { + "task_id": "4246dec196c9a3382b4224c7ec3a34a20be9f43f", + "confirmed_task": "I’m trying to put together a budget-friendly iPad Air M3 bundle without overpaying, so could you start on Target and look through the actual iPad Air M3 product pages to see which listed configuration is cheapest right now, including any sale price or visible discount text, and leave the cheapest product page open so I can look at the photos and storage/color details myself. 
Once you know which Target deal is the lowest, use that as the anchor for the bundle and go to Best Buy to find an Apple Magic Keyboard listing for the iPad Pro 13-inch that’s specifically open-box or refurbished and in Good condition, because I want a lower-cost keyboard option for a large Apple tablet setup; open that listing in its own tab so I can compare it side by side with the iPad. After that, check Amazon for a protective case for the iPad Air M3 that explicitly says it supports portrait-mode stand positioning, and open the actual product page so you can quote the wording that proves portrait support and I can verify the listing details on screen. To round things out, hop over to Slickdeals and see what the current featured top deal for iPads is, then tell me the deal title and price so I can decide whether it makes more sense to buy this bundle now or wait for a broader tech bargain. In the end, give me a short bundle summary anchored on the cheapest Target iPad Air M3 you found, with the keyboard, the portrait-capable case, and the Slickdeals iPad deal.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Identify the lowest current price shown among Target iPad Air M3 product pages, including any visible discount information and the exact configuration it applies to.", + "verification": "Grader confirms the reported price, discount text, and configuration match the cheapest visible Target iPad Air M3 product page left open.", + "weight": 0.32 + }, + "R2": { + "requirement": "Provide one Best Buy listing for an Apple Magic Keyboard compatible with iPad Pro 13-inch that is open-box or refurbished and specifically in Good condition, along with its current price.", + "verification": "Grader confirms the Best Buy tab shows a qualifying listing with compatibility for iPad Pro 13-inch, condition marked Good, and the reported price.", + "weight": 0.24 + }, + "R3": { + "requirement": 
"Provide one Amazon protective case for iPad Air (M3), include the product name, and quote listing text that explicitly confirms portrait-mode stand positioning support.", + "verification": "Grader confirms the Amazon product page is open and contains the quoted text explicitly indicating portrait-mode stand support.", + "weight": 0.24 + }, + "R4": { + "requirement": "Report the current featured top deal on Slickdeals for iPads, including its title and displayed price.", + "verification": "Grader confirms the reported deal title and price match the currently featured top deal for iPads visible on Slickdeals.", + "weight": 0.1 + }, + "R5": { + "requirement": "Present the final answer as a budget-oriented bundle summary anchored on the cheapest Target iPad Air M3 deal, incorporating the keyboard option, portrait-capable case, and Slickdeals reference.", + "verification": "Grader confirms the final summary clearly uses the cheapest Target iPad Air M3 as the bundle anchor and includes all required components in a budget comparison framing.", + "weight": 0.1 + } + }, + "categories": [ + "Computers Electronics and Technology > Consumer Electronics", + "Ecommerce & Shopping > Ecommerce and Shopping - Other" + ], + "num_categories": 2 + }, + { + "task_id": "b6b8ad71aa3112840790066d7d62b498babdfa5c", + "confirmed_task": "I’m trying to decide whether driving this week is a bad idea, so can you build me a quick weather risk snapshot that starts with what it feels like right now and then widens out to the bigger trouble spots? First, on Google, search for Baltimore, Maryland weather and grab the current temperature plus the plain-English condition like cloudy, sunny, rain, or whatever it says, just so I have a baseline for home conditions. 
Then go to Wunderground and look up Syracuse, New York, and check the 10-day/7-day style forecast to find the lowest temperature expected over the next 7 days, including which day it happens, because I want to compare that colder destination against Baltimore. After that, use the National Weather Service forecast page for the Rittman, Ohio area near Marshallville and tell me what the current forecast says and whether there are any active alerts posted there, since that would really affect an Ohio leg of the drive; please open the actual forecast page and leave it visible so I can see the alert area and forecast text myself. Finally, go to the NWS Mount Holly page, find the winter forecast graphic, and report the snowfall amount shown there so I can tell whether the Mid-Atlantic part looks like a nuisance event or something more serious; if the graphic opens separately, leave that tab open too so I can look at the map. In the end, send me a short location-by-location summary with the key weather risk for Baltimore, Syracuse, Rittman/Marshallville, and the Mount Holly region.", + "website": "https://www.google.com", + "reference_length": 6, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Report Baltimore, Maryland’s current temperature and current weather conditions from Google.", + "verification": "Grader can confirm the answer against the Google weather module showing Baltimore weather with a numeric temperature and condition label.", + "weight": 0.18 + }, + "R2": { + "requirement": "Report the lowest forecasted temperature in Syracuse, New York over the next 7 days from Wunderground, including the day it occurs.", + "verification": "Grader can verify the selected low and day on the Syracuse forecast page in Wunderground’s multi-day forecast view.", + "weight": 0.22 + }, + "R3": { + "requirement": "Report the current National Weather Service forecast for the Rittman, Ohio area near Marshallville.", + "verification": "Grader can confirm the forecast 
wording on the forecast.weather.gov page for the specified area.", + "weight": 0.18 + }, + "R4": { + "requirement": "State whether any active weather alerts are posted for the Rittman, Ohio area near Marshallville.", + "verification": "Grader can verify the presence or absence of alert banners, watches, warnings, or advisories on the same NWS forecast page left open by the agent.", + "weight": 0.14 + }, + "R5": { + "requirement": "Report the predicted snowfall amount shown in the winter forecast graphic on the NWS Mount Holly page.", + "verification": "Grader can confirm the snowfall amount directly from the winter forecast graphic or image tab left open from the Mount Holly page.", + "weight": 0.18 + }, + "R6": { + "requirement": "Return the findings as a short summary organized by location or region, with a brief key weather risk for each one.", + "verification": "Grader can check that the final response includes Baltimore, Syracuse, Rittman/Marshallville, and the Mount Holly region, each with the requested weather detail and a concise risk statement.", + "weight": 0.1 + } + }, + "categories": [ + "Science and Education > Weather", + "Travel and Tourism > Travel and Tourism - Other" + ], + "num_categories": 2 + }, + { + "task_id": "b1bd700090c23df9e9f6b7b9557ac418df602b8d", + "confirmed_task": "I’m trying to put together a realistic poetry submission plan for this month, so could you help me look up a couple of places I might actually submit to and then pair that with some funding opportunities? Start on Google and find the American Poetry Journal submission guidelines, then open the actual guidelines page and tell me how they want submissions sent, whether there’s a fee, and if they mention a reading period, because I want to know if it’s something I can act on right away. 
After that, still using Google, find the online submissions page for Pidgeonholes and open the direct submissions page itself in a separate tab so I can compare the two options side by side; please give me the exact submission URL and leave that tab open. Once you’ve got those two submission outlets, go to Poets & Writers and find at least three writing contests, awards, or grants that could help support my submission plan, and for each one note the opportunity name and the application deadline, or clearly say if no deadline is listed. In the end, send me a short summary with the American Poetry Journal submission method, fee, and any reading period you found, the direct Pidgeonholes submissions link, and the three Poets & Writers opportunities with their deadlines.", + "website": "https://www.google.com", + "reference_length": 4, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Identify the American Poetry Journal submission guidelines and report the required submission method, any stated fee, and any reading period information if present.", + "verification": "Grader can confirm the agent opened the actual American Poetry Journal guidelines page and the final response includes the submission method plus fee and reading period details or a clear note if not stated.", + "weight": 0.35 + }, + "R2": { + "requirement": "Provide the direct online submissions URL for Pidgeonholes and keep the submissions page open in its own tab.", + "verification": "Grader can confirm a live Pidgeonholes submissions page is open in a separate browser tab and the exact URL is included in the final response.", + "weight": 0.2 + }, + "R3": { + "requirement": "List at least three writing contests, awards, or grants from Poets & Writers, each with the opportunity name and application deadline, or explicitly note if no deadline is listed.", + "verification": "Grader can confirm the opportunities on Poets & Writers pages and check that the final response includes three names with 
corresponding deadlines or clear no-deadline notes.", + "weight": 0.3 + }, + "R4": { + "requirement": "Deliver a concise final poetry submission plan summary combining the American Poetry Journal details, the direct Pidgeonholes submissions link, and the three Poets & Writers opportunities with deadlines.", + "verification": "Grader can verify the final answer consolidates all required findings into one concise summary without omitting any requested fields.", + "weight": 0.15 + } + }, + "categories": [ + "Arts & Entertainment > Books and Literature", + "Finance > Finance - Other" + ], + "num_categories": 2 + }, + { + "task_id": "fb3f6eb23fad9b18c6c612d213d32ea40d891092", + "confirmed_task": "I’m thinking about signing up for a couple of online research-study platforms, but before I hand over my info I want a practical sense of what the participant experience is actually like. Please start on Google and look up Respondent.io, then open the actual Respondent site and any clearly relevant public help or participant pages so you can tell me, in plain English, how it works for participants — especially how someone signs up, builds a profile, finds or qualifies for studies, and how payment is handled. Keep the most useful Respondent page open so I can glance at it later. Then go to Terac’s site and figure out how that platform works for participants too, with extra attention to how someone gets set up to take part in studies, because I want to compare whether Terac’s onboarding feels simpler or more involved than Respondent’s. If you find a page that explains joining or participation, leave that open in its own tab as proof. After that, switch over to SurveyMonkey and open the screener page and verify it loads with visible survey questions so I can see what an actual participant flow feels like end to end, and leave that final page visible. 
When you’re done, give me a short comparison of Respondent versus Terac, say clearly whether Terac seems easier or more involved to get started with, and confirm that the SurveyMonkey screener page loaded with visible survey questions.", + "website": "https://www.google.com", + "reference_length": 4, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Summarize how Respondent.io works for participants, including signup/profile creation, how users find or qualify for studies, and how payment is handled, based on publicly available information from official or clearly relevant pages.", + "verification": "Grader confirms the final response includes all four elements and that a relevant Respondent page is open or was visited from Google search results.", + "weight": 0.3 + }, + "R2": { + "requirement": "Summarize how Terac works and specifically explain how participants get set up to take part in studies using information from Terac’s public site.", + "verification": "Grader confirms the final response describes Terac’s platform purpose and participant setup flow, and that a relevant Terac page explaining participation or joining is open or was visited.", + "weight": 0.25 + }, + "R3": { + "requirement": "Directly compare Terac’s onboarding/setup with Respondent’s baseline and state clearly whether Terac appears easier or more involved for participants.", + "verification": "Grader confirms the final response contains an explicit comparison and a clear easier/more involved judgment grounded in the two platform summaries.", + "weight": 0.2 + }, + "R4": { + "requirement": "Open the SurveyMonkey screener page and verify it loads with visible survey questions, leaving the page visible as proof.", + "verification": "Grader can confirm the SurveyMonkey screener page was opened and that visible survey questions or screener content are displayed on the page.", + "weight": 0.25 + } + }, + "categories": [ + "Business and Consumer Services > Business Services", + "Jobs and 
Career > Jobs and Employment" + ], + "num_categories": 2 + }, + { + "task_id": "69782bfcfdb3311496bc9048bf66915b33e692cd", + "confirmed_task": "I’m trying to pick a Pilates place in the Fresno/Clovis area and want something concrete I can actually compare on screen, not just a vague list. Please start on Google and find at least two Pilates studios in Clovis, California that clearly offer classes, then open each studio’s actual schedule or booking page in its own tab and leave those tabs open so I can look at the class calendars myself later. Once you’ve got those Clovis options, broaden it into a short Fresno-area comparison by finding at least three Pilates class options around Fresno or Clovis, with each studio’s real schedule or booking link, because I want to see what nearby choices exist if the Clovis spots don’t fit my schedule. After that, go to Title 29 Fitness’s website and figure out what it offers in Fresno, especially anything relevant to Pilates or group fitness, and capture the class schedule details shown there; if there’s a schedule page or booking flow, open that too so I have visual proof it’s current. 
In the end, give me one concise comparison that includes the two Clovis studios, the broader Fresno-area list, and where Title 29 seems to fit among them based on what you actually saw in the browser.", + "website": "https://www.google.com", + "reference_length": 4, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Identify at least two Clovis, California Pilates studios that offer classes and provide each studio’s name plus a direct class schedule or booking page link.", + "verification": "Grader can confirm two distinct Clovis studios are listed and that their schedule or booking pages are open in separate tabs or directly referenced with valid links.", + "weight": 0.3 + }, + "R2": { + "requirement": "Provide a broader Fresno-area list of at least three Pilates class options, each with the studio name and a direct schedule or booking page link.", + "verification": "Grader can verify at least three Fresno-area options are named and each includes a schedule or booking link visible from the opened pages or final response.", + "weight": 0.3 + }, + "R3": { + "requirement": "Look up Title 29 Fitness in Fresno and capture what it offers along with the class schedule details shown on its website.", + "verification": "Grader can confirm the response includes offerings described from title29fitness.com and schedule details taken from the visible site pages or booking flow.", + "weight": 0.2 + }, + "R4": { + "requirement": "End with one concise comparison that includes the Clovis options, the wider Fresno-area list, and an explanation of where Title 29 Fitness fits among them.", + "verification": "Grader can verify the final response contains a brief synthesis comparing all gathered options and explicitly situating Title 29 relative to the Clovis and Fresno-area choices.", + "weight": 0.2 + } + }, + "categories": [ + "Health > Nutrition Diets and Fitness", + "Hobbies and Leisure > Hobbies and Leisure - Other" + ], + "num_categories": 2 + }, + { + "task_id": 
"67ad95421a303ad78cfcd8c3f5a7f6668d2c6a75", + "confirmed_task": "I’m helping a family member shop for an affordable used Toyota and want a realistic comparison across a couple of car sites before we decide what to pursue. On Edmunds, please search around Augusta, Georgia and find the cheapest used Toyota RAV4 listing that has both AWD and heated seats, then open the actual vehicle listing so you can confirm those features on the page and note the price, year, mileage, dealer or seller, and anything else basic that stands out; leave that listing open in its own tab so I can look at the photos and details later. Then, using that RAV4 as the benchmark for what the market looks like, go to Cars.com and search for any used Toyota within 50 miles of Augusta, GA priced at $10,000 or less, and open one matching listing that seems like a good budget reference so I can see what a lower-cost Toyota option looks like on another marketplace; keep that listing open too so I can compare the two tabs side by side. After that, on CarGurus, pull up the comparison details for the Toyota Camry XLE AWD and the Mazda3 Turbo Hatchback and capture the key specs for each — engine, horsepower, drivetrain, fuel economy, and MSRP — because I want to know whether sticking with Toyota’s AWD choices makes more sense than considering a non-Toyota AWD alternative. 
In the end, send me a concise summary with the Edmunds RAV4 listing details, the Cars.com budget Toyota listing details, and the side-by-side spec comparison.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Find the lowest-priced used Toyota RAV4 on Edmunds near Augusta, GA that includes both AWD and heated seats, and open the actual listing page.", + "verification": "Grader can confirm an Edmunds vehicle detail page is open for a used Toyota RAV4 near Augusta with AWD and heated seats visible in the listing details or features, and that it is the lowest-priced qualifying result found.", + "weight": 0.35 + }, + "R2": { + "requirement": "Capture the Edmunds RAV4 listing’s basic details including price, year, mileage, and dealer or seller information.", + "verification": "Grader can verify the reported Edmunds details against the open listing page fields for price, model year, mileage, and dealer or seller name.", + "weight": 0.2 + }, + "R3": { + "requirement": "On Cars.com, find one used Toyota within 50 miles of Augusta, GA priced at $10,000 or less and open the actual listing page.", + "verification": "Grader can confirm a Cars.com vehicle listing page is open and that the listing meets the Toyota, distance, and price constraints shown in the search or listing context.", + "weight": 0.2 + }, + "R4": { + "requirement": "On CarGurus, provide a side-by-side comparison of the Toyota Camry XLE AWD and Mazda3 Turbo Hatchback covering engine, horsepower, drivetrain, fuel economy, and MSRP.", + "verification": "Grader can verify the extracted specs against the CarGurus comparison or model pages for both vehicles.", + "weight": 0.15 + }, + "R5": { + "requirement": "Return a concise final summary that includes the Edmunds RAV4 listing, the Cars.com budget Toyota listing, and the Camry XLE AWD versus Mazda3 Turbo Hatchback spec comparison.", + "verification": "Grader can confirm the final 
response includes all three required components with the relevant details from the prior steps.", + "weight": 0.1 + } + }, + "categories": [ + "Vehicles > Makes and Models", + "Ecommerce & Shopping > Ecommerce and Shopping - Other" + ], + "num_categories": 2 + }, + { + "task_id": "5632294e494e6e86eb94739235bfa2373b868868", + "confirmed_task": "I’m getting ready to do a full DIY front and rear brake job on my 2020 Chevrolet Traverse with the 3.6L V6, so could you use RockAuto to look up OEM-style front and rear brake pad and rotor kits for that exact vehicle and jot down the key details like brand, part line, what each kit includes, and anything that helps me tell the options apart? If there are separate front and rear kits, open the actual product pages or info popups so I can visually compare them, and leave the most relevant RockAuto results open in their own tabs. Once you’ve got the parts figured out, go to the Haynes US site and find the repair manual that would actually help with brake pad and rotor replacement for this Traverse, and grab the manual title, link, and coverage years so I know it matches my SUV. Then check Amazon for a hose clamp tightening tool that looks suitable to keep nearby for the job, give me one solid option with its current price, and open the product page so I can see the photos and reviews for myself. 
In the end, send me a short summary with the RockAuto brake kit details, the Haynes manual name and link, and the Amazon tool name and price.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "RockAuto is used to identify OEM-style front and rear brake pad and rotor kit options for a 2020 Chevrolet Traverse 3.6L V6, with key product details captured for the available options.", + "verification": "Grader can confirm the RockAuto vehicle selection and brake kit listings, plus visible product/info pages or tabs showing front and rear kit details such as brand, line, and included parts.", + "weight": 0.4 + }, + "R2": { + "requirement": "Relevant RockAuto front and rear kit pages or info views are opened and left available for visual comparison.", + "verification": "Grader can confirm multiple RockAuto tabs, product pages, or info popups remain open showing the chosen front and rear kit options.", + "weight": 0.1 + }, + "R3": { + "requirement": "A Haynes manual suitable for a 2020 Chevrolet Traverse brake pad and rotor replacement is found, with the manual title, link, and coverage years recorded.", + "verification": "Grader can confirm the Haynes manual page is open and visibly shows the manual title and coverage information matching the Traverse.", + "weight": 0.2 + }, + "R4": { + "requirement": "One suitable hose clamp tightening tool is found on Amazon, with its product name and current price recorded.", + "verification": "Grader can confirm the Amazon product page is open and shows the selected tool name and visible price.", + "weight": 0.15 + }, + "R5": { + "requirement": "The final response provides a short combined summary of the RockAuto brake kit details, the Haynes manual name and link with coverage years, and the Amazon tool name and price.", + "verification": "Grader can compare the final written summary against the information visible on the RockAuto, Haynes, and Amazon pages.", + 
"weight": 0.15 + } + }, + "categories": [ + "Vehicles > Makes and Models", + "Ecommerce & Shopping > Ecommerce and Shopping - Other" + ], + "num_categories": 2 + }, + { + "task_id": "eb8a01554a84cf5d16a84a766d0f6cfb55d33c81", + "confirmed_task": "I’m planning a holiday party in Minneapolis and want a quick shortlist of bar caterers that actually look local and usable. Please start on The Knot and look specifically for Minneapolis-area wine or liquor bar service caterers, then open the actual vendor listings in separate tabs so I can compare them visually, and pull together at least five options with each business name and city/state. If Surdyk’s shows up in that shortlist, go to Surdyk’s Catering and look through their site to see whether they feel more full-service than the others, then summarize at least three catering services or service types they offer, including any food and beverage options you can verify on the page, and leave the Surdyk’s page open so I can glance at it later. After that, because I may want a nonalcoholic menu item to pair with whichever caterer seems best, use Google to find a pressure-cooker or Instant Pot mulligatawny soup recipe, open the actual recipe page, and give me the ingredient list plus the basic cooking steps from that one recipe. 
Please return everything as one concise planning summary so I can compare the caterers, any Surdyk’s details if relevant, and the soup idea all in one place.", + "website": "https://www.google.com", + "reference_length": 4, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Provide at least five Minneapolis-area wine or liquor bar service caterers sourced from The Knot, each with business name and location in city/state format.", + "verification": "Grader confirms at least five vendor names and matching city/state details from The Knot listings, with evidence that vendor pages were opened or reviewed in browser tabs.", + "weight": 0.4 + }, + "R2": { + "requirement": "If Surdyk’s is included in the The Knot shortlist, summarize at least three Surdyk’s Catering full-service offerings or service types from surdykscatering.com, including food and/or beverage offerings.", + "verification": "Grader confirms Surdyk’s appeared in the shortlist and that the summary includes three verified offerings visible on Surdyk’s site; Surdyk’s page remains open as browser proof.", + "weight": 0.25 + }, + "R3": { + "requirement": "Provide one pressure-cooker or Instant Pot mulligatawny soup recipe found via Google, including the ingredient list and basic cooking steps from a single recipe page.", + "verification": "Grader confirms Google was used to reach a recipe page and that the final response includes ingredients and basic steps consistent with one visible recipe source.", + "weight": 0.2 + }, + "R4": { + "requirement": "Return all requested information as one concise planning summary combining the caterer shortlist, any applicable Surdyk’s comparison details, and the soup recipe information.", + "verification": "Grader checks that the response is a single concise summary covering all required outputs without omitting any applicable section.", + "weight": 0.15 + } + }, + "categories": [ + "Food and Drink > Beverages", + "Community and Society > Holidays and Seasonal 
Events" + ], + "num_categories": 2 + }, + { + "task_id": "ada70bfe81da1cd33bec47d79a9d279d7734a686", + "confirmed_task": "I’m trying to plan a Yorkshire family day out around Christmas and want to compare two festive options properly before I recommend one. Could you start on the official Stockeld Park site and open their Winter or Christmas ticket page, then the main activities page if needed, and pull together exactly what’s included with a standard festive ticket so I can see whether it feels like a full day out for kids; if there are photos or activity sections on the page, open the main ticket page and the activities/details page in separate tabs and leave them open so I can glance at them later. Then use Google to find the current Harrogate Christmas Funland page or official event listing, open the actual event page, and summarize what’s included there too, along with the location and the event dates, because I want to compare whether it sounds more substantial than Stockeld Park; if you find more than one relevant result, open the most official-looking listing in its own tab and verify it’s live before using it. After that, go to the official York Maze site and pull together the practical visitor details for using it as a backup daytime activity nearby, especially where it is, the opening times or seasonal opening info, and any important ticket or visit-planning notes like booking ahead, age guidance, or whether it’s seasonal, and leave the key visitor info page open as well. 
In the end, give me a concise side-by-side comparison of Stockeld Park versus Harrogate Christmas Funland, then a short recommendation on which festive option seems better for a family day out and whether York Maze sounds like a realistic backup plan.", + "website": "https://www.google.com", + "reference_length": 6, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Accurately summarize the activities included with Stockeld Park Winter/Christmas tickets using the official Stockeld Park pages.", + "verification": "Grader can confirm the summary matches visible included activities on the open Stockeld Park ticket/details tabs.", + "weight": 0.24 + }, + "R2": { + "requirement": "Find and use a live, relevant Harrogate Christmas Funland page or official listing and summarize what is included in the experience.", + "verification": "Grader can see an open Harrogate Christmas Funland page reached via Google and verify the included features against the visible listing content.", + "weight": 0.2 + }, + "R3": { + "requirement": "Include Harrogate Christmas Funland’s event location and event dates.", + "verification": "Grader can verify the location and dates directly on the open event/listing page.", + "weight": 0.14 + }, + "R4": { + "requirement": "Summarize York Maze visitor details including where it is, opening times or seasonal opening information, and important ticket or visit-planning notes from the official York Maze site.", + "verification": "Grader can confirm these details on the open York Maze visitor information page.", + "weight": 0.2 + }, + "R5": { + "requirement": "Present Stockeld Park and Harrogate Christmas Funland as a concise side-by-side comparison focused on what is included and overall suitability for a family festive outing.", + "verification": "Final response clearly compares both attractions using findings from Steps 1 and 2 rather than listing them separately.", + "weight": 0.12 + }, + "R6": { + "requirement": "Provide a short 
recommendation on which festive attraction seems better and whether York Maze works as a backup daytime activity.", + "verification": "Final response includes a reasoned recommendation grounded in the gathered details from all three sites.", + "weight": 0.1 + } + }, + "categories": [ + "Travel and Tourism > Tourist Attractions", + "Community and Society > Holidays and Seasonal Events" + ], + "num_categories": 2 + }, + { + "task_id": "140960bb7293bdeeb6bcc60931681cb9b815351b", + "confirmed_task": "I'm trying to plan a really simple errand loop around Chapel Hill and Carrboro, so could you start on Google and find at least three public little free pantries or community food boxes in or very close to Chapel Hill/Carrboro, then open the actual map or listing pages for each one in separate tabs so I can visually confirm they're real places and still look active. Once you have those, go to Publix and check the current weekly BOGO deals, and from that ad pick the deals that would work for a carnivore-style dinner, meaning meat, seafood, cheese, eggs, or other animal-based items only, because I want to turn the pantry run into a quick grocery stop too. From those BOGO items, choose one specific dinner pairing made only from the qualifying deals and leave the weekly ad or product pages open so I can look at the prices and packaging myself. After that, go to Bisou Bisou's site and find one cocktail on the menu that is actually green, then open the menu page and tell me the drink name and ingredients so I have an optional treat stop after the errands. 
Please give me the pantry locations with addresses or clear location descriptions, the exact Publix BOGO items you used and the dinner pairing, plus the green cocktail name and ingredients.", + "website": "https://www.google.com", + "reference_length": 4, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Find at least three public food box or mini pantry locations in or near Chapel Hill/Carrboro and provide an address or clear location description for each.", + "verification": "Grader can confirm three distinct pantry or food box locations from Google results or map/listing pages, with separate tabs or visible listing details showing each location.", + "weight": 0.35 + }, + "R2": { + "requirement": "Identify current Publix BOGO items that fit a carnivore diet.", + "verification": "Grader can confirm the listed items appear in the current Publix weekly BOGO ad or product pages and that the items are animal-based foods such as meat, seafood, cheese, eggs, or similar.", + "weight": 0.25 + }, + "R3": { + "requirement": "Propose one dinner pairing made only from the carnivore-diet-friendly Publix BOGO items identified.", + "verification": "Grader can verify that every component of the proposed dinner pairing comes directly from the qualifying BOGO items found in Step 2.", + "weight": 0.2 + }, + "R4": { + "requirement": "Identify one green cocktail from Bisou Bisou's cocktail menu and provide its name and ingredients.", + "verification": "Grader can confirm the cocktail appears on the Bisou Bisou menu page and that the response includes the exact drink name and ingredient list from the site.", + "weight": 0.2 + } + }, + "categories": [ + "Community and Society > Philanthropy", + "Reference Materials > Maps" + ], + "num_categories": 2 + }, + { + "task_id": "10548585c3214aa1a15f7ceef8aa4fde0c2fcdf7", + "confirmed_task": "I’m putting together a quick set of Chromebook help notes for someone who keeps asking me whether they can use Firefox instead of Chrome and 
whether they can still get to their saved Apple passwords when they’re in Chrome, so could you check a few things in a real browser for me? Start on Mozilla’s official Firefox site and open the actual Chromebook or ChromeOS instructions so you can confirm whether Firefox can really be installed there and what Mozilla says the install method is; leave that page open because I want the official wording as a reference. Then go to the Chrome Web Store and find the official iCloud Passwords extension from Apple, open the actual listing page, and verify from the page itself that it’s for accessing iCloud passwords in Chrome; keep that tab open too so I can see the publisher and listing URL. Since these notes should also cover a Google Docs issue they run into all the time, use Google Search to look up reliable troubleshooting for images in Google Docs that show an exclamation mark or refuse to load, then open a useful result and pull out the recommended fixes. After that, open the UserTesting contributor sign-in page so I can confirm a normal login screen is reachable for a site where saved credentials might matter, then in another tab open Patreon’s homepage just to verify ordinary browsing works there too and leave both tabs open so I can compare them. Finally, go to YouTube, open the video titled \"Youtube Rewind 2011,\" start playback, and tell me what you see in the first moments so I know media playback works in the browser. 
At the end, give me a concise summary of what you confirmed on each site, including the Firefox Chromebook answer, the iCloud Passwords extension name and listing URL, the Google Docs image fixes, and whether UserTesting, Patreon, and YouTube all behaved normally.", + "website": "https://www.google.com", + "reference_length": 7, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "The Mozilla Firefox Chromebook/ChromeOS instructions page is opened and the agent correctly confirms whether Firefox can be installed on Chromebook, including Mozilla’s stated install/download method.", + "verification": "Grader can see an official Mozilla Firefox/Support page about Chromebook or ChromeOS open, and the final answer states the installability outcome plus the method described on that page.", + "weight": 0.2 + }, + "R2": { + "requirement": "The official Apple iCloud Passwords extension listing is opened in the Chrome Web Store, and the agent records the exact extension name and listing URL while confirming its purpose is to access iCloud passwords in Chrome.", + "verification": "Grader can see the Chrome Web Store listing page with Apple as publisher or official branding, and the final answer includes the extension name, URL, and purpose.", + "weight": 0.2 + }, + "R3": { + "requirement": "The agent finds troubleshooting guidance for Google Docs images showing an exclamation mark or not loading and summarizes the recommended fixes from a relevant opened result.", + "verification": "Grader can see Google Search results and/or an opened troubleshooting page, and the final answer includes concrete recommended fixes rather than a vague statement.", + "weight": 0.18 + }, + "R4": { + "requirement": "The UserTesting contributor sign-in page is opened and the sign-in screen is confirmed as reachable.", + "verification": "Grader can see a UserTesting sign-in page with login fields or contributor sign-in UI visible, and the final answer explicitly confirms access.", + 
"weight": 0.1 + }, + "R5": { + "requirement": "Patreon’s homepage is opened in its own tab and normal homepage access is confirmed from visible branding or title.", + "verification": "Grader can see Patreon homepage branding or title in the open tab, and the final answer confirms homepage access worked.", + "weight": 0.1 + }, + "R6": { + "requirement": "The YouTube video titled \"Youtube Rewind 2011\" is opened and playback is started successfully.", + "verification": "Grader can see the YouTube watch page with the specified title and a playing state or progressed timestamp, and the final answer describes the first visible moments.", + "weight": 0.1 + }, + "R7": { + "requirement": "A concise final summary covers all six sites and includes the Firefox Chromebook conclusion, iCloud Passwords extension details, Google Docs image troubleshooting, UserTesting sign-in confirmation, Patreon homepage confirmation, and YouTube playback confirmation.", + "verification": "Grader checks the final response for all required site-specific findings with no major omissions.", + "weight": 0.12 + } + }, + "categories": [ + "Computers Electronics and Technology > Computers Electronics and Technology - Other" + ], + "num_categories": 1 + }, + { + "task_id": "fbcfa176b2e1aa42200d4f3adb66dcf0a6ca62ee", + "confirmed_task": "I’m trying to put together a very small monthly subscription budget and want to compare a couple of creator memberships against ChatGPT so I can see what actually fits. Please start on Patreon with Matt and Shane’s Secret Podcast and note the membership tier names shown on their page, then also open the Patreon pages for Matt and Shane’s Secret Podcast and Chris Sain in separate tabs so I can visually compare the available tier names and prices side by side. After that, go to ChatGPT’s pricing page on chatgpt.com and capture the current plan names and prices, because I want to know whether adding ChatGPT would still be realistic alongside just one creator membership. 
To round things out, use Google to get to the actual HellHades membership plans page and check whether any plan specifically mentions interface improvements or automation features, since that kind of perk would make the comparison more meaningful than price alone. Please leave the Patreon tabs and the ChatGPT pricing page open so I can glance at them afterward, and then give me a concise summary that groups the Patreon tiers, ChatGPT pricing, and the HellHades feature note into a simple budget-minded recommendation.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Identify the subscription tier names shown on the Matt and Shane’s Secret Podcast Patreon membership page.", + "verification": "Grader can confirm the agent visited the Patreon page for Matt and Shane’s Secret Podcast and extracted the visible tier names from that page.", + "weight": 0.2 + }, + "R2": { + "requirement": "Record the available membership tiers for both Matt and Shane’s Secret Podcast and Chris Sain on Patreon, including each tier’s name and price as shown on their membership/join pages.", + "verification": "Grader can confirm two Patreon creator pages were opened in separate tabs and that the reported tier names and prices match what is visible on each page.", + "weight": 0.3 + }, + "R3": { + "requirement": "Capture the ChatGPT plan names and prices from the ChatGPT pricing page.", + "verification": "Grader can confirm the chatgpt.com pricing page was opened and that the reported plan names and prices match the visible pricing cards or table.", + "weight": 0.2 + }, + "R4": { + "requirement": "Review the HellHades membership plans page and state whether any plan specifically mentions interface improvements or automation features.", + "verification": "Grader can confirm the agent reached the actual HellHades membership page from Google results and checked the visible plan descriptions for those feature mentions.", + 
"weight": 0.15 + }, + "R5": { + "requirement": "Provide a concise budget-oriented recommendation that combines the Patreon tier comparison, ChatGPT pricing, and the HellHades feature note into a judgment about whether ChatGPT fits alongside one creator membership.", + "verification": "Grader can confirm the final response synthesizes findings from Patreon, ChatGPT, and HellHades into a short recommendation rather than listing raw data only.", + "weight": 0.15 + } + }, + "categories": [ + "Arts & Entertainment > Streaming & Online TV", + "Computers Electronics and Technology > Computers Electronics and Technology - Other", + "Finance > Finance - Other" + ], + "num_categories": 3 + }, + { + "task_id": "c8d7c6136ca692eac6d7532e275d5f8d11ec971b", + "confirmed_task": "I’m trying to put together a really simple morning routine that starts with a quick math refresher and then shifts into beginner yoga I can actually keep using. Please start on Google and search for a video lesson that clearly teaches both explicit and recursive formulas for arithmetic sequences, then open the actual video page and note the title, creator or channel, and URL so I have a study piece to come back to; if it looks solid, leave that tab open for me. After that, go to YouTube and find a morning yoga video that’s right around 20 minutes long, open the actual video page, and tell me the title and duration so I can see whether it feels short enough for a real weekday routine. If that general option seems reasonable, stay on YouTube and specifically check whether Yoga With Adriene has a morning yoga video under 20 minutes that would fit the same need, and confirm from the video page or description whether it’s actually a vinyasa flow style session; please open that in its own tab too so I can compare the two yoga options side by side. 
To round this out, use Google to find at least three approachable or funny yoga instructors who post free classes on YouTube, and for each one give me the instructor’s name, a channel or website link, and a short note on why they seem especially beginner-friendly compared with the yoga videos you found earlier. In the end, I just want a concise resource list for this morning routine, with the key links, and please keep the math video plus the two yoga video tabs open so I can look at them myself.", + "website": "https://www.google.com", + "reference_length": 4, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "A video lesson found via Google is opened that teaches both explicit and recursive formulas for arithmetic sequences, with the title, creator/channel name, and URL recorded.", + "verification": "Grader can confirm a Google search was used and the final open tab is the actual video page showing a relevant title/channel and a valid URL.", + "weight": 0.28 + }, + "R2": { + "requirement": "A YouTube morning yoga video of approximately 20 minutes is opened, with its title and duration provided.", + "verification": "Grader can verify the YouTube video page is open and the visible title and runtime are around 20 minutes.", + "weight": 0.22 + }, + "R3": { + "requirement": "A Yoga With Adriene morning yoga video under 20 minutes is opened in its own tab, and the response correctly confirms yes or no whether it is a vinyasa flow style session.", + "verification": "Grader can confirm the separate YouTube tab is a Yoga With Adriene video under 20 minutes and that the vinyasa determination is supported by visible page text such as the title or description.", + "weight": 0.24 + }, + "R4": { + "requirement": "At least three approachable or funny yoga instructors who post free YouTube classes are listed, each with instructor name, channel or website link, and a brief beginner-friendly rationale tied to the earlier yoga options.", + "verification": "Grader can 
confirm three distinct instructors were found via Google and that each entry includes instructor name, a valid channel or website link, and a comparative note about why the instructor seems approachable for beginners.", + "weight": 0.26 + } + }, + "categories": [ + "Science and Education > Math", + "Health > Nutrition Diets and Fitness" + ], + "num_categories": 2 + }, + { + "task_id": "e15345ed27f1933065af403601876e5f6597a943", + "confirmed_task": "I’m putting together a very simple reading-support mini lesson for a student who does better with easier-to-scan text, and I want you to help me pull the pieces together in the browser. Start on Google and find one online English grammar practice quiz that feels appropriate for about 5th grade, then open the actual quiz page so you can verify it’s really a student-facing practice activity and leave that tab open for me as a reference; I need the quiz title and direct URL. Then use Google again to find a printable worksheet or practice page for an 8th-grade student about basic marketing strategies like product, price, place, and promotion or closely related introductory marketing concepts, because I want it to work as an extension activity after the grammar warm-up; open the real worksheet or resource page in its own tab so I can see that it looks classroom-appropriate and printable, and note the title and URL. After that, go to Slidesgo and pick a fun, classroom-appropriate presentation template that could reasonably hold both the grammar warm-up and the marketing extension in one student lesson deck, and open the template’s actual page so I can see the preview images; please include the template name, URL, and whether it’s available for Google Slides or PowerPoint. 
Finally, go to 10015.io’s bionic reading converter and convert this exact lesson intro into bionic reading format so I can paste it into the first slide of the template you chose: “Today we will warm up with a short grammar quiz, then practice how people use basic marketing strategies like product, price, place, and promotion. Read each direction carefully and do your best.” Please keep the useful tabs open and send me the quiz, worksheet, template, and the converted text in one clean summary.", + "website": "https://www.google.com", + "reference_length": 4, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "A real online English grammar practice quiz appropriate for about 5th grade is found via Google, and the final response includes the quiz title and direct quiz page URL.", + "verification": "Grader can confirm there is an open tab showing the actual quiz page, not just search results, and the reported title/URL match the visible page.", + "weight": 0.28 + }, + "R2": { + "requirement": "A printable worksheet or practice page about basic marketing strategies or a closely related introductory marketing topic for about 8th grade is found via Google, and the final response includes the resource title and access URL.", + "verification": "Grader can confirm there is an open tab showing the actual worksheet/resource page with printable or classroom-use cues, and the reported title/URL match the visible page.", + "weight": 0.28 + }, + "R3": { + "requirement": "One Slidesgo template that is fun and classroom-appropriate for combining both activities into a single lesson deck is selected, and the final response includes the template name, Slidesgo URL, and use/download option.", + "verification": "Grader can confirm the Slidesgo template detail page is open with visible preview images and that the named template, URL, and Google Slides/PowerPoint option match the page.", + "weight": 0.22 + }, + "R4": { + "requirement": "The provided lesson intro is converted on 
10015.io into bionic reading format and the full converted text is returned.", + "verification": "Grader can confirm the converter page shows transformed output corresponding to the provided passage and that the returned text matches the visible converted result.", + "weight": 0.22 + } + }, + "categories": [ + "Science and Education > Education" + ], + "num_categories": 1 + }, + { + "task_id": "38039bd8d8469c245faab531cb508c3c975c4869", + "confirmed_task": "I’m putting together a simple hummingbird-themed flyer draft and want you to help me gather a few references in a practical order so I can actually see what might work together on screen. First, on Google Images, search for a photo of a hummingbird flying among flowers in a sunlit garden and open the actual image result that feels strongest as the main visual reference, then keep that image page open in its own tab and save the image result URL for me. Once you have that nature image as a reference point, go to Canva’s Templates page, search for flyer templates, and pick two template names that would suit a bright, colorful garden-style hummingbird flyer; please open each template in a separate tab too so I can compare the layouts visually later. After that, use Google to find a simple CSS example for styling an HTML unordered list with ul and li selectors, including an example that changes the bullet style with list-style-type, because I may turn the flyer details into a small webpage and want a clean bulleted section style. Finally, go to Purdue OWL and find the APA guidance for citing a PowerPoint presentation so I know how to credit it properly if I turn this flyer concept into slides. 
At the end, send me the hummingbird image result URL, the two Canva template names, the CSS example you found, and a concise summary of the Purdue OWL APA citation guidance, and leave the image tab plus the two Canva template tabs open so I can look at them.", + "website": "https://www.google.com", + "reference_length": 4, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Provide one Google Images result URL for a photo that matches a hummingbird flying among flowers in a sunlit garden, with the selected image page left open.", + "verification": "Grader confirms the final response includes a Google Images result URL and the browser shows the chosen hummingbird image page open in a tab with imagery matching the described scene.", + "weight": 0.3 + }, + "R2": { + "requirement": "Provide the names of two Canva flyer templates that plausibly suit a bright hummingbird garden flyer, with each template opened in its own tab.", + "verification": "Grader confirms two template names are listed in the response and corresponding Canva template pages are open in separate tabs showing flyer layouts.", + "weight": 0.25 + }, + "R3": { + "requirement": "Provide a basic CSS example for styling an unordered list using ul and li selectors, including a demonstration of list-style-type.", + "verification": "Grader confirms the response includes CSS code with ul/li styling and an explicit use of list-style-type to change bullet appearance.", + "weight": 0.2 + }, + "R4": { + "requirement": "Provide Purdue OWL APA guidance for citing a PowerPoint presentation, including the key citation rules in concise form.", + "verification": "Grader confirms the response summarizes citation guidance drawn from Purdue OWL and includes the essential APA formatting elements for a PowerPoint presentation citation.", + "weight": 0.25 + } + }, + "categories": [ + "Hobbies and Leisure > Photography", + "Arts & Entertainment > Visual Arts and Design" + ], + "num_categories": 2 + }, + { + "task_id": 
"4d6c838dec27532db3f999755cebc1732f4cbe8b", + "confirmed_task": "I’m putting together a quick note for a friend who does front-end web work on a Mac and is also dealing with a few annoying Apple-device issues, so could you help me verify everything in a browser? First, go to JetBrains and figure out which IDE they position specifically for editing and organizing web code, because that’s the one I want to recommend, and open the actual product page so I can see that you’re on the right tool. Then use Google to look into the problem where HomePods randomly start playing Apple Music and suddenly jump in volume, and base that on public discussions or support threads so we have a likely explanation plus at least one practical fix. After that, still using Google, find one reliable way to share or mirror an iPhone screen to another device like a TV or Mac, since that could help my friend demonstrate the issue, and open the source page in its own tab. Finally, use Google again to find how to switch an iPad from the floating mini keyboard back to the full-size keyboard, ideally from an Apple support page or another clearly trustworthy source, and leave that page open too so I can glance at the exact instructions. 
Once you’ve checked all of that, send me one compact note that names the JetBrains IDE and includes the HomePod explanation and fix, the iPhone mirroring method with steps, and the iPad keyboard fix.", + "website": "https://www.google.com", + "reference_length": 6, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Correctly identify the JetBrains IDE intended for editing and organizing web code and name it as the recommendation.", + "verification": "Grader can confirm the browser is on the relevant JetBrains product page and the final note names the correct IDE.", + "weight": 0.2 + }, + "R2": { + "requirement": "Provide a plausible explanation for HomePods randomly playing Apple Music and suddenly increasing volume based on public discussions or support-style sources.", + "verification": "Grader can confirm a Google results path to a discussion/support source and see the explanation reflected in the final note.", + "weight": 0.18 + }, + "R3": { + "requirement": "Include at least one suggested fix for the HomePod random playback/volume issue.", + "verification": "Grader can verify a fix was extracted from the researched source and included in the final note.", + "weight": 0.17 + }, + "R4": { + "requirement": "Summarize one reliable method for sharing or mirroring an iPhone screen to another device, including the necessary steps.", + "verification": "Grader can confirm a source page for iPhone mirroring is open in its own tab and the final note includes a usable step summary.", + "weight": 0.17 + }, + "R5": { + "requirement": "Summarize how to switch an iPad from the floating mini keyboard back to the full-size keyboard.", + "verification": "Grader can confirm a trustworthy instruction page is open and the final note includes the correct gesture or keyboard-button method.", + "weight": 0.13 + }, + "R6": { + "requirement": "Return one compact note that combines the JetBrains IDE recommendation with troubleshooting tips for all three Apple-related issues.", 
+ "verification": "Grader can review the final response and confirm it includes the IDE name, HomePod explanation and fix, iPhone mirroring steps, and iPad keyboard steps in one concise note.", + "weight": 0.15 + } + }, + "categories": [ + "Computers Electronics and Technology > Programming and Developer Software", + "Computers Electronics and Technology > Consumer Electronics" + ], + "num_categories": 2 + }, + { + "task_id": "0106b570440ffe4427d5e916f39ec986ab3de917", + "confirmed_task": "I want to make myself a quick bargain roundup and keep it grounded in deals that are actually live on the sites right now. Please start on Slickdeals and open whatever is currently being shown as the featured best deal, then grab the exact title and current price so I have a benchmark for what counts as a standout offer today; leave that deal page open in its own tab so I can look at it afterward. Then go to CheapCharts and browse the current iTunes deals to find one on-sale movie, one on-sale TV season, and one on-sale audiobook that feel like easy low-cost digital add-ons compared with the Slickdeals benchmark, and open each of those actual CheapCharts deal pages in separate tabs so I can visually compare them. After that, head to the Epic Games Store homepage, then open the current seasonal or featured sale page, and also pull up the product page for Split Fiction in another tab so I can include one game-store option alongside the media deals and have proof you actually viewed both Epic pages. 
When you’re done, give me a concise roundup with the Slickdeals featured deal, the three CheapCharts picks labeled by category with prices, and a short confirmation that you viewed the Epic Games Store homepage, the current seasonal or featured sale page, and the Split Fiction product page.", + "website": "https://www.google.com", + "reference_length": 6, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "The current featured best deal on Slickdeals is identified from the site and its exact title and displayed price are recorded.", + "verification": "Grader can confirm the open Slickdeals deal tab matches the reported title and price visible on the deal page.", + "weight": 0.24 + }, + "R2": { + "requirement": "One current on-sale CheapCharts movie is selected and its title, category, and displayed price are recorded from the actual deal page.", + "verification": "Grader can confirm an open CheapCharts movie tab shows the same title and price and that it is a movie listing.", + "weight": 0.14 + }, + "R3": { + "requirement": "One current on-sale CheapCharts TV season is selected and its title, category, and displayed price are recorded from the actual deal page.", + "verification": "Grader can confirm an open CheapCharts TV season tab shows the same title and price and that it is a TV season listing.", + "weight": 0.14 + }, + "R4": { + "requirement": "One current on-sale CheapCharts audiobook is selected and its title, category, and displayed price are recorded from the actual deal page.", + "verification": "Grader can confirm an open CheapCharts audiobook tab shows the same title and price and that it is an audiobook listing.", + "weight": 0.14 + }, + "R5": { + "requirement": "The Epic Games Store homepage, current seasonal or featured sale page, and Split Fiction product page are all viewed, with the sale page and product page opened for visible browser proof.", + "verification": "Grader can confirm browser history or open tabs show the Epic homepage was 
visited and that a seasonal or featured sale page and Split Fiction product page were opened.", + "weight": 0.18 + }, + "R6": { + "requirement": "The final roundup is concise and includes the Slickdeals featured deal, all three CheapCharts items with category labels and prices, and explicit confirmation of the Epic page views.", + "verification": "Grader can compare the final response against the captured site data and confirm all required items and confirmations are present.", + "weight": 0.16 + } + }, + "categories": [ + "Ecommerce & Shopping > Coupons and Rebates", + "Ecommerce & Shopping > Ecommerce and Shopping - Other" + ], + "num_categories": 2 + }, + { + "task_id": "c6b29e8564a7ae86dc50a1f074bdc2b5abb3754a", + "confirmed_task": "I want to grab some Pokémon trading cards pretty quickly, but only if they’re actually available to buy right now, so could you check a few retailers for me and keep the product pages open in separate tabs so I can visually compare them afterward? Start on BestBuy.com and search for Pokémon trading card products, then find at least two items that show they’re in stock right now and note each product’s full name, current price, and exactly what the availability message says on the page. After that, go to BarnesandNoble.com and find one Pokémon card item that’s clearly in stock, open the actual product page so I can see the listing itself, and grab the title and listed price. Then head to Walmart.com, search for a Pokémon card product, and make sure the one you pick is sold and shipped by Walmart rather than a marketplace seller, then record the product name and price and leave that product page open too. Once you’ve got those pages, compare the in-stock options across all three stores and tell me which available item is the cheapest overall and where I should buy it. 
Then, make the tab for this option active.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Identify at least two Pokémon trading card products on Best Buy that are currently in stock and capture each item’s full product name, current price, and visible availability status.", + "verification": "Grader can confirm two separate Best Buy product pages or listings are open and each shows a Pokémon card item with a price and an in-stock style availability message visible on screen.", + "weight": 0.35 + }, + "R2": { + "requirement": "Identify one Pokémon trading card item on Barnes & Noble that is in stock and record its product title and listed price from the product page.", + "verification": "Grader can confirm a Barnes & Noble product page is open for a Pokémon card item and that the page visibly shows the title, price, and an in-stock/available status.", + "weight": 0.2 + }, + "R3": { + "requirement": "Identify one Pokémon card product on Walmart that is sold and shipped by Walmart and record its product name and price.", + "verification": "Grader can confirm a Walmart product page is open and visibly indicates Walmart as the seller/shipper, along with the product title and price.", + "weight": 0.2 + }, + "R4": { + "requirement": "Keep the selected product pages open in separate tabs for visual comparison.", + "verification": "Grader can confirm multiple retailer product tabs remain open, including at least the chosen Barnes & Noble and Walmart pages and the selected Best Buy product pages.", + "weight": 0.05 + }, + "R5": { + "requirement": "Compare the verified in-stock options across Best Buy, Barnes & Noble, and Walmart and state which available product is the cheapest overall and where to buy it. 
The cheapest option is the active tab on the page.", + "verification": "Grader can compare the recorded prices from the open pages and confirm the final answer names the lowest-priced in-stock item and the correct retailer, and that this option is the currently active tab on the page.", + "weight": 0.2 + } + }, + "categories": [ + "Ecommerce & Shopping > Ecommerce and Shopping - Other", + "Games > Games - Other" + ], + "num_categories": 2 + }, + { + "task_id": "5e09c06aa92c1e252db5106a72e8d61e59356c7e", + "confirmed_task": "I’m trying to ease into a simple yoga routine here in Fresno and want something that combines one local studio option with one easy at-home session. Could you start on coilyoga.com and look through what Coil Yoga in Fresno offers so I can get a feel for the studio’s vibe, the kinds of classes they teach, and what a beginner might actually be walking into; please make sure you open the actual classes page and pull at least one specific detail from there, and leave that page open so I can look at it later. Then, with that local context in mind, go to toweryogafresno.com and find Tower Yoga Fresno’s schedule, and tell me the next three upcoming classes with their start times so I can see what would realistically fit into my week; if possible, open the schedule in its own tab and keep it visible as proof of the class times. After that, head to YouTube and search for “50 minute yin yoga,” then compare the visible results and pick the one with the highest view count so I have a home practice to pair with the studio option; open the actual video page, tell me the title, channel, and view count, and leave the video tab open so I can reference it. 
Once you’ve seen all three sites, give me a short beginner-friendly recommendation on whether Coil Yoga or Tower Yoga seems like the better starting point for me based on what you found, and pair that choice with the YouTube session as a simple Fresno yoga starter plan.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Summarize Coil Yoga in Fresno using its website, including the studio’s overall offerings, class types, and at least one specific detail taken from the classes page.", + "verification": "Grader confirms the response includes a Coil Yoga summary with a concrete classes-page detail and that the classes page is opened or referenced as the source.", + "weight": 0.24 + }, + "R2": { + "requirement": "Report the next three upcoming classes from Tower Yoga Fresno’s schedule, each with its start time.", + "verification": "Grader confirms three upcoming Tower Yoga classes and their start times match the visible schedule page left open in the browser.", + "weight": 0.28 + }, + "R3": { + "requirement": "Identify the YouTube search result for “50 minute yin yoga” with the highest visible view count and report its title, channel, and view count.", + "verification": "Grader confirms the YouTube search results were compared by visible view counts and the selected video page shows the reported title, channel, and views.", + "weight": 0.22 + }, + "R4": { + "requirement": "Recommend whether Coil Yoga or Tower Yoga is the better beginner starting point based on the findings from the two local studio websites.", + "verification": "Grader confirms the recommendation explicitly chooses one studio and cites evidence from the studio offerings and/or schedule findings.", + "weight": 0.14 + }, + "R5": { + "requirement": "Provide a final Fresno yoga starter plan that pairs one local studio recommendation with the selected at-home YouTube session.", + "verification": "Grader confirms the final answer 
combines the chosen studio option with the identified YouTube video details into one coherent starter plan.", + "weight": 0.12 + } + }, + "categories": [ + "Health > Nutrition Diets and Fitness", + "Hobbies and Leisure > Hobbies and Leisure - Other" + ], + "num_categories": 2 + }, + { + "task_id": "170314aa3a93c7ca6e959be7757cad178efc06dc", + "confirmed_task": "I’m sketching out a compact but very high-end PC build and want you to do the first pass of browser research so I have a clean shortlist to look at later. Start on Google and find the official ASUS ROG page for the ROG Swift OLED PG27UCDM monitor, because I want to anchor the build around the exact display model rather than a reseller listing; open the real ASUS product page and leave that tab open so I can glance at the specs and photos myself. Once that’s pinned down, use Amazon or Google search results to track down at least three reputable 2025 “best mini-ITX motherboard” recommendation articles from established tech sites, since I’m trying to match a small-form-factor motherboard to a premium monitor-and-workstation setup; open each recommendation page in its own tab so I can compare them side by side, and make sure you capture the page title, site name, and URL for each one. After that, go to B&H Photo and find the actual product page for the NVIDIA RTX Pro 6000 Blackwell graphics card as the GPU candidate for this same build, and leave that B&H page open too so I can verify it’s the real listing. 
In the end, send me the ASUS monitor page URL, the three motherboard recommendation entries with titles, sites, and links, and the B&H GPU page URL.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Provide the official ASUS ROG product page URL for the ROG Swift OLED PG27UCDM monitor.", + "verification": "Grader confirms the returned URL is an official ASUS/ROG product page for the exact PG27UCDM model and that the browser shows the ASUS product page open.", + "weight": 0.25 + }, + "R2": { + "requirement": "Identify at least three reputable 2025 best mini-ITX motherboard recommendation pages from established tech sites.", + "verification": "Grader confirms there are at least three distinct recommendation pages focused on 2025 mini-ITX motherboard picks and that each page is open in its own tab.", + "weight": 0.3 + }, + "R3": { + "requirement": "For each motherboard recommendation source, provide the page title, site name, and URL accurately.", + "verification": "Grader compares the returned titles, site names, and URLs against the visible article tabs and page headers.", + "weight": 0.2 + }, + "R4": { + "requirement": "Provide the B&H Photo product page URL for the NVIDIA RTX Pro 6000 Blackwell graphics card.", + "verification": "Grader confirms the returned URL is a B&H product listing for the NVIDIA RTX Pro 6000 Blackwell and that the B&H product page is visibly open.", + "weight": 0.2 + }, + "R5": { + "requirement": "Return a complete final summary containing the ASUS monitor URL, all three motherboard recommendation entries, and the B&H GPU URL.", + "verification": "Grader checks that all requested items are present together in the final response with no missing fields.", + "weight": 0.05 + } + }, + "categories": [ + "Computers Electronics and Technology > Computer Hardware", + "Computers Electronics and Technology > Consumer Electronics" + ], + "num_categories": 2 + }, + { + 
"task_id": "9220309abe6e209dfedc978078c93f79fbd45ef1", + "confirmed_task": "I’m trying to put together a simple men’s outfit shortlist and want a quick mix of accessories, basics, and one resale piece, so could you help me browse a few sites like you would if you were sitting at my laptop with me? Start on ASOS and search for black men’s watches, then tell me whether the results page shows a total item count and what that number is, because I want to know if watches are actually easy to browse there; leave that results page open so I can glance at it later. After that, on SKIMS, pick a safer basics item by finding one men’s soft cotton boxer-brief product that clearly comes in multiple colors and multiple sizes, and tell me the product name plus a few color and size options; keep the product page open so I can see the swatches and size choices myself. Finally, since I may mix in something secondhand, go to the Poshmark page for men's accessories and identify one listing that’s currently available, making sure you open the actual listing page so you can verify it’s still live. 
At the end, give me a concise shopping summary with all three findings so I have a usable shortlist.", + "website": "https://www.google.com", + "reference_length": 4, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Report the ASOS search-results total for black men’s watches, or explicitly state that ASOS does not display a total count.", + "verification": "Grader confirms the ASOS results page is open and shows either a visible results count for black men’s watches or evidence that no count is displayed.", + "weight": 0.27 + }, + "R2": { + "requirement": "Select one SKIMS men’s soft cotton boxer-brief product that comes in multiple colors and multiple sizes, and provide the product name along with a few available color options and size options.", + "verification": "Grader confirms the SKIMS product page is open and visibly shows the chosen men’s soft cotton boxer-brief with multiple color swatches and multiple size choices.", + "weight": 0.33 + }, + "R3": { + "requirement": "Identify one currently available listing from the Poshmark men's accessories page.", + "verification": "Grader confirms the opened listing page shows a listing that is available/live in the men's accessories category.", + "weight": 0.2 + }, + "R4": { + "requirement": "Return the findings as a concise shopping summary that includes all three sources and the requested shortlist-oriented details.", + "verification": "Grader checks the final response includes ASOS count status, SKIMS product with colors and sizes, and one available Poshmark listing.", + "weight": 0.2 + } + }, + "categories": [ + "Ecommerce & Shopping > Ecommerce and Shopping - Other", + "Lifestyle > Fashion and Apparel" + ], + "num_categories": 2 + }, + { + "task_id": "328ce861b58b2cf2e6da520040193710f95cfe56", + "confirmed_task": "I want to put together a really simple at-home yoga plan using only free YouTube videos because I’m trying to ease into a routine without paying for an app. 
Could you start on Google and find at least three yoga instructors who seem especially approachable, beginner-friendly, or a little funny and who clearly post free classes on YouTube, so I have a shortlist of personalities that feel welcoming rather than intimidating. For each one, grab the instructor name, the YouTube channel name, and one example video title. Then head over to YouTube and, using that shortlist, pick one morning yoga video that’s around 20 minutes long from one of those instructors so I have an easy option for weekdays; please note the exact title, duration, and link. After that, find a separate Vinyasa flow video on YouTube that’s about 30 minutes long so I have a slightly longer practice option too, and give me its title and link as well. Please open the two chosen videos in separate tabs so I can compare them, and start playing the 30-minute Vinyasa one long enough to confirm it’s the right video before leaving that tab open. In the end, send me the full yoga plan with the instructor shortlist plus both selected videos and their details.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Provide a shortlist of at least 3 yoga instructors who appear approachable, beginner-friendly, or funny and who post free yoga classes on YouTube, including each instructor’s name, YouTube channel name, and one example video title.", + "verification": "Grader confirms the final response includes 3 or more instructors with all three required fields and that the information could reasonably be sourced from Google results leading to YouTube channels or videos.", + "weight": 0.35 + }, + "R2": { + "requirement": "Select one morning yoga video on YouTube that is approximately 20 minutes long and is from one of the instructors in the shortlist.", + "verification": "Grader confirms the chosen morning video is identified with title, duration, and link, and that the instructor matches one of the 
shortlist entries.", + "weight": 0.25 + }, + "R3": { + "requirement": "Select one separate YouTube Vinyasa flow video that is approximately 30 minutes long and provide its title and link.", + "verification": "Grader confirms the final response includes a distinct Vinyasa flow video with title and URL and that the runtime is about 30 minutes based on the visible YouTube listing or player.", + "weight": 0.2 + }, + "R4": { + "requirement": "Open the chosen morning yoga video and the chosen Vinyasa flow video in separate browser tabs, and start playing the longer Vinyasa video briefly before leaving it open.", + "verification": "Grader confirms visible browser state shows both YouTube video tabs open and evidence that the 30-minute Vinyasa video player was started.", + "weight": 0.1 + }, + "R5": { + "requirement": "Present the final output as one combined yoga plan containing the instructor shortlist, the chosen morning video with title, duration, and link, and the separate Vinyasa flow video with title and link.", + "verification": "Grader confirms the response combines all requested pieces into a single coherent plan rather than scattered notes.", + "weight": 0.1 + } + }, + "categories": [ + "Health > Nutrition Diets and Fitness" + ], + "num_categories": 1 + }, + { + "task_id": "795687ed918e45a6ad255215aa2a517b3e014aa5", + "confirmed_task": "I’m curious whether any new seasonal drinks are actually landing well right now, so could you open Reddit and check the newest posts in r/starbucks for anything about recent or seasonal drink releases, then read into the comments enough to tell me whether people seem excited, disappointed, or mixed on them. After that, stay on Reddit and do the same thing in r/DunkinDonuts so I can compare whether Dunkin’s newly discussed drinks are getting a warmer or colder reaction than Starbucks at the moment. 
If one drink post clearly looks the most loved, most upvoted, or just the most viral between the two subreddits, open that specific post in a separate tab and leave it there so I can look at it myself. Then, as a totally separate palate cleanser, go to Bored Panda and open their collection of cute and funny angry cat photos shared by owners, and tell me the exact page title while leaving that page open too so I can glance through the pictures later.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Review recent posts on r/starbucks and identify posts about new or seasonal drink releases with a summary of commenter reactions.", + "verification": "Grader can confirm the agent visited r/starbucks recent/new content and the final response includes at least one relevant new-drink discussion plus sentiment such as positive, negative, or mixed.", + "weight": 0.25 + }, + "R2": { + "requirement": "Review recent posts on r/DunkinDonuts about newly discussed drinks and summarize community sentiment.", + "verification": "Grader can confirm the agent visited r/DunkinDonuts and the final response includes at least one recent drink discussion with sentiment characterization.", + "weight": 0.2 + }, + "R3": { + "requirement": "Explicitly compare Dunkin drink reception against Starbucks drink reception.", + "verification": "Final response states whether Dunkin's newly discussed drinks are being received more positively, more negatively, or about the same relative to Starbucks, based on the subreddit findings.", + "weight": 0.2 + }, + "R4": { + "requirement": "Identify the single standout drink item or post with the highest reviews, strongest positivity, or most virality, and open it in a separate tab.", + "verification": "Grader can confirm a separate Reddit tab is open to the chosen standout post and the final response names that standout item/post.", + "weight": 0.2 + }, + "R5": { + "requirement": "Open 
the Bored Panda collection of cute and funny angry cats shared by owners and provide the exact page title.", + "verification": "Grader can confirm the Bored Panda page is open and the reported title matches the visible page title.", + "weight": 0.15 + } + }, + "categories": [ + "Food and Drink > Beverages", + "Computers Electronics and Technology > Social Media Networks" + ], + "num_categories": 2 + }, + { + "task_id": "8c30f2f9ceeac75b05c725c5397022bb4f9d32a0", + "confirmed_task": "I’m putting together a super short beginner-friendly AI explainer for someone who doesn’t know much about the topic yet, so I want it to move from simple definitions to a recognizable product and then end with a real hardware example. Please start on Google and find IBM’s page that explains the main types of artificial intelligence and machine learning, then pull out at least three categories and rewrite them in plain English with one sentence each so they sound easy to follow. Once you’ve got that foundation, go to Copilot.com and figure out what the site is, then give me exactly one sentence on its purpose as an AI product a beginner would probably recognize; if there’s a landing page or main homepage, leave that open so I can glance at it myself. After that, use Google again to find the Tom’s Hardware article about Intel Granite Rapids WS competing with AMD Threadripper, open the actual Tom’s Hardware article in a new tab so I can see the headline and page for myself, and summarize the key points including the main competitive claims and any specs the article cites like core count or boost clock. 
Then tie that hardware example back to why strong chips matter for AI or advanced computing, and give me one concise final write-up that combines the IBM basics, the one-sentence Copilot description, and the Tom’s Hardware takeaway in a way a beginner could understand.", + "website": "https://www.google.com", + "reference_length": 6, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Provide at least three IBM-described categories related to artificial intelligence and/or machine learning, each rewritten in plain English with a one-sentence description.", + "verification": "Final response includes three or more distinct IBM-based categories with simple one-sentence explanations traceable to the IBM page opened from Google results.", + "weight": 0.3 + }, + "R2": { + "requirement": "Identify what Copilot is and summarize its purpose in exactly one sentence as an AI-related product example a beginner would recognize.", + "verification": "Final response contains a single sentence describing Copilot’s purpose, and the Copilot homepage or landing page is visibly open in the browser.", + "weight": 0.15 + }, + "R3": { + "requirement": "Find and summarize the Tom’s Hardware article about Intel Granite Rapids WS competing with AMD Threadripper, including the main competitive claims and any cited specs such as core count and boost clock.", + "verification": "A Tom’s Hardware article page is open in a separate tab, and the final response includes the article’s key claims plus cited specs mentioned in the article.", + "weight": 0.25 + }, + "R4": { + "requirement": "Connect the Tom’s Hardware hardware summary back to why powerful hardware matters for AI or advanced computing.", + "verification": "Final response explicitly links the hardware comparison to AI workloads, advanced computing, or the need for strong compute performance.", + "weight": 0.1 + }, + "R5": { + "requirement": "Return a concise final write-up that combines the IBM categories, the one-sentence 
Copilot description, and the Tom’s Hardware summary into a beginner-friendly explainer.", + "verification": "Final answer is a unified, concise explainer rather than disconnected notes, and it includes all three required parts in a beginner-friendly flow.", + "weight": 0.15 + }, + "R6": { + "requirement": "Open the Tom’s Hardware article in a new browser tab during the browsing process.", + "verification": "Browser state shows the Tom’s Hardware article open in its own tab, separate from the Google search results tab.", + "weight": 0.05 + } + }, + "categories": [ + "Computers Electronics and Technology > Computers Electronics and Technology - Other", + "Science and Education > Science and Education - Other" + ], + "num_categories": 2 + }, + { + "task_id": "2504a7886c3dcb33f1aac7c5d2831985887e789e", + "confirmed_task": "I’m trying to decide whether brunch in San Francisco makes sense today, so could you start on weather.com and pull up the San Francisco 10-day forecast, then tell me the highs and lows for the next three days so I have a quick weather reality check before I head out. If the forecast looks decent enough for going out, switch over to Beach Chalet Restaurant & Brewery’s site and find the actual brunch page so I can see the posted schedule myself; let me know which days brunch is offered and the listed start and end times, and leave that brunch page open in a tab for me. Then use that timing as a reference and go to Fat Choy World’s website, open its current menu page, and figure out whether it appears to be open right now based on the hours or live status shown there, because I’m trying to decide whether to stick with a brunch plan or pivot to another meal instead. 
Please keep the Beach Chalet brunch page and the Fat Choy World menu page open in separate tabs so I can compare them visually afterward.", + "website": "https://www.google.com", + "reference_length": 3, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Report the forecasted high and low temperatures for each of the next 3 days from weather.com’s San Francisco 10-day forecast.", + "verification": "Grader can confirm the weather.com 10-day forecast page for San Francisco is open and that the reported highs/lows match the first three forecast days shown on the page.", + "weight": 0.35 + }, + "R2": { + "requirement": "Find Beach Chalet Restaurant & Brewery’s brunch schedule and report the days brunch is offered along with the posted start and end times.", + "verification": "Grader can confirm the Beach Chalet brunch page is open in a tab and that the response matches the visible brunch days and hours shown on that page.", + "weight": 0.35 + }, + "R3": { + "requirement": "Use Fat Choy World’s current menu page to determine whether the restaurant is open right now based on the posted hours or status.", + "verification": "Grader can confirm the Fat Choy World menu page is open in a separate tab and that the open-now determination is supported by the visible hours or status on the page at the time of browsing.", + "weight": 0.3 + } + }, + "categories": [ + "Science and Education > Weather", + "Food and Drink > Restaurants and Delivery", + "Travel and Tourism > Travel and Tourism - Other" + ], + "num_categories": 3 + }, + { + "task_id": "71f8e3e9b5a24f37f492fbf97b7d31e08e9a8d61", + "confirmed_task": "I’m in the UK and trying to work out whether starting a side hustle on top of my £55,500 salary is going to create extra tax admin, so could you check a few things in the browser for me? 
First, go to the official HMRC side-hustle guidance on taxhelpforhustlers.campaign.gov.uk and pull out the kinds of side-hustle income they say need to be reported, especially so I can tell whether things like reselling items online or doing delivery or other gig work would fall into those categories; please open the actual HMRC guidance page and leave it open so I can look at the wording myself. Then go to the Reed UK tax calculator on reed.co.uk, enter an annual salary of £55,500, and tell me the annual income tax figure it shows as my baseline; if the results page is separate, leave that open too so I can compare it with the HMRC guidance. After that, use Google to find a UK paycheck or salary calculator that shows a single pay-period net pay amount for a £55,500 salary, open the calculator result you use in its own tab, and report one estimated take-home amount for a single pay period. In the end, give me a short summary with the HMRC reportable income categories, the Reed annual income tax amount, the single-period net pay estimate, and a quick conclusion saying whether a side hustle in one of those HMRC categories would likely need reporting in addition to my salary.", + "website": "https://www.google.com", + "reference_length": 4, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Identify the reportable side-hustle income categories from the official HMRC side-hustle guidance and connect them to common examples such as reselling and delivery or gig/service work.", + "verification": "Grader can confirm the HMRC guidance page is open on taxhelpforhustlers.campaign.gov.uk and that the response reflects categories visible on that page, with examples mapped to those categories.", + "weight": 0.4 + }, + "R2": { + "requirement": "Use the Reed UK tax calculator to report the annual income tax due for an annual salary of £55,500.", + "verification": "Grader can confirm the Reed calculator results page shows salary input or results corresponding to £55,500 
and that the reported annual income tax matches the visible result.", + "weight": 0.3 + }, + "R3": { + "requirement": "Use a UK paycheck or salary calculator found via Google to report one estimated net take-home amount for a single pay period for a £55,500 salary.", + "verification": "Grader can confirm Google was used to reach a calculator, the calculator page is open in its own tab, and the reported net pay amount corresponds to a visible single pay-period result for £55,500.", + "weight": 0.2 + }, + "R4": { + "requirement": "Provide a short conclusion that ties the HMRC categories to the salary baseline and states whether a side hustle in one of those categories would likely need reporting in addition to the £55,500 salary.", + "verification": "Grader can confirm the conclusion explicitly references at least one HMRC category from step 1 and correctly frames reporting as additional to the existing salary context.", + "weight": 0.1 + } + }, + "categories": [ + "Finance > Finance - Other", + "Law and Government > Government" + ], + "num_categories": 2 + }, + { + "task_id": "55bd922b66b05bcf4cbf5383333df59acce2ab32", + "confirmed_task": "I’m putting together a small-form-factor AM5 build and I want to base it on the ASUS ROG Strix B650E-I Gaming WiFi, so please start on the official ASUS site and open the exact product page for that motherboard, then leave it open in its own tab so I can glance at the specs and photos later. After that, use Google to find a community-made AM5 motherboard tier list from a forum, spreadsheet, Reddit post, or similar enthusiast source, open the actual tier list page, and check where the ASUS ROG Strix B650E-I Gaming WiFi shows up so I can sanity-check whether this is still considered a solid pick. 
If the board looks reasonable there, go to Amazon and find the product page for the Thermalright Peerless Assassin CPU cooler as a possible pairing for the build, and leave that open too so I can compare it visually with the motherboard tab. In the end, send me the three direct links and a short note saying what tier or listing the ASUS board got and whether that makes it seem like a sensible choice for this build.", + "website": "https://www.google.com", + "reference_length": 4, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Provide the direct official ASUS product page URL for the exact ASUS ROG Strix B650E-I Gaming WiFi motherboard.", + "verification": "Grader can confirm the URL is on rog.asus.com and the visible page title/model name exactly matches ASUS ROG Strix B650E-I Gaming WiFi.", + "weight": 0.3 + }, + "R2": { + "requirement": "Provide a direct URL to a community-created AM5 motherboard tier list page found via Google.", + "verification": "Grader can confirm the link is not a generic Google results page but the actual community tier list source page, visible in its own tab.", + "weight": 0.2 + }, + "R3": { + "requirement": "Correctly report where the ASUS ROG Strix B650E-I Gaming WiFi appears on the community AM5 motherboard tier list, or clearly state if it does not appear, with a brief note on whether that supports it as a reasonable pick.", + "verification": "Grader can compare the final note against the opened tier list page and confirm the board’s placement/category or absence is accurately described.", + "weight": 0.3 + }, + "R4": { + "requirement": "Provide the direct Amazon product page URL for the Thermalright Peerless Assassin CPU cooler.", + "verification": "Grader can confirm the URL is an Amazon product listing and the visible product title identifies a Thermalright Peerless Assassin CPU cooler.", + "weight": 0.2 + } + }, + "categories": [ + "Computers Electronics and Technology > Computer Hardware", + "Games > Video Games 
Consoles and Accessories" + ], + "num_categories": 2 + }, + { + "task_id": "1d3952479bc687cb5b04e14930533493f959dbe5", + "confirmed_task": "I’m trying to put together a Christmas gift package that I can actually mail without overthinking it, so could you start on Etsy and look through the Christmas ornaments area for one personalized family Christmas ornament listing that feels like a genuinely giftable idea, ideally something customized with family names or a year, and open the actual listing so I can see the photos and the price rather than just a search result. Once you’ve got that ornament style in mind, switch to Amazon and find one marble cheese board that would pair nicely with it in the same holiday package, because I want a second physical gift that feels festive and easy to wrap; please open the product page in its own tab and note the product name plus the star rating and review count if Amazon shows them. Then go to the USPS online store and find one Forever stamp product that’s currently for sale so I know what I could use for mailing a holiday card with the package, and leave that product page open too so I can verify it myself. 
At the end, give me a short summary with the Etsy ornament name and price, the Amazon cheese board name with rating and/or review count, and the USPS Forever stamp product name.", + "website": "https://www.google.com", + "reference_length": 4, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Identify one Etsy personalized family Christmas ornament from an actual listing page and report its product name and price.", + "verification": "Grader can confirm an Etsy listing page for a family Christmas ornament is open or was visited, with visible product title and price matching the reported ornament.", + "weight": 0.35 + }, + "R2": { + "requirement": "Select one Amazon marble cheese board suitable as a complementary Christmas gift and report the product name plus star rating and review count if shown.", + "verification": "Grader can confirm an Amazon product page for a marble cheese board is open in its own tab or was visited, with visible product title and rating/review information matching the report.", + "weight": 0.3 + }, + "R3": { + "requirement": "Provide the name of one Forever stamp product currently for sale on the USPS online store from a product page.", + "verification": "Grader can confirm a USPS store product page for Forever stamps is open or was visited, with a visible product name matching the reported stamp.", + "weight": 0.2 + }, + "R4": { + "requirement": "Give a concise final summary that includes the Etsy ornament name and price, the Amazon cheese board name with rating and/or review count, and the USPS Forever stamp product name.", + "verification": "Grader can compare the final response against the details gathered from the Etsy, Amazon, and USPS pages and confirm all three items are included concisely.", + "weight": 0.15 + } + }, + "categories": [ + "Ecommerce & Shopping > Ecommerce and Shopping - Other", + "Community and Society > Holidays and Seasonal Events", + "Lifestyle > Gifts and Flowers" + ], + "num_categories": 3 + }, + { + 
"task_id": "890a6880049a42684ac91a2e1809442846f9394c", + "confirmed_task": "I’m thinking about doing a simple public-transit outing from downtown Chicago to Aurora, Illinois soon, and I want a practical snapshot I can actually look at in the browser. On Google, please search for the recommended public-transit route from Chicago to Aurora, Illinois and open the actual transit directions so you can tell me the main mode I’d be taking and the estimated total travel time; leave that directions page open so I can review the route myself. Since I may need a little context for rail connections and fare structure, then go to Metra’s BNSF line page and summarize what the BNSF line is, where it runs, and where on that page or site fare information is listed; if there’s a separate fares link, open that in another tab so I have a visual reference. After that, head to Weather.com and pull up the local 10-day forecast for Aurora, Illinois, then give me the high and low temperatures for the next 3 days so I can judge whether the trip will feel comfortable. 
Please send everything back as one concise trip-planning summary, and keep the Google transit directions tab and the relevant Metra page open for comparison.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Find a recommended public-transit route from Chicago, Illinois to Aurora, Illinois on Google and report the main mode of transit and estimated total travel time.", + "verification": "Grader can confirm a Google transit directions/results page is open and the response includes a specific main mode and travel-time estimate taken from that page.", + "weight": 0.3 + }, + "R2": { + "requirement": "Summarize Metra’s BNSF line by stating what it is and where it runs.", + "verification": "Grader can confirm the Metra BNSF line page is open and the response accurately describes the line and its route coverage based on visible page text.", + "weight": 0.2 + }, + "R3": { + "requirement": "Identify where fare information for riding the BNSF line is listed, including any relevant fares link or section.", + "verification": "Grader can confirm the response points to a visible fare-information location on the BNSF line page or an opened fares tab/page on metra.com.", + "weight": 0.15 + }, + "R4": { + "requirement": "Report the forecasted high and low temperatures for the next 3 days from Weather.com’s 10-day forecast for Aurora, Illinois.", + "verification": "Grader can confirm a Weather.com 10-day forecast page for Aurora is open and the response includes three day-by-day high/low pairs matching the visible forecast.", + "weight": 0.2 + }, + "R5": { + "requirement": "Return all findings as one concise trip-planning summary that combines the Google transit recommendation, BNSF line context, fare-info location, and 3-day weather outlook.", + "verification": "Grader can confirm the final response is a single concise integrated summary containing all required elements from steps 1 through 3.", + 
"weight": 0.15 + } + }, + "categories": [ + "Travel and Tourism > Ground Transportation", + "Travel and Tourism > Tourist Attractions" + ], + "num_categories": 2 + }, + { + "task_id": "f979f723a3f6f65ea8d75903425f22c67505daf1", + "confirmed_task": "I’m trying to put together a Christmas gift built around Pokémon cards, but I want a realistic backup plan in case the main item is sold out. Please start on Collector Store and look up the Pokémon Phantasmal Flames Booster Elite Trainer Box, then open the actual product page and check whether it says it’s in stock or sold out, and note the listed price so I know if the original idea is still viable. After that, go to Best Buy and search for Pokémon trading cards or Pokémon card gift items and pick one gift option that looks like a reasonable substitute, making sure to open the product page so I can see the listing myself and leaving that tab open as a reference. Then use that same general idea on Walgreens by searching for Pokémon trading cards and finding two available options with prices, opening each Walgreens product in its own tab so I can compare them side by side and verify they’re actually live listings. 
In the end, give me a short backup-plan summary with the Collector Store stock status and price, the Best Buy product name and price, and the two Walgreens options with their prices.", + "website": "https://www.google.com", + "reference_length": 4, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Find the Collector Store product page for the Pokémon Phantasmal Flames Booster Elite Trainer Box and capture its stock status and listed price.", + "verification": "Grader can confirm the browser is on the Collector Store product page showing the product title plus a visible in-stock or sold-out indicator and a price.", + "weight": 0.35 + }, + "R2": { + "requirement": "Identify one Pokémon card gift option on Best Buy and record its product name and price from the product page.", + "verification": "Grader can confirm a Best Buy product page is open with a Pokémon card-related item, and the visible page shows the product name and price.", + "weight": 0.2 + }, + "R3": { + "requirement": "Find two available Pokémon trading card options on Walgreens and record the product name and price for each from live product pages.", + "verification": "Grader can confirm two Walgreens product tabs are open, each showing a Pokémon trading card listing with visible product names and prices.", + "weight": 0.3 + }, + "R4": { + "requirement": "Provide a short summary that combines the Collector Store stock status and price, the Best Buy backup option with price, and the two Walgreens options with prices as a practical gift backup plan.", + "verification": "Grader can compare the final written summary against the visible product pages and confirm all requested items and prices are included accurately.", + "weight": 0.15 + } + }, + "categories": [ + "Ecommerce & Shopping > Ecommerce and Shopping - Other", + "Games > Games - Other", + "Community and Society > Holidays and Seasonal Events" + ], + "num_categories": 3 + }, + { + "task_id": "7cc602f239775882273921d82e181020f1769b53", + 
"confirmed_task": "I’m trying to put together a simple evening-event outfit that I can actually order in the U.S., so could you start on Theory and find me one green dress that looks dressy enough for an evening plan and is available for U.S. delivery, then open the actual product page so I can see the photos and note the product name and price? Once you’ve got that dress, use the color and overall vibe as your reference point and go to a shoe site to find a matching pair of women’s shoes in size 9.5 that would work for the same outfit, and open that product in its own tab too so I can compare the two pages side by side; please note the shoe name and price. After that, check ooShirts because I may also need a simple custom group shirt order for the event, and confirm whether they offer no-minimum print-on-demand orders shipped within the United States, including whatever turnaround or shipping timing they state on the site. At the end, give me one concise summary with the Theory dress details, the matching shoes details, and the ooShirts no-minimum plus shipping/timing answer, and leave the dress and shoe product pages open for me.", + "website": "https://www.google.com", + "reference_length": 4, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Identify one green dress on Theory that is available for U.S. delivery and provide its product name and price.", + "verification": "Grader can confirm a Theory product page is open showing a green dress with visible product name and price, along with page indicators that support U.S. 
shopping or delivery availability.", + "weight": 0.35 + }, + "R2": { + "requirement": "Select one matching pair of women’s shoes in size 9.5 that fits the dress’s color and dressy evening vibe, and provide the product name and price.", + "verification": "Grader can confirm a shoe product page is open in a separate tab showing women’s shoes, visible size 9.5 availability or selectable size, and visible product name and price.", + "weight": 0.3 + }, + "R3": { + "requirement": "Determine whether ooShirts offers no-minimum print-on-demand orders shipped within the United States, including the stated turnaround or shipping timing.", + "verification": "Grader can confirm ooShirts pages show the minimum-order policy and visible text about turnaround, production, or shipping timing for U.S. orders.", + "weight": 0.2 + }, + "R4": { + "requirement": "Return the dress details, matching shoes details, and the ooShirts no-minimum and timing answer together in one concise summary, while leaving the dress and shoe product pages open.", + "verification": "Grader can confirm the final response includes all requested details in a concise combined summary and that the dress and shoe tabs remain open.", + "weight": 0.15 + } + }, + "categories": [ + "Ecommerce & Shopping > Ecommerce and Shopping - Other", + "Lifestyle > Fashion and Apparel" + ], + "num_categories": 2 + }, + { + "task_id": "8e639395e157ea4df2747a4a873b5f610d70d180", + "confirmed_task": "I want to put together a really easy beginner sketch reference pack for a casual drawing session, and I’d like you to grab the pieces in the browser so I can look at them afterward. First, on Pinterest, find one pin with a genuinely useful person drawing reference photo for figure practice—something clear enough that a beginner could sketch from—and open the actual pin page, then keep that tab open and save the pin link for me. 
Once you’ve got that figure reference, go to Google Images and search for a basic one-room hut from the 1500s in an English style so I can use it as a simple background setting; open the image result or source page that looks most believable and leave that open too so I can see the picture itself. After that, go to Bored Panda and open their collection of angry cat photos as a fun mood reference for the character expression, and tell me the exact page title while keeping that page open as well. In the end, send me the Pinterest pin link, the hut image or source page link, and the Bored Panda page title, plus a short note on how the figure, the hut, and the angry-cat mood could all work together in one simple sketch idea.", + "website": "https://www.google.com", + "reference_length": 4, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "Provide one Pinterest pin URL that leads to a useful person drawing reference photo suitable for figure practice.", + "verification": "Grader can confirm the browser is on an actual Pinterest pin page showing a person reference image and that the returned link matches that open pin.", + "weight": 0.35 + }, + "R2": { + "requirement": "Provide one Google Images result or source page URL showing a basic one-room hut from the 1500s in an English style.", + "verification": "Grader can confirm an image result or source page is open from Google Images and visually depicts a simple hut consistent with the requested historical English-style setting.", + "weight": 0.3 + }, + "R3": { + "requirement": "Provide the exact title of the Bored Panda collection featuring angry cat photos shared by their owners.", + "verification": "Grader can confirm the Bored Panda page is open and the reported title matches the visible page title/header.", + "weight": 0.2 + }, + "R4": { + "requirement": "Include a brief note explaining how the person reference, hut reference, and angry cat mood inspiration could fit together into one beginner-friendly 
sketch concept.", + "verification": "Grader can confirm the final note meaningfully references all three selected sources and combines them into a coherent sketch idea.", + "weight": 0.15 + } + }, + "categories": [ + "Hobbies and Leisure > Hobbies and Leisure - Other", + "Arts & Entertainment > Visual Arts and Design" + ], + "num_categories": 2 + }, + { + "task_id": "82af9358ab6a0421057340a0c038498348f2b3ec", + "confirmed_task": "I’m trying to get a handle on my UK telecom budget and want a real-world baseline I can actually look at in the browser. Please start on MoneySavingExpert’s Cheap Mobile Finder and set it to SIM-only deals with unlimited minutes, unlimited texts, and at least 10GB of data, then sort out the three cheapest options you can see in ascending monthly price order and note the provider, monthly price, contract length, and data allowance for each so I can compare what the low end of the market looks like. Once you’ve got that shortlist, open Vodafone UK in another tab and figure out what the “Xtra 40” part of one of their broadband plan names actually means, because I want to know whether that’s describing the broadband speed tier or some extra bundle feature; please leave the relevant Vodafone page open so I can see the wording myself. After that, go to PrintPigeon and find the service where I could upload a PDF and have it printed and mailed as a letter, and tell me the service name, the starting price, and the exact order page where the process begins so I could send myself a one-page note with the SIM comparison and the Vodafone explanation. 
If possible, keep the MoneySavingExpert results tab and the PrintPigeon order page open in separate tabs so I can visually compare them afterward.", + "website": "https://www.google.com", + "reference_length": 6, + "level": "easy", + "rubrics": { + "R1": { + "requirement": "The MoneySavingExpert Cheap Mobile Finder is used with filters that clearly match SIM-only, unlimited minutes, unlimited texts, and at least 10GB data.", + "verification": "Grader can confirm the visible filtered results page on MoneySavingExpert shows qualifying SIM-only deals consistent with those constraints.", + "weight": 0.22 + }, + "R2": { + "requirement": "The three cheapest qualifying SIM-only deals are listed in ascending monthly price order.", + "verification": "Grader compares the reported three deals against the visible ordering on the MoneySavingExpert results page and confirms they are the cheapest qualifying options shown.", + "weight": 0.24 + }, + "R3": { + "requirement": "For each of the three MoneySavingExpert deals, the provider, monthly price, contract length, and data allowance are included accurately.", + "verification": "Grader checks each reported field against the corresponding visible deal cards or listing details on the MoneySavingExpert results page.", + "weight": 0.18 + }, + "R4": { + "requirement": "The explanation of what Vodafone broadband label “Xtra 40” refers to is correct and based on the Vodafone page.", + "verification": "Grader confirms the open Vodafone page contains wording showing whether “Xtra 40” denotes a speed tier or another plan attribute, and the response matches that meaning in one sentence.", + "weight": 0.16 + }, + "R5": { + "requirement": "On PrintPigeon, a service is identified that allows a user to upload or attach a PDF and have it printed and mailed as a letter.", + "verification": "Grader verifies on the open PrintPigeon page that the named service is for sending a printed letter from an uploaded document or PDF.", + "weight": 0.1 + }, + 
"R6": { + "requirement": "The PrintPigeon result includes the service name, starting price, and the specific order/start page where the mailing process begins.", + "verification": "Grader checks that the reported service name and starting price match the visible PrintPigeon page and that the provided page is the actual order or start page for initiating the mailing.", + "weight": 0.1 + } + }, + "categories": [ + "Computers Electronics and Technology > Telecommunications", + "Finance > Finance - Other" + ], + "num_categories": 2 + }, + { + "task_id": "63d68bb25e279fc22e6e3592d8ca59add33b6eb1", + "confirmed_task": "I’m trying to buy a family car in the Minneapolis–St. Paul area and want a solid shortlist I can actually look through later, so please use Cars.com to search near Minneapolis, Minnesota for family-friendly vehicles priced at $50,000 or less. I’m shopping for a family of four, so focus on practical options like midsize SUVs, crossovers, minivans, or other vehicles that clearly make sense for everyday family use. As you go through the results, open each promising listing in its own tab and make sure the actual vehicle page shows the price, mileage, model year, and dealership location, because I want every option to be something I can visually inspect afterward. Please gather about 15 listings that are still live and under budget, and if a listing doesn’t make the seating or family suitability obvious, cross-check that exact vehicle on Edmunds so we can confirm the body style or seating before keeping it. Once you’ve got the set, create a CryptPad Sheet called Minneapolis Family Cars with columns for make and model, model year, price, mileage, dealership location, and link to listing, and fill it in so each row matches one of the listing tabs you still have open. After that, give me a short summary of which models show up most often, which ones seem like the best value based on year, mileage, and price, and whether the final mix is mostly SUVs or minivans. 
Please leave the vehicle tabs and the finished spreadsheet open so I can compare everything on screen.", + "website": "https://www.google.com", + "reference_length": 8, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Cars.com is used to search near Minneapolis, Minnesota with the price constrained to $50,000 or less and the results focused on family-suitable vehicle types.", + "verification": "Grader can see a Cars.com results page showing Minneapolis-area search context and a max-price filter at or below $50,000 with relevant family vehicle results visible.", + "weight": 0.12 + }, + "R2": { + "requirement": "About 15 promising family-appropriate vehicle listings are opened in separate browser tabs from the Cars.com results.", + "verification": "Grader can see multiple open vehicle listing tabs, approximately 15 in total, each corresponding to a distinct candidate vehicle page.", + "weight": 0.12 + }, + "R3": { + "requirement": "Each selected listing is individually verified for make/model, year, price, mileage, dealership location, and that it is still a live listing under budget and suitable for a family of four.", + "verification": "Grader can inspect the open listing pages and confirm the required details are visible and match the final recorded set, with no included vehicle over $50,000 or obviously unsuitable.", + "weight": 0.2 + }, + "R4": { + "requirement": "Any unclear seating or body-style cases are cross-checked on Edmunds before inclusion in the final shortlist.", + "verification": "Grader can see Edmunds pages or evidence of cross-checking for ambiguous vehicles, confirming family suitability or body style for those cases.", + "weight": 0.08 + }, + "R5": { + "requirement": "A CryptPad Sheets titled Minneapolis Family Cars is created.", + "verification": "Grader can see the CryptPad Sheets title displayed as Minneapolis Family Cars.", + "weight": 0.08 + }, + "R6": { + "requirement": "The spreadsheet contains the required columns: make 
and model, model year, price, mileage, dealership location, and link to listing.", + "verification": "Grader can see the header row in CryptPad Sheets with all required column names present.", + "weight": 0.08 + }, + "R7": { + "requirement": "The spreadsheet is populated with about 15 verified vehicle rows, each matching a currently open listing tab and including a working listing URL.", + "verification": "Grader can compare the rows in the sheet against the open tabs and confirm that approximately 15 entries are present with corresponding links and extracted details.", + "weight": 0.22 + }, + "R8": { + "requirement": "A short summary is produced covering the most frequent models, the best-value options, and whether SUVs or minivans dominate the final set, while leaving the sheet and listing tabs open.", + "verification": "Grader can review the final written summary and confirm the spreadsheet and vehicle tabs remain open for visual inspection.", + "weight": 0.1 + } + }, + "categories": [ + "Vehicles > Makes and Models", + "Ecommerce & Shopping > Ecommerce and Shopping - Other" + ], + "num_categories": 2 + }, + { + "task_id": "53419597c0c8897d49f1af65f5255bf265edcfbf", + "confirmed_task": "I’m a Canadian citizen living in Pittsburgh, PA, and my passport expires in about 3 months, so I’m trying to get everything sorted before a 2-week tourist trip to Japan. Could you start on the official Government of Canada site and find the passport renewal process for a Canadian living in the U.S., including the exact renewal form I’d need, the supporting documents, photo rules, whether I need a guarantor or references, the fee in CAD, how I’m supposed to submit it from the U.S., and the current processing time, because I need to know if this is realistic before I book anything. 
Once you have that, use the official Canadian embassy/consulate pages to figure out which Canadian mission is closest to Pittsburgh, Pennsylvania 15222 that handles passport services, and open the actual office page so I can see the address, passport service hours, and whether I need an appointment or have to use some booking request process; please leave that page open. After that, check Japan’s official Ministry of Foreign Affairs site to confirm whether a Canadian passport holder going to Japan for tourism for 2 weeks needs a visa, and note any conditions or exceptions that matter. Then go to the Government of Canada travel advisory page for Japan and tell me the current advisory level plus any highlighted health, safety, or entry notes, and keep that advisory page open in another tab so I can look at it myself. Finally, compare travel insurance options on PolicyAdvisor.com and Kanetix.ca for this situation: a Canadian citizen currently living in the U.S. who wants coverage connected to travel to Japan, and I mainly want to see whether either site shows plans that would actually work for someone based in the U.S. rather than Canada, so please capture provider names, medical emergency coverage, trip cancellation/interruption if shown, and any residency or eligibility restrictions. If either site has useful quote or results pages, open the most relevant options in separate tabs so I can compare them visually. 
At the end, give me a concise summary that ties all of this together and clearly points out any uncertainty, especially around insurance eligibility for a Canadian living in the U.S.", + "website": "https://www.google.com", + "reference_length": 8, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Correctly identifies the official Government of Canada passport renewal process for a Canadian living abroad in the U.S., including the correct form and main supporting requirements.", + "verification": "Grader confirms the final answer references the official Canada passport renewal abroad page and includes the renewal form plus required documents and procedural requirements visible on that page.", + "weight": 0.18 + }, + "R2": { + "requirement": "Accurately reports passport renewal fees, submission method from the U.S., and current processing times from the official Canadian source.", + "verification": "Grader checks that the reported fee, submission path, and processing time match the official Canada.ca content viewed during the task.", + "weight": 0.12 + }, + "R3": { + "requirement": "Finds the nearest relevant Canadian mission to Pittsburgh and captures its passport service hours and appointment or booking instructions, with the office page opened for visual proof.", + "verification": "Grader confirms the selected mission is plausibly nearest to Pittsburgh, and the open mission page visibly shows the office identity plus service hours and booking or appointment details.", + "weight": 0.14 + }, + "R4": { + "requirement": "Correctly determines whether a Canadian passport holder needs a tourist visa for a 2-week trip to Japan using the official MOFA site.", + "verification": "Grader checks that the answer matches the visa status and any relevant conditions shown on the MOFA page for Canadian travelers.", + "weight": 0.12 + }, + "R5": { + "requirement": "Accurately reports the current Canadian government travel advisory level for Japan and at least one 
notable advisory detail, with the advisory page left open.", + "verification": "Grader confirms the advisory level and detail match the visible travel.gc.ca Japan advisory page left open in a tab.", + "weight": 0.1 + }, + "R6": { + "requirement": "Collects meaningful travel insurance comparison information from PolicyAdvisor relevant to travel involving Japan and specifically notes any residency or eligibility constraints for someone living in the U.S.", + "verification": "Grader checks that at least one relevant PolicyAdvisor result or quote page was reached and that the summary includes provider/plan details plus eligibility or residency limitations visible on the site.", + "weight": 0.12 + }, + "R7": { + "requirement": "Collects meaningful travel insurance comparison information from Kanetix relevant to travel involving Japan and specifically notes any residency or eligibility constraints for someone living in the U.S.", + "verification": "Grader checks that at least one relevant Kanetix result or quote page was reached and that the summary includes provider/plan details plus visible eligibility or residency limitations.", + "weight": 0.12 + }, + "R8": { + "requirement": "Produces a coherent final synthesis that integrates all official findings and compares the two insurance sources while clearly flagging uncertainty or limitations.", + "verification": "Grader confirms the final response includes all required sections and a side-by-side insurance comparison with explicit notes about uncertainty, especially for a Canadian resident in the U.S.", + "weight": 0.1 + } + }, + "categories": [ + "Law and Government > Government", + "Law and Government > Immigration and Visas", + "Travel and Tourism > Travel and Tourism - Other" + ], + "num_categories": 3 + }, + { + "task_id": "8fcdeed84a0deb05342b07c26116792a5b6a6a3f", + "confirmed_task": "I’m relocating to Austin in about two months for a new job near the Domain, so I want help narrowing down apartments that would 
actually work for day-to-day life without blowing my budget. Please start on Zillow and search the Domain/North Austin area for 1-bedroom apartments under $1,800 a month, and filter for places that have both in-unit washer/dryer and a pool because those are my non-negotiables. Open the three best-looking Zillow listings in separate tabs so I can compare the photos, map placement, and amenity details, and pull out the apartment name, full address, rent, and a couple of listing highlights from each. Then do the same search on Apartments.com with the same budget and amenity filters, again opening the three strongest options in their own tabs so I can visually compare them and note the same details. Once you’ve got both sets, compare the six options, remove duplicates if the same property shows up on both sites, and tell me which apartments seem like the best overall fit based on price, amenities, and location near the Domain. After that, use CapMetro’s site and map tools to check whether each shortlisted apartment is near a MetroRail stop or has a practical bus connection into the Domain area, because I want to know whether I could commute without driving every day; if the map view helps, pull it up and keep the most useful transit page open. Then look up the neighborhoods for those apartments on Niche so I can get a feel for what living there would be like, especially safety ratings, walkability info if it’s shown, and whether there are grocery stores nearby for basic errands. Finally, go to the Texas Attorney General website and find the renters’ rights guidance that matters most before signing a lease in Texas, especially anything about deposits, repairs, fees, disclosures, and ending a lease, and leave that page open too so I can read it myself later. 
In the end, give me one clean apartment-hunting brief that combines the listing comparison, transit practicality to the Domain, neighborhood pros and cons, and a short lease-review checklist I can use when I start contacting properties.", + "website": "https://www.google.com", + "reference_length": 7, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Identify 3 Zillow apartment listings in the Domain/North Austin area that satisfy all stated filters: 1 bedroom, under $1,800/month, in-unit washer/dryer, and pool.", + "verification": "Grader can confirm Zillow search results and/or open listing tabs show the applied filters and that 3 qualifying listings were opened with visible listing pages.", + "weight": 0.16 + }, + "R2": { + "requirement": "Identify 3 Apartments.com apartment listings in the Domain/North Austin area that satisfy all stated filters: 1 bedroom, under $1,800/month, in-unit washer/dryer, and pool.", + "verification": "Grader can confirm Apartments.com search results and/or open listing tabs show the applied filters and that 3 qualifying listings were opened with visible listing pages.", + "weight": 0.16 + }, + "R3": { + "requirement": "Create a deduplicated comparison of the Zillow and Apartments.com options using listing-level details such as price, amenities, address/location, and overall fit.", + "verification": "Grader can verify the comparison references the listings gathered from both sites, removes overlaps where the same property appears twice, and ranks or summarizes the best overall options.", + "weight": 0.18 + }, + "R4": { + "requirement": "Assess transit access for each shortlisted apartment using CapMetro, specifically whether it is near a MetroRail station or has a practical bus route connection to the Domain area.", + "verification": "Grader can confirm CapMetro pages or map views were used and that each shortlisted apartment has associated station or route information tied to Domain access.", + "weight": 0.16 + }, + 
"R5": { + "requirement": "Provide neighborhood research from Niche for the shortlisted apartment areas, including safety ratings, walkability information if available, and nearby grocery store options.", + "verification": "Grader can verify Niche neighborhood pages were consulted and that each shortlisted area includes the requested neighborhood details.", + "weight": 0.14 + }, + "R6": { + "requirement": "Summarize key Texas renters’ rights guidance from the Texas Attorney General website relevant to lease review, including deposits, repairs, fees, disclosures, and termination-related issues.", + "verification": "Grader can confirm the Texas Attorney General page was opened and the summary reflects topics visibly covered on the official guidance page.", + "weight": 0.1 + }, + "R7": { + "requirement": "Deliver a final integrated apartment-hunting brief that combines apartment comparison, transit suitability, neighborhood findings, and renters’ rights guidance into a usable decision aid.", + "verification": "Grader can verify the final output includes all major sections, references the shortlisted apartments consistently, and synthesizes findings into a coherent recommendation or planning brief.", + "weight": 0.1 + } + }, + "categories": [ + "Business and Consumer Services > Real Estate" + ], + "num_categories": 1 + }, + { + "task_id": "6421b906fe97b3799960af31c77f20ff25f756b1", + "confirmed_task": "I’m putting together a quick graduate outreach brief for a student in San Diego and want a few very specific examples from different kinds of schools and programs. Please start on Lewis & Clark’s admissions site and find the admissions representative who covers San Diego, California, then open that actual regional contact page and capture the rep’s name, email, phone, and anything else listed, and leave the page open so I can visually confirm how they assign territories. 
Then go to Hunter College Silberman’s site and find the Fall 2026 application deadline for the MSW program, making sure you’re on the real admissions or application page where the date is shown. After that, use Indiana University Kokomo’s graduate programs page to list all of the graduate programs they show there, and keep that page open in its own tab so I can compare the breadth of options at a smaller campus. Since the student is especially interested in California opportunities, go to the UCSF clinical trials site and find at least two glioblastoma trials in Northern California that are currently recruiting, then open the actual trial pages in separate tabs so you can give me each trial’s recruiting status and study location and I can see that they’re still live. For a student-life example, use Google to get to the official Oral Roberts University page that identifies the student activities director, and pull the person’s name plus email and phone if available, making sure it comes from an ORU page rather than a directory aggregator. Finally, use Google to find the official EDF Energy graduate programme page that mentions whether new hires get salary reviews, and quote the exact wording from the page if it’s there. 
Please give me everything back as a compact sourced summary, but keep the key pages open in tabs so I can glance at the evidence.", + "website": "https://www.google.com", + "reference_length": 7, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Correctly identify the Lewis & Clark admissions representative covering San Diego, California, including the representative’s name and listed contact details.", + "verification": "Grader can confirm the open Lewis & Clark regional contact page shows San Diego, California assigned to the named representative and displays the extracted contact information.", + "weight": 0.17 + }, + "R2": { + "requirement": "Report the Fall 2026 application deadline for Hunter College Silberman’s MSW program from the official page.", + "verification": "Grader can confirm the open Hunter Silberman admissions/application page visibly shows the Fall 2026 MSW deadline date reported in the summary.", + "weight": 0.14 + }, + "R3": { + "requirement": "List the graduate programs shown on Indiana University Kokomo’s graduate programs page.", + "verification": "Grader can compare the returned program list against the visible program names on the open IU Kokomo graduate programs page.", + "weight": 0.14 + }, + "R4": { + "requirement": "Identify at least two glioblastoma clinical trials in Northern California that are currently recruiting, including each trial’s recruiting status and study location.", + "verification": "Grader can inspect the separate open UCSF trial tabs and verify that each named trial is glioblastoma-related, marked currently recruiting, and has the reported Northern California location.", + "weight": 0.22 + }, + "R5": { + "requirement": "Find the Oral Roberts University student activities director’s name and contact information from an official ORU page.", + "verification": "Grader can confirm the open ORU page names the student activities director and shows the extracted email and/or phone details.", + "weight": 0.13 + 
}, + "R6": { + "requirement": "Determine whether EDF Energy’s graduate programme includes salary reviews for new hires and provide the exact confirming wording from the official page.", + "verification": "Grader can confirm the open EDF Energy page contains the quoted wording and that the answer matches the page text.", + "weight": 0.12 + }, + "R7": { + "requirement": "Return a compact summary with sources covering all requested items.", + "verification": "Grader can verify the final response includes all six requested findings, each paired with a source reference or page title/link, in a concise summary format.", + "weight": 0.08 + } + }, + "categories": [ + "Science and Education > Universities and Colleges", + "Science and Education > Education" + ], + "num_categories": 2 + }, + { + "task_id": "295f11f4eebda80a7551944fd9b6f4e01db92666", + "confirmed_task": "I’m trying to build a fun gift shortlist for someone who’s into Sonic, anime figures, trading cards, and Pokémon, and I want it to feel grounded in real options I could actually buy. Please start on Amazon and search for “Sonic toys,” then sort the results from price low to high so we can see the cheapest ideas first, and grab the first three items shown with their prices as my budget baseline. After that, stay on Amazon and search for “Sonic the Hedgehog character toys,” ideally things like Tails, Knuckles, Shadow, or Amy, and open three promising options in separate tabs so I can compare them visually before you note their names and prices. Then switch over to Amazon and check for a Cars 2 Lightning McQueen figure and note its product name and item code, then look up a Cars Chick Hicks figurine so we have a reference listed price to compare - please leave those product pages open so I can look at the photos. 
After that, head to Costco and check gift-style product results for three specific items that would fit this person’s interests, such as a Pokémon item, a collectible-style item, or even something like a themed bench or display-worthy gift, and capture the product names and prices from the actual listings. Finally, use Pokellector to look at the newest Pokémon TCG sets and tell me the most recent set names shown there, making sure to open the page where the set images are visible so I can use that as a reference for current packs. In the end, give me a concise shortlist with prices or item codes where available, and explicitly compare the anime collectible figure option against the cheapest Sonic toy options you found first so I can tell whether the premium figure is worth it.", + "website": "https://www.google.com", + "reference_length": 7, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Amazon results for \"Sonic toys\" are sorted low to high and the first three visible items with prices are captured.", + "verification": "Grader can confirm the Amazon sort state is Price: Low to High and that three top visible result cards are listed with matching prices.", + "weight": 0.18 + }, + "R2": { + "requirement": "Three Amazon Sonic character toy options are identified from a separate character-focused search, with names and prices, and opened in separate tabs.", + "verification": "Grader can confirm a search for Sonic character toys, see three relevant product tabs open, and match the recorded names and prices to those product pages.", + "weight": 0.16 + }, + "R3": { + "requirement": "A Cars 2 Lightning McQueen figure is found on Amazon with the exact product name and item code recorded.", + "verification": "Grader can confirm the Amazon product page shows a Cars 2 Lightning McQueen figure and that the recorded product name and item code match the page.", + "weight": 0.17 + }, + "R4": { + "requirement": "A Cars Chick Hicks figurine is found on Amazon 
with the listed price recorded, and the product page is left open for visual reference.", + "verification": "Grader can confirm the Amazon product page shows a Cars Chick Hicks figurine and that the listed price matches the reported value.", + "weight": 0.12 + }, + "R5": { + "requirement": "Three Costco gift products relevant to the recipient’s interests are captured with product names and prices from actual listings.", + "verification": "Grader can confirm three Costco product listings and verify the reported names and prices against the visible listing pages.", + "weight": 0.14 + }, + "R6": { + "requirement": "The most recent Pokémon TCG sets shown on Pokellector are listed with their visible set illustration images referenced.", + "verification": "Grader can confirm the newest sets page on Pokellector and match the reported set names to the visible set tiles/images.", + "weight": 0.11 + }, + "R7": { + "requirement": "The final response is a concise themed gift shortlist that includes all collected items, prices or item codes where available, and an explicit comparison between the anime collectible figure option and the cheaper Sonic toy options.", + "verification": "Grader can confirm the final summary includes outputs from all prior steps and contains a direct price/value comparison between the collectible figure and the low-cost Sonic toys.", + "weight": 0.12 + } + }, + "categories": [ + "Ecommerce & Shopping > Ecommerce and Shopping - Other", + "Games > Games - Other" + ], + "num_categories": 2 + }, + { + "task_id": "e8b73f739732f8aeb3c473d00f6219af5b8dcdb7", + "confirmed_task": "We’re expecting our first baby and I want a really practical shortlist of the best hospitals in Los Angeles for giving birth, not just a generic list, so could you use Google to research LA-area hospitals that clearly offer maternity, obstetrics, and labor-and-delivery services and then narrow that down to the 10 strongest options for childbirth? 
I’d like you to lean on a mix of official hospital maternity pages and something like U.S. News or similar quality indicators so I can tell which places are actually recognized, not just nearby. As you find good candidates, open each hospital’s actual maternity or labor-and-delivery page in its own tab and keep those tabs open so I can visually review the pages and photos afterward. For each of the final 10 hospitals, please verify on the official site that labor and delivery is explicitly offered, note the hospital name and Los Angeles-area location, write a short plain-English description of the maternity program, and capture whether it shows up in rankings, recognitions, or other quality signals. Then create a CryptPad Document file titled exactly “Best LA Maternity Hospitals” and put all 10 hospitals in it with the maternity-page links included, because I want one place where I can compare everything. Once that looks complete, add a short comparison section explaining the main differences between the hospitals, any patterns you noticed in the rankings or maternity offerings, and which few seem like the strongest recommendations overall. 
Please leave the CryptPad Document open at the end along with the 10 hospital maternity tabs so I can click through them myself.", + "website": "https://www.google.com", + "reference_length": 7, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "A credible Los Angeles-area candidate pool of hospitals with maternity-related care is identified using Google and reputable sources.", + "verification": "Search history and opened results show hospital candidates sourced from Google results leading to official hospital pages or reputable healthcare sources.", + "weight": 0.12 + }, + "R2": { + "requirement": "Exactly 10 hospitals are selected as the strongest childbirth options based on service availability plus reputation, rankings, or recognitions.", + "verification": "The final CryptPad Document contains 10 distinct hospitals and the selection is supported by evidence gathered from search and ranking sources.", + "weight": 0.16 + }, + "R3": { + "requirement": "Each of the 10 selected hospitals has its official maternity, obstetrics, or labor-and-delivery page opened in a separate browser tab and left open.", + "verification": "Browser tab bar shows 10 hospital-domain tabs corresponding to the 10 hospitals listed in the CryptPad Document.", + "weight": 0.18 + }, + "R4": { + "requirement": "For every listed hospital, labor and delivery services are explicitly verified on the official hospital page.", + "verification": "The open hospital pages visibly mention labor and delivery, childbirth, or equivalent maternity inpatient delivery services for each listed hospital.", + "weight": 0.16 + }, + "R5": { + "requirement": "Ranking, recognition, or quality-indicator evidence is gathered for each of the 10 hospitals from U.S. News or similarly reputable sources.", + "verification": "The document entries include ranking or recognition notes for each hospital, and browsing shows U.S. 
News or equivalent reputable source pages used to support those notes.", + "weight": 0.14 + }, + "R6": { + "requirement": "A CryptPad Document titled exactly 'Best LA Maternity Hospitals' is created and includes for each hospital its name, location, maternity program description, ranking/recognition status, and official maternity page link.", + "verification": "The CryptPad Document title matches exactly and the body contains complete entries for all 10 hospitals with the required fields and links.", + "weight": 0.16 + }, + "R7": { + "requirement": "The CryptPad Document ends with a comparative summary highlighting key differences, patterns, and top recommendations, and the doc remains open alongside the hospital tabs.", + "verification": "The final section of the CryptPad Document contains a written comparison and recommendation summary, and the browser still shows the doc plus the hospital tabs open.", + "weight": 0.08 + } + }, + "categories": [ + "Health > Health - Other" + ], + "num_categories": 1 + }, + { + "task_id": "b21a86441ddca8186175bfffcaae0358ed66eec4", + "confirmed_task": "Can you help me plan a short LA trip from Pittsburgh and keep the key pages open so I can actually look at them afterward? Start on Google Flights, Kayak, or Expedia and search a round-trip from PIT to LAX for a simple 2-day trip, making sure the outbound gets into Los Angeles before 6:00 PM and the return lands back in Pittsburgh before midnight. I’d really prefer a nonstop if one exists, but if not, pick the option with the shortest total travel time that still feels like a good value, and open the top few flight options so I can compare before you choose the best one; then leave the selected flight page open in its own tab. 
After that, use Google Hotels or a hotel booking site to find me a hotel around Koreatown or West Hollywood with at least a 4.5 rating and a nightly rate under $350, and open the actual hotel listing so I can see the price, rating, photos, and map location, then keep that tab open too. Once the stay looks settled, go to Google Maps and pull up Griffith Observatory, The Getty Center, Santa Monica Pier, one well-rated Korean BBQ place in Koreatown, one coffee shop rated 4.5 or better, and somewhere with a great sunset view, because I want a realistic plan instead of a random list. Use the map routes to figure out a 2-day itinerary that avoids driving during LA rush hour as much as possible, especially around 7 to 10 AM and 4 to 7 PM, and try to keep travel between stops under about 40 minutes when that’s realistic, with a little buffer time between activities so the days don’t feel crammed. Open the important map views or route checks in separate tabs if needed so I can visually compare how far apart things are, and then give me a simple trip summary with the flight you chose, the hotel you recommend, and a 2-day schedule showing activity times plus the travel time between each stop.", + "website": "https://www.google.com", + "reference_length": 6, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "A round-trip PIT to LAX flight search is completed on a flight site, with top candidate options reviewed and one selected that arrives in Los Angeles before 6:00 PM outbound and returns to Pittsburgh before midnight.", + "verification": "Grader can see an open flight search/results or details page showing PIT, LAX, round-trip results, visible candidate options, and the selected itinerary meeting the timing constraints.", + "weight": 0.22 + }, + "R2": { + "requirement": "The chosen flight reasonably reflects the stated preference hierarchy: nonstop if available, otherwise the shortest total travel time, while still balancing price and schedule.", + 
"verification": "Visible comparison among top few flight options supports why the selected itinerary is a reasonable best choice based on stops, duration, and price.", + "weight": 0.13 + }, + "R3": { + "requirement": "A hotel in Koreatown or West Hollywood is identified with rating at least 4.5 and nightly price under $350, and the actual listing page is opened.", + "verification": "Open hotel listing visibly shows neighborhood or map placement, nightly price under $350, rating of 4.5 or higher, and listing details/photos.", + "weight": 0.2 + }, + "R4": { + "requirement": "Google Maps is used to identify all required stop types: Griffith Observatory, The Getty Center, Santa Monica Pier, a Korean BBQ spot in Koreatown, a coffee shop rated at least 4.5, and a sunset-view location.", + "verification": "Open Google Maps place pages or map tabs show each required destination category and allow visual confirmation of their locations.", + "weight": 0.16 + }, + "R5": { + "requirement": "The itinerary routing is realistic, uses map route checks, avoids LA rush hour driving where practical, keeps travel between stops under about 40 minutes when possible, and includes buffer time.", + "verification": "Open route/map tabs and the written plan show travel-time checks, sensible sequencing, reduced rush-hour exposure, and spacing between activities.", + "weight": 0.17 + }, + "R6": { + "requirement": "A final travel plan is produced with the selected flight, hotel recommendation, and a 2-day itinerary including activity timing and travel times, while keeping the key browser pages open.", + "verification": "Final summary includes all required trip components, and the flight tab, hotel tab, and relevant map/route tabs remain open for visual review.", + "weight": 0.12 + } + }, + "categories": [ + "Travel and Tourism > Air Travel", + "Travel and Tourism > Accommodation and Hotels" + ], + "num_categories": 2 + }, + { + "task_id": "72875601345415ba90a3c31bd93c25bb5ea54bb2", + 
"confirmed_task": "Can you help me plan a Christmas trip to San Francisco from Pittsburgh and do it in the browser so I can actually look at the options with you? Start on Google Flights and search round-trip flights from PIT to SFO leaving December 23 and coming back December 26. I’d really prefer a nonstop if one exists, but if not, pick the best option with a short layover and reasonable total travel time, and please favor something that gets me into San Francisco in the afternoon on December 23 so I still have that evening free. Open the best few flight options and leave the results or selected flight page open so I can review it. After that, use Google Hotels or the hotel results on Google to find a place in a central area, ideally North Beach, Nob Hill, or Union Square, with at least a 4.5-star rating and under $400 per night, because I want something nice but still realistic for Christmas week. When you find the best fit, open the actual hotel page so I can see the photos, nightly price, and map location, and keep that tab open too. Once those are set, switch to Google Maps and map out the trip around the city using the hotel as the base. I want the 3-day plan for December 23 through 25 to include the Golden Gate Bridge, Alcatraz Island, the Ferry Building, one Michelin-recommended restaurant, one well-known bakery or coffee shop, and at least one scenic viewpoint. Try to group things so I’m not zigzagging all over San Francisco, keep most travel legs under about 30 minutes if possible, leave some buffer time between activities, and include at least one segment by public transit instead of driving. Please open the relevant places and route views in Maps so I can visually see how they connect, and then put everything into a CryptPad Document with the flight you’d choose, the hotel you recommend, and a day-by-day itinerary with times and travel methods. 
Leave the flight page, hotel page, and key map tabs open for me as proof while you finish the report.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "A Google Flights search for PIT to SFO on December 23 to December 26 is completed, multiple viable options are reviewed, and one recommended itinerary is selected with preference for nonstop or short-layover service and afternoon arrival on December 23.", + "verification": "Grader can see Google Flights results or a selected itinerary page showing PIT, SFO, the correct dates, and visible flight options or chosen flight details left open in the browser.", + "weight": 0.24 + }, + "R2": { + "requirement": "A hotel in San Francisco is identified and opened from Google hotel results, meeting the constraints of central location, ideally North Beach, Nob Hill, or Union Square, at least 4.5 stars, and under $400 per night.", + "verification": "Grader can see an open hotel page with visible hotel name, star rating, nightly price, photos, and map/location information consistent with the requested neighborhoods or central area.", + "weight": 0.2 + }, + "R3": { + "requirement": "Google Maps is used to identify all required San Francisco trip components: Golden Gate Bridge, Alcatraz Island, Ferry Building, one Michelin-recommended restaurant, one well-known bakery or coffee shop, and at least one scenic viewpoint, anchored to the selected hotel.", + "verification": "Grader can see Google Maps place pages, pins, or tabs showing the hotel and all required destinations, with enough visible map context to confirm they were actually opened and examined.", + "weight": 0.18 + }, + "R4": { + "requirement": "A coherent 3-day itinerary for December 23 through 25 is planned using map travel times, includes all required stops, avoids excessive cross-city backtracking, keeps most legs under about 30 minutes where feasible, includes some buffer time, and uses 
public transit for at least one segment.", + "verification": "Grader can inspect open Google Maps routes or route tabs and the resulting plan to confirm travel methods, approximate times, and logical geographic grouping across the three days.", + "weight": 0.24 + }, + "R5": { + "requirement": "A CryptPad Document is created that clearly summarizes the selected flight, recommended hotel, and full 3-day itinerary with activities, timing, and travel methods, while the key browser resources remain open for review.", + "verification": "Grader can see a CryptPad Document containing the trip summary and can also confirm that the flight page, hotel page, and at least one relevant map route or place tab remain open.", + "weight": 0.14 + } + }, + "categories": [ + "Travel and Tourism > Air Travel", + "Travel and Tourism > Accommodation and Hotels", + "Community and Society > Holidays and Seasonal Events" + ], + "num_categories": 3 + }, + { + "task_id": "256342f13c0a03e080f92ee073153fe33a6881c0", + "confirmed_task": "I’m trying to get a realistic shortlist of the best knee surgeons in New York City because I may need ACL reconstruction or meniscus repair, and I want something I can actually look through myself afterward. Please start in Google and research orthopedic surgeons in NYC who are specifically known for knee ligament reconstruction, ACL surgery, and meniscus repair, then create a spreadsheet called Top ACL Surgeons NYC to keep everything organized. As you find strong candidates, open each surgeon’s official hospital or practice profile page in its own tab so I can compare them side by side, and only keep surgeons whose actual profile page clearly says they perform ACL reconstruction, meniscus repair, knee ligament reconstruction, or very closely related sports knee procedures. 
For each surgeon you keep, put their full name, hospital or practice affiliation, specialty focus, a short note confirming where ACL reconstruction or meniscus repair is mentioned, and the direct profile link into the spreadsheet. Please keep going until there are exactly 10 verified NYC surgeons in the sheet, and make sure every person listed still has their real profile page open in a tab so I can inspect the pages and see the affiliations myself. Once the list is complete, look across the 10 entries and add a short summary of which hospitals, orthopedic groups, or medical centers show up most often, because I want to know which institutions seem to dominate this specialty in the city.", + "website": "https://www.google.com", + "reference_length": 6, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "A spreadsheet titled 'Top ACL Surgeons NYC' is created and used as the working document.", + "verification": "Grader can see a spreadsheet with the exact title open and populated during the task.", + "weight": 0.12 + }, + "R2": { + "requirement": "Official hospital or practice profile pages for candidate NYC surgeons are opened in separate browser tabs.", + "verification": "Grader can see multiple open tabs corresponding to official surgeon profile pages rather than generic search results or directory summaries.", + "weight": 0.16 + }, + "R3": { + "requirement": "Each included surgeon is verified on the actual profile page as performing ACL reconstruction, meniscus repair, knee ligament reconstruction, or a clearly equivalent sports knee procedure.", + "verification": "On each selected surgeon tab, the grader can locate visible text or procedure listings that substantiate the inclusion criteria.", + "weight": 0.24 + }, + "R4": { + "requirement": "For every selected surgeon, the spreadsheet records full name, hospital or practice affiliation, specialty focus, confirmation note for ACL reconstruction or meniscus repair, and the direct profile link.", + 
"verification": "Grader can inspect the spreadsheet rows and confirm all required fields are present for each surgeon and correspond to the open profile tabs.", + "weight": 0.2 + }, + "R5": { + "requirement": "Exactly 10 New York City surgeons are included, and each spreadsheet entry corresponds to an official profile page that remains open in a tab.", + "verification": "Grader can count exactly 10 completed spreadsheet entries and match each one to an open official profile tab for that surgeon.", + "weight": 0.18 + }, + "R6": { + "requirement": "The spreadsheet includes a short summary of which hospitals, orthopedic groups, or medical centers appear most frequently among the 10 surgeons.", + "verification": "Grader can see a written summary in the spreadsheet that synthesizes affiliation frequency across the final 10 entries.", + "weight": 0.1 + } + }, + "categories": [ + "Health > Health - Other", + "Health > Medicine" + ], + "num_categories": 2 + }, + { + "task_id": "d3250da48cc778a40d11683a56fdfca962d6fe19", + "confirmed_task": "I’m putting together a coordinated holiday gift bundle for one family and want it to feel like everything belongs together instead of looking random. On Kohl’s, please find two gift ideas for siblings that stay under $25 each and are actually available for pickup in store today as a backup to shipping — one that would make sense for a 12-year-old girl and one for an 11-year-old boy — and open each product in its own tab so I can compare the vibe and price side by side. Once you’ve got those, go to Etsy and open a personalized family Christmas ornament listing that looks giftable, then tell me the shop name and exactly what customization choices the listing offers, because I’d like to add something with the family name and need to know what I can personalize. 
After that, use Target to find a shatterproof gold-and-white ornament set that visually matches the personalized ornament and would work as filler in the bundle, and open the actual product page so I can see the photos and any color or finish options shown. Then head to Walmart and browse for two boys outfit gift options for a younger boy, and pick the one that best matches the overall style and price level of the other gifts so the bundle feels consistent; please keep the better outfit page open for me. Finally, use Google to find one highly rated hot buttered rum recipe from a recognizable recipe site, open the actual recipe page, and give me the recipe name, source, ingredient list, and basic preparation steps so I can include a cozy holiday extra with the package idea. At the end, send me a concise summary with all the selected items, prices, pickup-today details for Kohl’s, the Etsy customization options, the Target ornament details, the two Walmart outfit options with your preferred pick, and the recipe source and steps.", + "website": "https://www.google.com", + "reference_length": 6, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Identify two Kohl’s gift ideas for siblings, one for a 12-year-old girl and one for an 11-year-old boy, each under $25, and confirm pickup in store today on the product pages.", + "verification": "Grader can confirm two separate Kohl’s product pages are open or were visited, each showing a product title, price below $25, and visible pickup-today availability information.", + "weight": 0.24 + }, + "R2": { + "requirement": "Review one Etsy personalized family Christmas ornament listing and report the shop name plus the visible customization or variation options offered.", + "verification": "Grader can confirm an Etsy listing page was opened and that the response includes the shop name and the customization fields or variation choices shown on that listing.", + "weight": 0.2 + }, + "R3": { + "requirement": "Find one 
Target shatterproof gold-and-white ornament set and report the product name along with any listed color or finish options visible on the product page.", + "verification": "Grader can confirm a Target product page for a shatterproof gold-and-white ornament set was opened and that the response matches the product title and visible option details on the page.", + "weight": 0.18 + }, + "R4": { + "requirement": "Provide two Walmart boys outfit gift options with product names and prices, and identify which one best matches the style and price level of the other selected gifts.", + "verification": "Grader can confirm two Walmart product pages or listings were reviewed and that the response includes two outfit names, their prices, and a clearly stated preferred choice.", + "weight": 0.18 + }, + "R5": { + "requirement": "Find one highly rated hot buttered rum recipe via Google and include the recipe name, source, ingredient list, and basic preparation steps from the actual recipe page.", + "verification": "Grader can confirm a Google results page led to a recipe page and that the response includes the recipe title, source site, ingredients, and preparation summary.", + "weight": 0.12 + }, + "R6": { + "requirement": "Return a concise final summary that includes all selected products, prices, Kohl’s pickup-today details, Etsy customization information, Target ornament details, the two Walmart outfit options with the preferred pick, and the recipe source and steps.", + "verification": "Grader can confirm the final response synthesizes outputs from all prior steps into one coherent holiday bundle summary with no major omissions.", + "weight": 0.08 + } + }, + "categories": [ + "Ecommerce & Shopping > Ecommerce and Shopping - Other", + "Community and Society > Holidays and Seasonal Events", + "Lifestyle > Gifts and Flowers" + ], + "num_categories": 3 + }, + { + "task_id": "4b9eb54dde6c129b27ccb642ef24fb060e736913", + "confirmed_task": "I’m trying to get more comfortable cooking 
at home without buying a bunch of gear, so on Amazon please compare a few 3-quart electric multicookers and pick the best one for me if my main use is making rice and steaming vegetables in a small kitchen. I’d like you to open the most promising options in separate tabs so I can visually compare the listings, and then leave the chosen product page open with the title, price, capacity, and the features that make it best for simple beginner meals. Once you’ve picked that cooker, use Google to find one highly rated hot buttered rum recipe from a real recipe site and pull out the ingredients and basic steps, mainly as a simple example of the kind of recipe format I could actually follow. Since I’m still learning the basics, also go to Reddit and find a beginner-friendly discussion about how long to boil chicken breast, then give me the time range people recommend. After that, use the USDA DRI Calculator for a sample adult profile — age 30, 5 feet 6 inches, 150 pounds, sedentary activity level — and record the estimated daily carb, protein, and fat targets so I have a realistic nutrition reference point. Then, using Google, find the City of Milwaukee food license requirements and Wisconsin DATCP guidance for starting a small charcuterie or food business, and summarize the key licensing steps I’d need to look into if I ever wanted to turn basic home cooking into a small side business. 
Please keep the USDA results page and the Milwaukee licensing source page open so I can look at them myself, and finish with a short summary tying together the multicooker choice, the macro targets, and whether this setup seems like a practical beginner routine.", + "website": "https://www.google.com", + "reference_length": 6, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Compares multiple Amazon 3-quart electric multicookers and selects one best option for rice and steaming vegetables, including product title, visible price, capacity, and key features from the listing.", + "verification": "Grader can confirm multiple Amazon product tabs were opened and that the final chosen product page remains open showing the listing title, price, and 3-quart capacity.", + "weight": 0.24 + }, + "R2": { + "requirement": "Provides one highly rated hot buttered rum recipe found via Google, including a clear ingredient list and basic preparation steps.", + "verification": "Grader can confirm a Google results path to a recipe site and that the returned summary includes ingredients and basic steps matching the recipe page.", + "weight": 0.16 + }, + "R3": { + "requirement": "Reports a Reddit-sourced beginner recommendation for boiling chicken breast and includes the time range given in the discussion.", + "verification": "Grader can confirm a Reddit thread was opened and that the reported boiling time range is visible in the discussion content.", + "weight": 0.14 + }, + "R4": { + "requirement": "Uses the USDA DRI Calculator with the specified sample adult profile and records estimated daily carbohydrate, protein, and fat targets.", + "verification": "Grader can confirm the USDA DRI Calculator results page is open and shows macro targets for the entered profile inputs.", + "weight": 0.2 + }, + "R5": { + "requirement": "Summarizes the key licensing steps for starting a charcuterie or food business in Milwaukee using City of Milwaukee food license requirements and 
Wisconsin DATCP guidance.", + "verification": "Grader can confirm Google was used to reach the City of Milwaukee licensing source and Wisconsin DATCP guidance, and that the summary mentions both sources' key requirements.", + "weight": 0.16 + }, + "R6": { + "requirement": "Briefly explains how the chosen multicooker and USDA macro targets could fit into a balanced beginner cooking routine.", + "verification": "Grader can confirm the final response explicitly connects the selected cooker's use cases with the reported carb, protein, and fat targets in a practical beginner-oriented summary.", + "weight": 0.1 + } + }, + "categories": [ + "Ecommerce & Shopping > Ecommerce and Shopping - Other", + "Home and Garden > Home and Garden - Other", + "Food and Drink > Cooking and Recipes" + ], + "num_categories": 3 + }, + { + "task_id": "78ddd1aab59eebace5f6f523d90012aa6c871c54", + "confirmed_task": "I’m trying to decide whether renting at The Ophelia in Pittsburgh makes more sense than buying nearby, so could you help me look at both sides in the browser? Start on apartments.com and open The Ophelia’s actual floor plan or availability page, then note at least two floor plans that are currently shown as available, including each plan’s name and the bedroom/bathroom setup, and leave that page open so I can look at the layouts myself. Since Pittsburgh winters are rough and I’m also thinking about car-related moving costs, go to WeatherTech and use their vehicle selector for a 2020 Toyota Highlander to find the floor mat and cargo liner options that fit, then open the cargo liner product page and keep that tab open as a reference. After that, use Google to find one LED emblem option for a 2023 Honda Civic, and click through to the actual product page so you can capture the product name and price rather than just a search snippet. 
Once you have those cost references, go to Zillow and search around the same Pittsburgh area for homes currently for sale that could realistically compete with renting there, then open five live listings in separate tabs and capture each property’s address and listing URL so I can compare them side by side. In the end, give me a concise comparison that pulls together the two apartment floor plans, the WeatherTech cargo liner reference, the LED emblem option, and the five Zillow listings so I can get a real renting-versus-buying snapshot.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Report at least two currently available floor plans from The Ophelia website, including each plan’s name and bedroom/bathroom details.", + "verification": "Grader can confirm the apartments.com floor plans or availability page is open and shows the named plans with matching bed/bath information.", + "weight": 0.22 + }, + "R2": { + "requirement": "Find WeatherTech floor mat and cargo liner options that fit a 2020 Toyota Highlander and provide the cargo liner product page reference.", + "verification": "Grader can confirm the WeatherTech tab shows 2020 Toyota Highlander fitment and an open cargo liner product page.", + "weight": 0.18 + }, + "R3": { + "requirement": "Use Google to find one LED emblem option for a 2023 Honda Civic and report the product name and price from the actual product page.", + "verification": "Grader can confirm a Google search was performed and the clicked product page displays the reported item name and price.", + "weight": 0.14 + }, + "R4": { + "requirement": "Find five currently for-sale Zillow home listings in the Pittsburgh area that could compete with renting there, and include each listing’s address and URL.", + "verification": "Grader can confirm five zillow.com listing tabs are open or accessible and each corresponds to a live property page with the reported address.", + 
"weight": 0.31 + }, + "R5": { + "requirement": "Provide a concise final comparison covering the apartment floor plans, the WeatherTech cargo liner reference, the LED emblem option, and the five Zillow listings to support a rent-versus-buy decision.", + "verification": "Grader can confirm the final response includes all required categories and accurately summarizes the information gathered from the open pages.", + "weight": 0.15 + } + }, + "categories": [ + "Business and Consumer Services > Real Estate" + ], + "num_categories": 1 + }, + { + "task_id": "795bfe117e0f58e49ca37ae8e453a507859a2a2b", + "confirmed_task": "I’m trying to piece together a really cheap trip to London for two, so can you help me build it in a practical order and keep the actual pages open where it matters? Start on Booking.com and search London for 2 adults staying this December, then find me at least one hotel that comes in under £100 total for the 2-night stay, because that ultra-budget option is going to set the tone for everything else. Open the actual property page in its own tab so I can see the photos and location, and note the hotel name, stay dates, total displayed price, and link. Once you’ve got that baseline, stay on Booking.com and look up NOX Hotel for a 1-night stay for 2 adults on any date in 2026, just so I can tell whether my bargain London option is unusually cheap or more normal for the city; open the NOX listing page too and record the date and total displayed price you find. After that, use Google to look for at least two hotels near Washington, DC Union Station that show 4-star-or-higher guest ratings and nightly prices under $200, because I may want a backup benchmark for city lodging in another trip later; please open each hotel result in its own tab or go to the actual hotel posting page so I can verify they’re real options and still look live. 
Then, because I want the whole trip to stay low-cost overall, go to Amazon and shortlist three mid-to-low-priced headphones or earbuds with active noise cancellation for travel that fit the same budget mindset, and open each product page in a separate tab so I can compare them side by side. In the end, send me one clean summary with the hotel names, prices, dates, ratings where relevant, key headphone features, and links, and leave the Booking.com property tabs and the Amazon product tabs open for me.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Identify at least one Booking.com London hotel for 2 adults in December with a displayed total price under £100 for the 2-night stay, and capture the hotel name, stay dates, total price, and link.", + "verification": "Grader can confirm the Booking.com search results or property page shows London, 2 adults December and a total under £100, with the property tab open.", + "weight": 0.28 + }, + "R2": { + "requirement": "Find a Booking.com NOX Hotel result for a 1-night stay in 2026 for 2 adults and record the specific date, displayed total price, and link.", + "verification": "Grader can confirm the NOX Hotel listing or property page on Booking.com shows a 1-night stay for 2 adults with a visible total price and the page left open.", + "weight": 0.2 + }, + "R3": { + "requirement": "Find at least two hotels near Washington, DC Union Station via Google that each show a guest rating of 4 stars or higher and a nightly price under $200, and record their names, ratings, prices, and links.", + "verification": "Grader can confirm on Google results, hotel panels, or linked hotel pages that two qualifying hotels near Union Station display ratings of 4.0+ and nightly prices below $200, with tabs open for the chosen options.", + "weight": 0.22 + }, + "R4": { + "requirement": "Shortlist at least three Amazon headphones or earbuds with active noise 
cancellation, including each product name, current price, key features, and link.", + "verification": "Grader can confirm three Amazon product pages are open and each page visibly indicates ANC or active noise cancellation, along with product name and current price.", + "weight": 0.18 + }, + "R5": { + "requirement": "Return one consolidated summary covering the London budget hotel, the NOX Hotel comparison, the two Washington, DC Union Station hotel benchmarks, and the three Amazon UK headphone options, with all requested names, dates, prices, ratings where relevant, key features, and links.", + "verification": "Grader can compare the final response against the collected browser evidence from the open Booking.com, Google, and Amazon UK tabs and verify all requested fields are included.", + "weight": 0.12 + } + }, + "categories": [ + "Travel and Tourism > Air Travel", + "Travel and Tourism > Accommodation and Hotels" + ], + "num_categories": 2 + }, + { + "task_id": "ec290c1a334e976ffa3ba68b71ac6c09c2eb82ba", + "confirmed_task": "I’m in the UK and I’m worried my tenancy deposit may not have been handled properly when I took over an existing tenancy, so could you start on Citizens Advice and find the guidance that explains whether the landlord or agent still had to protect the deposit in that kind of handover situation, and also how I’m supposed to check whether it’s protected and what I can do if it wasn’t done correctly. Please open the actual Citizens Advice page and leave it open so I can look at the wording myself. Once you’ve got that, use Google to find a solid explanation of what a rent ledger is and how to make one, because I want to document every rent payment and deposit-related amount clearly if I end up disputing this; tailor that summary to my situation by spelling out exactly which columns or entries I should include for a UK tenancy deposit issue, and open the most useful source in its own tab so I can compare it with the Citizens Advice guidance. 
After that, still using Google, find a practical personal-finance discussion about getting through to the next paycheck and pull out at least three realistic short-term ways to cover expenses while I sort this out, since I may need a bit of breathing room without making things worse. Then go to MoneySavingExpert’s Cheap Mobile Finder and filter for SIM-only deals with unlimited minutes, unlimited texts, and at least 10GB of data, and list the three cheapest options in ascending price order so I can see whether switching my phone plan would help. Please keep the filtered results page open too, and give me one combined summary that brings together the deposit guidance, the rent-ledger setup advice, the short-term cash-flow ideas, and the mobile deal recommendations.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Correctly summarize Citizens Advice guidance on whether a deposit should be protected when taking over an existing tenancy, including how to check protection status and what the tenant can do if the rules were not followed.", + "verification": "Grader confirms the answer reflects the content of the open Citizens Advice page and includes all three elements: protection rule, how to check, and next actions if non-compliant.", + "weight": 0.3 + }, + "R2": { + "requirement": "Explain what a rent ledger is and how to create one, including the key fields or entries to record, and tailor the explanation to evidence useful in a UK tenancy deposit dispute. 
Make sure a useful resource is pulled up.", + "verification": "Grader confirms the answer defines a rent ledger, describes how to set one up, and lists dispute-relevant fields such as dates, amounts due, amounts paid, payment method, arrears or balance, deposit-related notes, and supporting references from the source tab.", + "weight": 0.25 + }, + "R3": { + "requirement": "Provide at least three practical, actionable ways to cover expenses until the next paycheck, based on a personal finance discussion found via Google.", + "verification": "Grader confirms there are at least three distinct short-term suggestions and that they are framed as realistic actions drawn from a discussion source rather than generic filler.", + "weight": 0.15 + }, + "R4": { + "requirement": "Use MoneySavingExpert’s Cheap Mobile Finder to identify the three cheapest SIM-only deals in ascending price order after filtering for unlimited minutes, unlimited texts, and at least 10GB of data.", + "verification": "Grader confirms the filtered MoneySavingExpert results page is open and that the listed deals match the visible filtered results and are ordered from cheapest to most expensive.", + "weight": 0.2 + }, + "R5": { + "requirement": "Present the final answer as one combined, user-oriented summary that integrates the deposit guidance, tailored rent ledger advice, short-term expense suggestions, and mobile deal recommendations.", + "verification": "Grader confirms the final response is consolidated, coherent, and includes all four required sections in a way that is clearly tailored to the user’s situation.", + "weight": 0.1 + } + }, + "categories": [ + "Law and Government > Legal", + "Business and Consumer Services > Real Estate" + ], + "num_categories": 2 + }, + { + "task_id": "b183c34b5697881596a40d77bff64a5e013dc725", + "confirmed_task": "I’m trying to make a budget-conscious Apple purchase and want a real browser-based comparison, not just a generic summary. 
Please start on Apple’s site and open the current iPad Pro page and iPad Air page in separate tabs so I can compare them side by side, then pull out at least three concrete differences like the chip, display, storage options, camera setup, accessory support, or starting price, and tell me whether the Pro seems worth considering for someone mainly trying to save money. If the Air looks like the more practical route, switch over to Best Buy and look up the 11-inch iPad (A16, Wi‑Fi, 128GB) listings in pink and blue, and also check the blue open-box options in good and excellent condition, because I want to know the cheapest acceptable way to buy one right now; open the relevant product pages so you can verify the color and condition details on the actual listings, and leave the cheapest one open. After that, go back to Apple and check the current MacBook Pro lineup so I have a laptop price ceiling, and identify the lowest starting-price MacBook Pro model Apple is selling right now. Then head to Amazon, search for “iphone 17 pro,” and look through the live results for two listings that are obviously actual phones, because I want to avoid junk search results while comparison shopping; open those result pages too so the titles and prices are clearly visible. 
In the end, give me a short recommendation that connects the iPad Pro vs Air comparison to the Best Buy iPad choice and tells me exactly how much cheaper that iPad option is than the cheapest MacBook Pro, while keeping the iPhone price in mind.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Provide an Apple iPad Pro vs iPad Air comparison with at least three concrete spec differences and a brief judgment about whether the Pro is worth considering for a budget-conscious buyer.", + "verification": "Grader confirms Apple iPad Pro and iPad Air pages were opened in separate tabs or otherwise directly visited, and the final response includes at least three specific differences grounded in those pages plus a value judgment.", + "weight": 0.28 + }, + "R2": { + "requirement": "List the current Best Buy prices for the 11-inch iPad (A16, Wi‑Fi, 128GB) in pink, blue, and blue open-box good and excellent conditions, and identify the cheapest acceptable option.", + "verification": "Grader confirms the relevant Best Buy listing pages were opened and that the response includes prices for pink, blue, blue open-box good, and blue open-box excellent, with one option explicitly named as the cheapest acceptable route.", + "weight": 0.3 + }, + "R3": { + "requirement": "Identify the lowest starting-price MacBook Pro model currently listed on Apple’s site, including the model name and starting price.", + "verification": "Grader confirms Apple’s MacBook Pro lineup page was visited and the response names the lowest-priced MacBook Pro configuration with its starting price.", + "weight": 0.16 + }, + "R4": { + "requirement": "Find two Amazon search results for “iphone 17 pro” that are clearly phone listings, and provide each title and price.", + "verification": "Grader confirms Amazon search results and/or product pages were opened and that the response includes two listings that are clearly actual phone 
listings with visible titles and prices.", + "weight": 0.12 + }, + "R5": { + "requirement": "Return a short final recommendation that ties the Best Buy iPad choice back to the earlier iPad Pro vs iPad Air comparison and states how far below the cheapest MacBook Pro the chosen iPad option is.", + "verification": "Grader confirms the final summary explicitly references the earlier Air vs Pro conclusion, names the recommended Best Buy iPad option, and calculates the price gap versus the cheapest MacBook Pro.", + "weight": 0.14 + } + }, + "categories": [ + "Computers Electronics and Technology > Consumer Electronics", + "Ecommerce & Shopping > Price Comparison" + ], + "num_categories": 2 + }, + { + "task_id": "3868f9b52e96067b4f55834a3b110e1228b48e65", + "confirmed_task": "I’m thinking about moving into post-production work in Los Angeles and want a realistic sense of the entry path, especially for media/entertainment IT-engineer-type roles. Please start on Google and look up what employers in media and entertainment usually expect for IT engineers, then pull together at least three recurring requirements you keep seeing and at least two concrete training or certification routes, because I want to know whether this is something I could realistically train into. Once you have that baseline, go to the Motion Picture Editors Guild site and find the actual West Coast or Los Angeles path for joining IATSE Local 700, including the steps, eligibility, and anything about applications, rosters, fees, or required experience, so I can compare the union route with the broader training path. If there are separate pages that matter, open the key Local 700 pages in their own tabs and leave the most useful one open so I can look at the exact wording myself. 
After that, go back to Google and search for current Los Angeles or broader West Coast jobs that actually match the skills and requirements you found earlier, and open at least two relevant live job postings in separate tabs so I can visually compare them; for each one, note the title, company, location, how it connects to the earlier requirements, and whether the posting says anything about visa sponsorship or work authorization. To round it out, use Google one more time to build me a short dated timeline of Rosie O’Donnell’s feud with Donald Trump with at least three dated moments from public sources, just as a quick check of the kind of entertainment-news research context that might overlap with this world. Please give me everything as a concise career brief with clear sections for training paths, Local 700 union entry, relevant current job examples, and the short timeline, and mention which pages you left open for me to review.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Find and summarize at least three typical job requirements for IT engineers in the media and entertainment industry and at least two concrete training or certification options.", + "verification": "Final brief includes a training paths section with 3+ recurring requirements and 2+ named training/certification routes sourced from Google results or opened pages.", + "weight": 0.28 + }, + "R2": { + "requirement": "Use the Motion Picture Editors Guild site to summarize the West Coast/Los Angeles joining path for IATSE Local 700, including steps, eligibility, and application/joining details.", + "verification": "Final brief includes a union requirements section tied to editorsguild.com, and the browser shows a relevant Local 700 page open with visible guild-specific joining information.", + "weight": 0.27 + }, + "R3": { + "requirement": "Identify at least two current live Los Angeles or West Coast job postings 
relevant to the earlier requirements and note whether visa sponsorship or work authorization is mentioned.", + "verification": "Browser has at least two relevant job postings open in separate tabs, and the final brief lists title, company, location, relevance to prior requirements, and sponsorship/work authorization notes for each.", + "weight": 0.25 + }, + "R4": { + "requirement": "Provide a short Rosie O’Donnell vs. Donald Trump timeline with at least three dated milestones from public sources.", + "verification": "Final brief includes a timeline section with 3+ dated events and enough detail to distinguish each milestone.", + "weight": 0.1 + }, + "R5": { + "requirement": "Return everything as a concise career brief with clearly labeled sections and mention which pages were left open for review.", + "verification": "Response is organized into the four requested sections and explicitly references the guild page and job-posting tabs left open.", + "weight": 0.1 + } + }, + "categories": [ + "Jobs and Career > Jobs and Employment", + "Arts & Entertainment > Arts and Entertainment - Other" + ], + "num_categories": 2 + }, + { + "task_id": "041a4bee5d80a28567dc65bc2e41dd198672bfe2", + "confirmed_task": "I’m trying to plan a birthday weekend in New York for my significant other in mid-May, and I want to stay at an Arlo property if the pricing works out. On arlohotels.com, please check the NYC locations for every Friday-to-Sunday weekend in May and compare the rates you can actually see for the Arlo branches in New York, because I want to figure out which weekend is cheapest overall. I’d really prefer Arlo Williamsburg in Brooklyn if it’s no more than $30 above the cheapest NYC Arlo option for that same weekend, so please make that comparison clearly and use that preference when you decide what to recommend. Once you’ve found the best weekend and hotel combination, keep the final hotel page open so I can look at the room details myself. 
After that, use Ticketmaster to see what sporting events are happening in NYC for each May weekend, and only include options where tickets are available at $400 or less per person since I’d be buying 2 tickets and don’t want to blow the budget. Open the actual event pages, not just search results, so you can verify the listings are live and capture the event name, date, venue, and visible ticket price, and leave a couple of the best event tabs open so I can compare them on screen. In the end, give me a short trip-planning summary with the Arlo hotel comparison, whether Brooklyn stayed within my $30 preference window, the cheapest May weekend, your recommended hotel choice, and the sporting-event options for every May weekend that fit the budget.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Identify the NYC Arlo hotel branches searched and record visible Friday-to-Sunday weekend rates in May for each branch across the weekends checked.", + "verification": "Grader can confirm multiple NYC Arlo property search/result pages were visited on arlohotels.com and that comparable May weekend rates were captured from visible booking results.", + "weight": 0.3 + }, + "R2": { + "requirement": "Explicitly compare Arlo Williamsburg in Brooklyn against the other NYC Arlo branches and determine whether it is within $30 of the cheapest alternative for the relevant weekend.", + "verification": "Grader can confirm the summary includes a price comparison involving Arlo Williamsburg and a clear yes/no determination on the within-$30 rule based on rates visible in the Arlo booking pages.", + "weight": 0.2 + }, + "R3": { + "requirement": "Determine the cheapest available May weekend across the Arlo NYC properties and state the final recommended hotel and weekend using the Brooklyn preference rule.", + "verification": "Grader can confirm the chosen weekend and hotel are consistent with the collected Arlo 
rates, and that the recommended hotel page remains open as browser proof.", + "weight": 0.2 + }, + "R4": { + "requirement": "For every May weekend, list NYC sporting events on Ticketmaster that have visible ticket availability at $400 or less per person, including event name, date, venue, and ticket price.", + "verification": "Grader can confirm Ticketmaster event pages were opened for the listed events and that each included event shows a live listing with visible pricing at or below the budget threshold.", + "weight": 0.2 + }, + "R5": { + "requirement": "Leave the recommended Arlo hotel tab and at least two qualifying Ticketmaster event tabs open, and produce a final trip-planning summary that combines the hotel recommendation with the event options.", + "verification": "Grader can confirm the browser has the specified tabs open and that the final response synthesizes hotel comparison results with the per-weekend sporting-event options.", + "weight": 0.1 + } + }, + "categories": [ + "Travel and Tourism > Accommodation and Hotels" + ], + "num_categories": 1 + }, + { + "task_id": "4614aa083147e45cbc2977cc8634b9d9db25edfe", + "confirmed_task": "I’m trying to narrow down a few law schools in the Maryland/DC area and want a practical outreach plan before I start contacting anyone. On the University of Maryland Carey Law site, please find the actual way I can request admissions materials and also look for at least one upcoming online admissions event with its date and time, because I want to see how easy they make it to get information and whether there’s a virtual event I could realistically attend; open the event details page in its own tab and leave it there so I can look at it later. 
Then go to the University of Baltimore School of Law site and find the instructions for scheduling an admissions meeting by opening the relevant admissions event or meeting page and pulling the registration link or contact method from the actual details page, and keep that page open too so I can compare the two schools side by side. After that, on American University Washington College of Law’s site, find the PIPS Scholarship page and give me a short plain-English summary of what the scholarship is for, along with the application form link, because funding could change which school I prioritize; if there’s a dedicated scholarship page, leave that open in another tab as proof. Once you’ve gathered all of that, recommend which of these three schools I should contact first based on the best mix of easy admissions outreach and potential funding, and include the specific action or contact details you found for each school so I have a simple next move.", + "website": "https://www.google.com", + "reference_length": 4, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Correctly identify the University of Maryland Carey Law admissions-materials request method and report at least one upcoming online event with its date and time.", + "verification": "Grader can confirm the response matches information visible on the Maryland Carey admissions/request page and on an open online event details tab showing the event date/time.", + "weight": 0.3 + }, + "R2": { + "requirement": "Correctly find the University of Baltimore School of Law admissions meeting scheduling instructions, including the registration link or contact method from the relevant event/details page.", + "verification": "Grader can confirm the response against the open UBalt admissions meeting or event details page showing how to register or whom to contact.", + "weight": 0.25 + }, + "R3": { + "requirement": "Provide an accurate brief summary of American University Washington College of Law’s PIPS 
Scholarship and include the application form link.", + "verification": "Grader can confirm the summary and link against the open PIPS Scholarship page and the linked application form reference on American’s site.", + "weight": 0.2 + }, + "R4": { + "requirement": "Recommend which school to contact first using the gathered evidence about admissions outreach accessibility and potential funding, and include the specific contact/action details found for all three schools.", + "verification": "Grader can verify that the recommendation is supported by the findings from the Maryland Carey, UBalt, and American tabs and that each school’s specific action/contact details are included.", + "weight": 0.25 + } + }, + "categories": [ + "Science and Education > Universities and Colleges", + "Law and Government > Legal" + ], + "num_categories": 2 + }, + { + "task_id": "dd2eedbc88cb41cc69e43dd1da9de7255a81a966", + "confirmed_task": "I’m trying to put together a quick Christmas family outing plan in Yorkshire with Leeds as the base, so could you start on the Carriageworks Theatre site and find a Christmas time Leeds pantomime there, ideally the main festive panto, and note the show title, the venue name, and the full run of performance dates so I have one solid Leeds option to anchor everything around. Once you’ve got that, leave the actual show page open in its own tab so I can look at the artwork and dates myself, then go to Big Panto Guide and check the 2026 West Yorkshire listings and pull out the top three pantomime options with their show names, venues, and dates so I can see what Leeds is competing with nearby. After that, use Google to search for at least two Yorkshire Christmas pantomimes scheduled for 2026, and open the real event or venue pages in separate tabs so you can verify they’re live and capture the show name, venue, city, and performance dates from the actual listings rather than just the search results. 
When you’ve got those, compare the Leeds Carriageworks option against the wider West Yorkshire and Yorkshire shortlist and tell me whether Leeds still looks like the best anchor city for a family outing. Please ignore anything unrelated, and keep the Leeds page plus the two Yorkshire event pages open so I can compare them visually afterward.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Identify one upcoming Leeds pantomime from the Carriageworks Theatre website and capture the show name, venue, and full performance date range.", + "verification": "Grader can confirm the Carriageworks production page is open and the extracted details match the visible title, venue, and dates on that page.", + "weight": 0.24 + }, + "R2": { + "requirement": "Extract the top three 2026 West Yorkshire pantomime listings from Big Panto Guide, each with show name, venue, and dates.", + "verification": "Grader can confirm the Big Panto Guide West Yorkshire 2026 listings page was used and that three entries with matching visible names, venues, and dates were recorded.", + "weight": 0.24 + }, + "R3": { + "requirement": "Find at least two Yorkshire Christmas pantomimes scheduled for 2026 via Google and verify each on its actual event or venue page, capturing show name, venue, city, and performance dates.", + "verification": "Grader can confirm at least two separate event or venue tabs are open from Google-discovered results and that the recorded details match the visible pages.", + "weight": 0.26 + }, + "R4": { + "requirement": "Keep the Leeds Carriageworks page and the two Yorkshire event pages open for visual comparison.", + "verification": "Grader can confirm the relevant tabs remain open at the end of the task.", + "weight": 0.12 + }, + "R5": { + "requirement": "Provide a short recommendation assessing whether Leeds still looks like the best anchor city based on the Carriageworks option, the West 
Yorkshire top three, and the wider Yorkshire options.", + "verification": "Grader can confirm the final summary explicitly compares Leeds against the broader shortlist and states a reasoned recommendation.", + "weight": 0.14 + } + }, + "categories": [ + "Arts & Entertainment > Performing Arts", + "Community and Society > Holidays and Seasonal Events", + "Travel and Tourism > Tourist Attractions" + ], + "num_categories": 3 + }, + { + "task_id": "0eecee553a8cdda936c2cdd2a9189354a92e00b8", + "confirmed_task": "I’m putting together a one-period digital literacy lesson pack for a middle-school class and want the pieces to feel like they belong together, not like I grabbed them randomly. Could you start on Slidesgo and pick one fun, classroom-appropriate presentation template that would work for a grade 7 or 8 lesson, ideally something bright and student-friendly rather than corporate, because I want to use that visual style as the theme for everything else? Open the actual template page and leave it open so I can see the preview images, and note whether it’s available for Google Slides or PowerPoint. Then use Google to find a printable worksheet or practice page for an 8th-grade student on basic marketing strategies or persuasion techniques, like identifying advertising tricks or persuasive techniques, and open the actual resource page so I can check that it really looks classroom-ready and printable. After that, go to Citation Machine and verify that Harvard style is actually available there by navigating to wherever the citation styles are shown or selectable, because I’ll need to cite the worksheet and any media correctly; leave that proof visible or keep the page open. Finally, on YouTube, find three beginner-friendly videos about online safety or cybersecurity basics that would make sense for students, open each video in its own tab so I can compare them, and play one of them briefly so you can tell me what the opening covers. 
At the end, send me a short lesson-pack summary with the Slidesgo template name and format option, the worksheet title and where it’s hosted, the evidence that Harvard style can be selected on the citation site, and the three YouTube video titles with their channel names.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Select one Slidesgo presentation template that is fun and classroom-appropriate for a middle-school lesson, and provide the exact template name plus an available use/download option such as Google Slides or PowerPoint.", + "verification": "Grader can confirm the chosen Slidesgo template page is open and shows the template title along with a visible Google Slides or PowerPoint option.", + "weight": 0.24 + }, + "R2": { + "requirement": "Find one printable worksheet or practice page for an 8th-grade student on marketing strategies or persuasion techniques, and provide the resource title and the site where it is accessed.", + "verification": "Grader can confirm the opened resource page appears to be a worksheet or practice page and that the response includes its title and hosting/access location.", + "weight": 0.24 + }, + "R3": { + "requirement": "Identify Citation Machine as a citation generator that supports Harvard style and provide explicit evidence from the site showing Harvard can be selected.", + "verification": "Grader can confirm a Citation Machine page is open with visible citation-style options or text indicating Harvard style is available.", + "weight": 0.22 + }, + "R4": { + "requirement": "Identify three YouTube videos that teach online safety or cybersecurity basics and provide each video’s title and channel name.", + "verification": "Grader can confirm three YouTube video tabs or pages are open and that the returned titles and channel names match the visible video pages.", + "weight": 0.2 + }, + "R5": { + "requirement": "Demonstrate browser-only proof by 
leaving the Slidesgo template page and Citation Machine proof page open, opening the three YouTube videos in separate tabs with one video briefly played, and reporting what the opening of the played video covers.", + "verification": "Grader can confirm the relevant tabs remain open, one YouTube video shows playback progress or a changed play state, and the response includes a brief description of what the opening of that video covers.", + "weight": 0.1 + } + }, + "categories": [ + "Science and Education > Education", + "Computers Electronics and Technology > Graphics Multimedia and Web Design" + ], + "num_categories": 2 + }, + { + "task_id": "29e019b665e4eba930fcb1fc28a149eb6522ed29", + "confirmed_task": "I’m in NYC and trying to get my footing before I can seriously plan for law school, so I need help pulling together a realistic picture from a few specific sites. First, on Legal Aid NYC, please look for the most useful guidance for New York City rent disputes and pull out at least three concrete help options or contact paths I could actually use right now, plus two Legal Aid NYC articles that seem especially relevant to rent problems; open the actual article pages in separate tabs and leave the most useful one open so I can look at it myself. Then, because I may need to survive the gap before a first paycheck, use Google to find a practical personal-finance discussion about not making it until the first payday and summarize at least three actionable ideas that feel realistic for someone trying to bridge expenses temporarily. After that, go to Disney Careers and search specifically for at least three entry-level job openings in New York City that could plausibly fit a recent graduate, and for each one note the title, NYC-area location, application page, and any basic qualification cues; please open each job posting in its own tab so I can visually compare them. 
Finally, use AccessLex ARC to find one LSAC law school admissions cycle dataset and note the exact dataset title and what cycle or year it covers, then go to LawHub and pull the total annual cost of attendance for Case Western Reserve University School of Law so I have one concrete law-school cost benchmark. In the end, give me a concise summary that ties together the rent-help options, the short-term cash-flow ideas, the Disney job leads, the LSAC dataset reference, and the Case Western cost figure so I can judge whether this path feels workable.", + "website": "https://www.google.com", + "reference_length": 6, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Legal Aid NYC findings include at least three specific rent-dispute help options or contact pathways and two relevant Legal Aid NYC articles.", + "verification": "Grader can confirm the summary references Legal Aid NYC content and that two actual Legal Aid NYC article pages were opened, with one left visible.", + "weight": 0.24 + }, + "R2": { + "requirement": "At least three actionable suggestions are summarized from a practical personal-finance discussion about making it until the first payday.", + "verification": "Grader can confirm a Google-led result was used to reach a discussion page and that the final notes contain three concrete bridge-expense ideas tied to that discussion.", + "weight": 0.18 + }, + "R3": { + "requirement": "At least three Disney Careers openings in New York City are identified with title, location, application page, and basic qualification cues.", + "verification": "Grader can confirm three separate Disney Careers job posting tabs are open and that each posting visibly shows the job title and NYC-area location.", + "weight": 0.24 + }, + "R4": { + "requirement": "One AccessLex ARC LSAC admissions cycle dataset is reported with its exact title and the cycle/year it covers.", + "verification": "Grader can confirm the ARC page shows the named dataset and its associated 
cycle or year.", + "weight": 0.14 + }, + "R5": { + "requirement": "The total annual cost of attendance for Case Western Reserve University School of Law is captured from LawHub and included in the final response.", + "verification": "Grader can confirm the LawHub page for Case Western Reserve University School of Law displays a total annual cost of attendance figure matching the reported value.", + "weight": 0.1 + }, + "R6": { + "requirement": "The final synthesis concisely connects rent-help options, first-paycheck bridge ideas, NYC Disney job leads, the LSAC dataset reference, and the Case Western cost benchmark.", + "verification": "Grader can confirm the final answer integrates outputs from all prior steps into one coherent planning summary rather than listing them separately.", + "weight": 0.1 + } + }, + "categories": [ + "Law and Government > Legal", + "Science and Education > Universities and Colleges" + ], + "num_categories": 2 + }, + { + "task_id": "7959caf1580d130cedcba72e8f21ab0e9408ba91", + "confirmed_task": "I'm trying to piece together a really cheap Barcelona city break for 2 adults for 30th of next month to the 1st of the following month, and I want a few comparison points so I can sanity-check the budget. First, on Booking.com, search Barcelona for those dates and find me one hotel that's within 3 miles of the city centre and comes in under £120 total, then open the actual property page so you can grab the exact hotel name, total price, and the location details shown there, and leave that tab open so I can look at it later. Once you have that as my lodging benchmark, go to AirBnB and check the all-listings page to see how many accommodations are currently available there, just so I can compare a small apartment's availability with the Barcelona hotel market. 
After that, open a new house listing on Rightmove and note the asking price and number of bedrooms from the listing page itself, because I want a quick reality check on what short-stay costs look like next to property prices elsewhere; keep that listing open too so I can see the photos and details. Finally, use the Barcelona hotel price you found to work out the nightly rate, take half of that, and then on Hertz look near Barcelona for a Honda with great ratings during the duration of my trip, and tell me if any option also comes in under $100 per day; open the best matching car listing in its own tab and use it to clearly say whether the car's daily cost is more than half of the hotel's nightly rate. Please give me a short trip-planning summary with the Barcelona hotel first, then the AirBnB availability count, then the Rightmove price check, and end with the hotel-versus-car comparison stated plainly.", + "website": "https://www.google.com", + "reference_length": 6, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "A Booking.com hotel in Barcelona for 2 adults from the 30th of next month to the 1st of the following month is identified that is within 3 miles of the city centre and under £120 total, with the exact hotel name, total price, and location details captured from the property page.", + "verification": "Grader can confirm the Booking.com search dates and occupancy, then verify the open hotel property page shows the hotel name, total price under £120, and location/distance details.", + "weight": 0.3 + }, + "R2": { + "requirement": "The number of currently available accommodation listings on the AirBNB all-listings page is reported accurately.", + "verification": "Grader can verify the visible count by checking the all-listings page and matching the number of available listings shown.", + "weight": 0.15 + }, + "R3": { + "requirement": "A Rightmove new house listing's asking price and bedroom count are found and recorded from the live listing 
page.", + "verification": "Grader can confirm the open Rightmove listing shows the same asking price and bedroom count reported by the agent.", + "weight": 0.15 + }, + "R4": { + "requirement": "A Hertz Honda listing near Barcelona for 30th of next month to the 1st of the following month is found with great ratings, including the rating and daily price, and the agent determines whether any qualifying option comes in under $100/day.", + "verification": "Grader can confirm a Hertz Honda rental listing near Barcelona is displayed for the correct dates, that rating and daily price are visible or captured, and that the response states whether the daily cost is under $100.", + "weight": 0.2 + }, + "R5": { + "requirement": "The final comparison correctly uses the Barcelona hotel's nightly rate to state whether the Hertz car's daily price is more than half of the hotel's nightly rate.", + "verification": "Grader can recompute the nightly hotel rate from the Booking.com total, divide by two, and compare that threshold with the reported Hertz daily price to confirm the final statement.", + "weight": 0.12 + }, + "R6": { + "requirement": "The final response is a short trip-planning summary presented in the requested order, with the Barcelona hotel first and the hotel-versus-car comparison clearly stated at the end.", + "verification": "Grader can inspect the final response structure and confirm it includes all required findings in order and ends with a clear hotel-versus-car cost comparison.", + "weight": 0.08 + } + }, + "categories": [ + "Travel and Tourism > Air Travel", + "Travel and Tourism > Accommodation and Hotels", + "Food and Drink > Restaurants and Delivery" + ], + "num_categories": 3 + }, + { + "task_id": "e7596a6d6079be82e5219c1ac1c5f40f33d2bce8", + "confirmed_task": "I’m putting together a quick starter pack for a Colorado outreach idea centered on helping children in need, and I want it to feel grounded before I share it with anyone. 
Please start on Google and find at least three Colorado charities that specifically help children in need, then open each organization’s official site in its own tab so you can confirm it’s the real organization and leave those tabs open so I can look at them later; I need each charity’s name and official website for the brief. After that, go to Microsoft’s nonprofit resources site and look for at least three software companies listed there along with the exact nonprofit discount or free-program names they offer, because I want to include practical tools these kinds of charities could actually use; if the offer details live on separate pages, open those in separate tabs too and keep the most useful one visible so I have browser proof of what you found. Then go back to Google and look up how lower, middle, and upper class are commonly described in the U.S., including income ranges and the main factors people use beyond income, and do the same specifically for Colorado so I can shape future donor messaging with a little context. Please pull everything together into one organized brief with the charity list, the software offers, and concise U.S. 
and Colorado class summaries, including a plain-language definition of middle class.", + "website": "https://www.google.com", + "reference_length": 7, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Identify at least three Colorado charities that help children in need and provide each organization’s name plus official website URL.", + "verification": "Grader confirms the final brief lists three or more Colorado child-focused charities and that corresponding official organization sites were opened in browser tabs.", + "weight": 0.24 + }, + "R2": { + "requirement": "Use browser verification for the charity research by opening each organization’s official site in its own tab and leaving those tabs available for review.", + "verification": "Grader checks that multiple charity website tabs are open and correspond to the organizations named in the brief.", + "weight": 0.1 + }, + "R3": { + "requirement": "List at least three software companies from Microsoft nonprofit resources and include the exact nonprofit discount or free-program name offered by each.", + "verification": "Grader confirms the companies and program names match Microsoft nonprofit resources content shown in the browser.", + "weight": 0.2 + }, + "R4": { + "requirement": "Provide a short description of each nonprofit software offer and keep at least one supporting Microsoft offer page visible or open as browser proof.", + "verification": "Grader checks the brief includes descriptions and that one or more relevant Microsoft nonprofit resource or offer-detail tabs remain open.", + "weight": 0.12 + }, + "R5": { + "requirement": "Summarize U.S. lower-, middle-, and upper-class income ranges and the key factors commonly used to classify class status, including a plain-language definition of middle class.", + "verification": "Grader confirms the brief contains U.S. 
class ranges, non-income factors, and a clear plain-language middle-class summary.", + "weight": 0.14 + }, + "R6": { + "requirement": "Include the same class-income summary for Colorado, covering lower-, middle-, and upper-class income ranges and key classification factors.", + "verification": "Grader confirms the brief separately includes Colorado-specific class ranges and factors, not just national information.", + "weight": 0.12 + }, + "R7": { + "requirement": "Return all findings as one organized brief that combines the charity list, software tools list, and U.S. and Colorado class-income summaries.", + "verification": "Grader checks the final response is structured as a single coherent brief with all required sections present.", + "weight": 0.08 + } + }, + "categories": [ + "Community and Society > Philanthropy" + ], + "num_categories": 1 + }, + { + "task_id": "908c9a864e81539503be6ca074788c462b2e1319", + "confirmed_task": "I’m putting together a quick pop-culture briefing for a friend group chat, and I want it to feel like one connected snapshot instead of a pile of random notes. Could you start on Wikipedia and pull a short, clean summary of Snowfall so I have the premise, setting, and what the show is mainly about, then do the same on Wikipedia for A Knight of the Seven Kingdoms, making sure to note the main characters it follows and how they connect back to the bigger Game of Thrones world through family or house relationships so I can contrast those two scripted shows. After that, go to Lifetime’s site and open the actual Married at First Sight Season 18 page to see where it says to watch it, and list the Season 18 episodes that are currently shown there, making sure Episode 4 is included if it’s visible; please leave that season page open so I can glance at the episode list myself. Then use Google to figure out which season or seasons of Chicago P.D. include Vanessa Rojas, because I want one quick character-specific network TV fact in the briefing. 
Once the TV part is set, head to Reddit and find an actual discussion thread about Chicago P.D., open the thread so you can verify it’s live, and give me the thread title. Then browse r/starterpacks and grab two recent funny meme post titles that feel like good examples of lighter community chatter, opening each in its own tab so I can compare them later. In the end, give me one compact briefing that ties all of that together naturally.", + "website": "https://www.google.com", + "reference_length": 7, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Provide a concise Wikipedia-based summary of Snowfall that includes the show's premise, setting, and main story focus.", + "verification": "Grader checks that the response includes all three elements and that the Snowfall Wikipedia page was visited.", + "weight": 0.14 + }, + "R2": { + "requirement": "Provide a concise Wikipedia-based summary of A Knight of the Seven Kingdoms that names the main characters and explains their relationship to the broader Game of Thrones world through lineage, house ties, or background.", + "verification": "Grader confirms the response names the principal characters and includes their relevant connections as visible on the Wikipedia page.", + "weight": 0.16 + }, + "R3": { + "requirement": "Report where Married at First Sight Season 18 can be watched on Lifetime and list the Season 18 episodes currently shown there, including Episode 4 if visible.", + "verification": "Grader verifies the Lifetime Season 18 page is open, shows watch availability information, and displays an episode list containing Episode 4 if present on the page.", + "weight": 0.2 + }, + "R4": { + "requirement": "Determine and report which season or seasons of Chicago P.D. 
include Vanessa Rojas.", + "verification": "Grader confirms the season number(s) reported are supported by the Google search results or opened source pages.", + "weight": 0.12 + }, + "R5": { + "requirement": "Find and provide the title of one live Reddit discussion thread about Chicago P.D.", + "verification": "Grader checks that an actual Reddit thread page about Chicago P.D. was opened and that the reported title matches the visible post title.", + "weight": 0.12 + }, + "R6": { + "requirement": "Identify two recent funny meme post titles from r/starterpacks.", + "verification": "Grader confirms two posts from r/starterpacks were opened in separate tabs and that the titles match visible recent posts.", + "weight": 0.11 + }, + "R7": { + "requirement": "Return all requested TV and Reddit findings as one compact, connected briefing rather than disconnected notes.", + "verification": "Grader checks that the final response integrates all required findings into a single cohesive briefing.", + "weight": 0.15 + } + }, + "categories": [ + "Arts & Entertainment > Streaming & Online TV", + "Arts & Entertainment > Music", + "Computers Electronics and Technology > Social Media Networks" + ], + "num_categories": 3 + }, + { + "task_id": "b4d11b2d7069bf45410b6784544504b23360b34a", + "confirmed_task": "I’m trying to put together a really cheap late-November city-break from London and want a realistic shortlist I can actually look at in the browser. Please start on Skyscanner and search round-trip flights leaving from London for November 18 and coming back November 28 then pull out four destination options that look viable and note the destination city or airport and the lowest price shown for each so I can see which places are even in budget. Open the most promising flight results in their own tabs and leave the cheapest-looking Skyscanner option visible so I have a reference point. Once you’ve got that shortlist, use it to decide which destination appears cheapest overall. 
After that, go to Ryanair and check London to Budapest for the week starting November 18 because Budapest is usually a low-cost fallback for me and I want to know whether it still deserves a spot on the list; list the cheapest available options you can find in ascending price order and keep the Budapest results page open so I can compare it visually with Skyscanner. Since I may need to stay in London the night before flying, switch to Booking.com and look up 22 Suites in London, report its guest review score, and summarize at least three recent guest reviews so I can tell whether the nice rating actually matches what people are saying. Then do the same quality check for The Chapter Hotels – Finsbury Park by reporting its overall review score and review descriptor and reading into the recent reviews enough to judge whether it really seems dependable for a one-night pre-flight stay. If possible, open both hotel pages in separate tabs and leave the review sections visible so I can compare them side by side. 
In the end, give me a short recommendation on which flight option you’d prioritize, whether Budapest should stay on the shortlist as a backup, and which of those two London hotels seems more reliable for the night before an early flight.", + "website": "https://www.google.com", + "reference_length": 7, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Provide four Skyscanner round-trip destination options from London for 11/18 to 11/28, each with destination city/airport and lowest displayed price.", + "verification": "Grader confirms four destinations and prices are extracted from visible Skyscanner search results for the specified dates.", + "weight": 0.22 + }, + "R2": { + "requirement": "Identify which of the four Skyscanner destinations appears cheapest overall and keep the cheapest-looking result visible/open as browser proof.", + "verification": "Grader confirms the chosen cheapest option matches the visible Skyscanner tabs/results and that a cheapest result page remains open.", + "weight": 0.14 + }, + "R3": { + "requirement": "List the cheapest Ryanair London-to-Budapest flight options for the week starting 11/18 in ascending price order.", + "verification": "Grader confirms the Ryanair results page shows London to Budapest flights in the requested week and that the reported options are ordered from lowest to highest price.", + "weight": 0.16 + }, + "R4": { + "requirement": "Report the Booking.com guest review score for 22 Suites and summarize at least three recent guest reviews.", + "verification": "Grader confirms the 22 Suites property page and review section are open and that the score plus three review summaries align with visible recent reviews.", + "weight": 0.16 + }, + "R5": { + "requirement": "Report the Booking.com overall review score and descriptor for The Chapter Hotels – Finsbury Park and judge whether reviews are generally very positive based on recent comments.", + "verification": "Grader confirms the property page shows the 
stated score and descriptor and that the positivity judgment is supported by visible recent reviews.", + "weight": 0.14 + }, + "R6": { + "requirement": "State whether Budapest should remain on the shortlist as a backup based on comparison with the Skyscanner shortlist.", + "verification": "Grader confirms the recommendation explicitly compares Ryanair Budapest pricing against the Skyscanner destination prices.", + "weight": 0.08 + }, + "R7": { + "requirement": "Recommend which flight option to prioritize and which of the two London hotels seems more reliable for a pre-flight overnight stay.", + "verification": "Grader confirms the final recommendation references both the cheapest/most suitable flight findings and the comparative hotel review evidence from Booking.com.", + "weight": 0.1 + } + }, + "categories": [ + "Travel and Tourism > Air Travel", + "Travel and Tourism > Accommodation and Hotels" + ], + "num_categories": 2 + }, + { + "task_id": "46ecc9a5dce920d6c72198b4efb1a46855bac7d0", + "confirmed_task": "I’m trying to put together a budget-conscious outfit shortlist and want it to feel like a real shopping comparison, not just random picks. Start on Depop and go to the seller page for rainbow_bebe to see whether they currently have any prom dress listings under $100, because if there’s a good one there I’d use that as my budget anchor; open at least one qualifying listing in its own tab so I can see the actual photos and price on the live listing page. If that seller doesn’t have a convincing under-$100 option, switch to PrettyLittleThing and find an emerald green dress that’s available in size 12, then open the product page and note the name and price so I have a fallback main dress candidate. After that, on JJ’s House, browse cocktail dresses and find one deep V-neck option that looks like a dressier comparison point, and keep that product page open too so I can compare the styling. 
Once you’ve seen those dress options, decide which dress feels like the best value based on price and what’s actually available, then go to Old Navy and pick one comfortable-looking, affordable women’s jogger to round out the shortlist as a casual extra, making sure you open the actual product page and capture the name and price. Then check Tecovas for one men’s cowboy boot made of real leather, open the product page, and list the design or color options shown there so I can see how many choices there are. Finally, go to ALS and look up the Nike Phantom 6 Club FG/MG low-top men’s soccer cleat, tell me the current price, and say clearly whether it stays under $60 so I can judge whether this whole shortlist still feels budget-friendly. Please leave the key product tabs open for the dress comparison and give me a final shortlist with the item names, prices, the Tecovas options, and a quick note on which dress you’d personally pick as the best value.", + "website": "https://www.google.com", + "reference_length": 8, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Depop seller rainbow_bebe is checked for prom dress listings under $100, and if a qualifying listing exists, at least one live listing title and price are captured from the opened listing page.", + "verification": "Grader can confirm navigation to the rainbow_bebe seller area on Depop and see either a qualifying under-$100 prom dress listing opened in a tab or a clear finding that no such qualifying listing is present.", + "weight": 0.16 + }, + "R2": { + "requirement": "A PrettyLittleThing emerald green dress available in size 12 is found and its product name and price are recorded.", + "verification": "Grader can verify the PrettyLittleThing product page shows an emerald green dress with size 12 available and visible name and price.", + "weight": 0.13 + }, + "R3": { + "requirement": "A JJ’s House cocktail dress with a deep V-neck is identified, and its name and price are captured from the 
product page.", + "verification": "Grader can verify the JJ’s House product page shows a cocktail dress with deep V-neck styling and visible product name and price, with the page kept open for comparison.", + "weight": 0.13 + }, + "R4": { + "requirement": "One dress is selected as the best-value pick based on the earlier dress findings, with a brief comparison note explaining the choice.", + "verification": "Grader can compare the reported dress options and confirm that the final answer names one chosen dress and includes a short rationale tied to price and/or availability.", + "weight": 0.17 + }, + "R5": { + "requirement": "An Old Navy women’s jogger that appears comfortable and affordable is selected from the product page, with name and price recorded.", + "verification": "Grader can verify an Old Navy jogger product page is opened and that the reported name and price match the visible page details.", + "weight": 0.11 + }, + "R6": { + "requirement": "A men’s real leather cowboy boot is found on Tecovas, and the boot name plus visible design or color options are listed.", + "verification": "Grader can verify the Tecovas product page indicates real leather and shows the product name along with selectable design or color options.", + "weight": 0.12 + }, + "R7": { + "requirement": "The ALS product page for the Nike Phantom 6 Club FG/MG low-top men’s soccer cleat is found, the current price is recorded, and the answer clearly states whether it is under $60.", + "verification": "Grader can verify the ALS product page title matches the cleat and the visible price supports the under-$60 conclusion.", + "weight": 0.1 + }, + "R8": { + "requirement": "The final shortlist consolidates all required items and findings: dress results, chosen best-value dress, Old Navy jogger, Tecovas boot with options, and ALS cleat affordability check.", + "verification": "Grader can confirm the final response includes all requested item names, prices where applicable, the Tecovas options, and 
the cleat budget judgment in one coherent shortlist.", + "weight": 0.08 + } + }, + "categories": [ + "Ecommerce & Shopping > Ecommerce and Shopping - Other", + "Lifestyle > Fashion and Apparel" + ], + "num_categories": 2 + }, + { + "task_id": "80257c727b8e8c5426c1b03a2a4493231747e5d7", + "confirmed_task": "I’m mocking up a tiny React art prototype for a web page and want you to help me gather the pieces in a way I can actually look at in the browser. Start on JetBrains and figure out which IDE they specifically position for editing and organizing web code, because I want a sensible default tool before I build anything. Then use Google Images to find one cartoon mouse image with a transparent background that could work as a reference asset, and open the actual source page plus the direct image in separate tabs so I can visually confirm it really has transparency and isn’t just a white background baked in. While you’re in Google Images, also search for “taffy tails stretchy” and pick one result for Stretchy from Taffy Tails, opening the result page too so I can compare whether I want a generic mouse look or that more specific character style. After that, go to react-svgr.com and convert a simple SVG into a React component so I have a vector element to pair with the raster mouse image in the prototype, and keep the conversion result visible. Once the visual side is sorted out, use Google to find the current guidance on whether a project made in CapCut Web can be moved into the CapCut desktop app, because I may reuse these same art assets in a promo clip later, and then find instructions for changing the background color behind an imported photo in CapCut so the background can match whichever mouse style looked better from the earlier image search. 
Please leave the most useful image/source tabs open for comparison and give me a concise build note with the recommended IDE, both image sources, the React component code, and short CapCut instructions tailored to using those mouse assets.", + "website": "https://www.google.com", + "reference_length": 7, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Correctly identify the JetBrains IDE intended for editing and organizing web code and recommend it by name.", + "verification": "Grader confirms the final note names WebStorm and that the JetBrains page viewed corresponds to the web-development IDE.", + "weight": 0.14 + }, + "R2": { + "requirement": "Provide one cartoon mouse image with a transparent background, including a direct image file URL and the source page URL.", + "verification": "Grader confirms separate browser tabs were opened for the direct image file and the source page, and the final note includes both URLs.", + "weight": 0.18 + }, + "R3": { + "requirement": "Provide one Google Images result for Stretchy from Taffy Tails with its source/result page for visual comparison.", + "verification": "Grader confirms a Google Images search for “taffy tails stretchy” was performed, a result page was opened, and the final note includes the selected result/source.", + "weight": 0.12 + }, + "R4": { + "requirement": "Convert an SVG into a React component using SVGR and provide the resulting React component code snippet.", + "verification": "Grader confirms react-svgr.com shows a conversion result and the final note includes a plausible generated React component snippet.", + "weight": 0.18 + }, + "R5": { + "requirement": "Summarize how to move a CapCut Web project into the CapCut desktop app, including the recommended method or a confirmation that direct transfer is not possible, with supporting evidence.", + "verification": "Grader confirms the final note includes a clear transfer conclusion and cites or reflects information found from search 
results/pages opened during browsing.", + "weight": 0.16 + }, + "R6": { + "requirement": "Summarize instructions for changing the background color behind an imported photo in CapCut.", + "verification": "Grader confirms the final note includes a usable sequence of CapCut actions for changing the background color behind an imported image.", + "weight": 0.1 + }, + "R7": { + "requirement": "Return a concise build note that combines the recommended IDE, both image sources, the React component code, and CapCut instructions tailored to the compared mouse assets.", + "verification": "Grader confirms the final response synthesizes outputs from all prior steps and explicitly tailors the CapCut guidance to the generic mouse versus Stretchy comparison.", + "weight": 0.12 + } + }, + "categories": [ + "Computers Electronics and Technology > Programming and Developer Software", + "Arts & Entertainment > Visual Arts and Design" + ], + "num_categories": 2 + }, + { + "task_id": "976970ff5d37116847b8a9351a0922196bee88a2", + "confirmed_task": "I’m putting together a short youth mental-health workshop and want a tidy evidence pack I can actually cite, so could you do this in the browser and keep the key pages open for me? Start on Google and find one peer-reviewed academic source that clearly says eating disorders commonly begin during adolescence or the early teen years, then open the actual article or abstract page in its own tab and pull the exact supporting quote plus enough citation detail that I could reuse it. From there, use Google again to find one credible source explaining that the human brain keeps growing or developing after childhood, and open the source page itself so I can see it’s a real organization or publication rather than just a search snippet. 
After that, still using Google, find one credible source saying human cognitive biases are influenced by evolutionary processes, and open that source in a separate tab too because I want to compare the wording across the three sources. Once those evidence tabs are gathered, go to Wikipedia’s page for Lauryn Hill’s The Miseducation of Lauryn Hill, pull at least five song titles from the track listing, and leave that album page open so I can glance at the list myself. Then choose the one song title that would make the best discussion prompt for teens based on the themes from the three sources, and give me a concise workshop brief that ties the evidence together with the citations, the exact quote for the eating-disorders source, the five song titles, and a short explanation of why your chosen title fits.", + "website": "https://www.google.com", + "reference_length": 7, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Provides one peer-reviewed academic source stating that eating disorders commonly begin during adolescence or the early teen years, including a usable citation and the exact supporting quote.", + "verification": "Grader can confirm a Google result was opened to an article or abstract page in its own tab and that the final response includes both citation details and a direct quote supporting adolescent onset.", + "weight": 0.22 + }, + "R2": { + "requirement": "Provides one credible source explaining that the human brain continues to grow or develop after childhood, with a usable citation.", + "verification": "Grader can confirm the source page itself was opened from Google and that the final response includes a citation tied to the brain-development claim.", + "weight": 0.16 + }, + "R3": { + "requirement": "Provides one credible source supporting the claim that human cognitive biases are influenced by evolutionary processes, including title, publisher, and URL.", + "verification": "Grader can confirm the opened source page supports the 
claim and that the final response includes the source title, publisher, and URL.", + "weight": 0.16 + }, + "R4": { + "requirement": "Uses the opened evidence tabs to extract or summarize support from all three sources in a way that is coherent for the workshop theme.", + "verification": "Grader can confirm multiple source tabs were opened and the final brief accurately connects adolescent onset, ongoing brain development, and evolutionary influences on cognitive bias.", + "weight": 0.14 + }, + "R5": { + "requirement": "Lists at least five song titles from Lauryn Hill’s The Miseducation of Lauryn Hill sourced from Wikipedia.", + "verification": "Grader can confirm the Wikipedia album page is open and that at least five track titles in the response match the visible track listing.", + "weight": 0.12 + }, + "R6": { + "requirement": "Selects one song title from the album as a teen discussion prompt and explains why it fits the workshop themes drawn from the gathered evidence.", + "verification": "Grader can confirm the chosen title appears in the Wikipedia track list and that the explanation meaningfully links to the three evidence themes.", + "weight": 0.1 + }, + "R7": { + "requirement": "Returns the findings as a concise workshop brief including the three requested evidence sources, citations, the exact quote for the eating-disorders source, at least five song titles, and the chosen title with rationale.", + "verification": "Grader can confirm all required elements are present in one concise final brief and that the visible pages used match the cited sources.", + "weight": 0.1 + } + }, + "categories": [ + "Health > Mental Health", + "Science and Education > Science and Education - Other" + ], + "num_categories": 2 + }, + { + "task_id": "5944875c7f32a98df978040e4447534f7ba0aadb", + "confirmed_task": "I’m putting together a small Zelda: Breath of the Wild–themed dessert setup for a get-together and want it to feel like it came straight out of the game, but still be 
practical to make in a real kitchen. Please start on Google and find a genuinely comprehensive BOTW cooking guide or full recipe list that covers the game’s meals and elixirs, then open the actual guide page so I can visually confirm it looks complete and leave that tab open as the inspiration reference. From that, pick a tropical direction that feels like a natural fit for a real dessert—something in the fruit-and-island vibe of the game—and then use Google to find a copycat Disney Dole Whip recipe page with a clear ingredient list and simple prep, because I think that could become the main dessert. Since I may want a richer backup option for people who don’t want pineapple, go to Sally’s Baking Addiction and find the chocolate buttercream frosting recipe that takes 20 minutes or less and uses 6 ingredients, and keep that recipe page open in its own tab too so I can compare the two dessert directions side by side. After that, use Google to find one recommended method for making a custom photo frame with an X-Carve CNC that I could turn into a Zelda-themed sign or menu card, and summarize the materials, software or workflow, and key steps. Then use Google again to find one reliable method for mirroring an iPhone screen to another device so I can show the BOTW inspiration page while assembling everything, and finally find how to switch an iPad keyboard from the floating mini keyboard back to the full-size keyboard in case I end up typing labels on an iPad instead. 
In the end, give me a concise plan that ties the BOTW cooking inspiration to the tropical dessert choice, the chocolate comparison option, and the presentation setup, and mention which tabs you left open for me to review.", + "website": "https://www.google.com", + "reference_length": 7, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "A comprehensive, trustworthy BOTW cooking guide or full recipe list is found via Google, and the actual guide page is opened and left open so it can be visually confirmed as a complete reference covering the game's meals and elixirs.", + "verification": "Grader can confirm the open tab displays a BOTW cooking guide or recipe list page that appears comprehensive, covering meals and elixirs from the game.", + "weight": 0.22 + }, + "R2": { + "requirement": "A tropical BOTW-inspired dessert direction is selected based on the game inspiration, and a copycat Disney Dole Whip recipe page is found with ingredients and basic preparation captured.", + "verification": "Grader can confirm a Dole Whip recipe page is open and the response includes a clear tropical theme connection plus the recipe’s ingredient list and prep summary.", + "weight": 0.16 + }, + "R3": { + "requirement": "A Sally’s Baking Addiction chocolate buttercream frosting recipe is found that takes 20 minutes or less and uses 6 ingredients, with the ingredient list and total time recorded, and the page left open in its own tab.", + "verification": "Grader can confirm the Sally’s Baking Addiction recipe tab is open and the response states the total time and a 6-ingredient list matching the page.", + "weight": 0.16 + }, + "R4": { + "requirement": "One recommended X-Carve CNC custom photo frame method is found and summarized with key steps, materials, and any software or workflow details relevant to making a Zelda-themed sign or menu card.", + "verification": "Grader can confirm a source page about making a custom frame with an X-Carve CNC is open and the response 
includes materials, process steps, and software/workflow notes.", + "weight": 0.14 + }, + "R5": { + "requirement": "One reliable method for mirroring or sharing an iPhone screen to another device is found and summarized with the required steps and destination requirements.", + "verification": "Grader can confirm a relevant help or instructional page is open and the response explains a valid mirroring method with actionable steps.", + "weight": 0.1 + }, + "R6": { + "requirement": "At least one valid method is provided for returning the iPad floating mini keyboard to the full-size keyboard.", + "verification": "Grader can confirm a relevant help page is open and the response includes a valid method such as dragging from the keyboard control or pinching outward on the mini keyboard.", + "weight": 0.08 + }, + "R7": { + "requirement": "The final response is a concise integrated plan that clearly connects the BOTW recipe inspiration to the tropical dessert choice, the richer chocolate comparison option, and the presentation/display setup, while mentioning which tabs were left open for review.", + "verification": "Grader can confirm the final write-up synthesizes all findings into one coherent plan and explicitly references the open tabs for the BOTW guide and dessert comparison pages.", + "weight": 0.14 + } + }, + "categories": [ + "Food and Drink > Cooking and Recipes", + "Games > Video Games Consoles and Accessories" + ], + "num_categories": 2 + }, + { + "task_id": "92cf5ed054aa12e3e99a63bf7ce0e203ea85cee7", + "confirmed_task": "Help me open the Stanford CS231N introduction lecture on YouTube and start playing it. Create a list of 3 of the datasets they mention in the lecture (with relevant timestamps for each), and then help me find a way to download these 3 datasets. Go to the source and pull up the tab with the download link for all three datasets. 
Keep all relevant dataset links open in multiple tabs and provide a summary of them.", + "website": "https://www.google.com", + "reference_length": 3, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "The agent identifies the CS231N introduction lecture on youtube and plays it.", + "verification": "Grader can confirm the open and playing introduction lecture.", + "weight": 0.25 + }, + "R2": { + "requirement": "The agent watches the video and identifies the 3 datasets they mention in the lecture video, then produces a list with relevant time stamps.", + "verification": "Grader can confirm the agent identified 3 datasets mentioned in the CS231N introduction lecture video and listed each with a relevant timestamp.", + "weight": 0.25 + }, + "R3": { + "requirement": "The agent opens and finds all three dataset links separately that are valid and correct according to the lecture video, and provides a summary of the datasets.", + "verification": "3 open dataset sources sourced from the introduction CS231N video, and the final response includes a summary describing each dataset.", + "weight": 0.5 + } + }, + "categories": [ + "Science and Education > Science and Education - Other", + "Computers Electronics and Technology > Programming and Developer Software" + ], + "num_categories": 2 + }, + { + "task_id": "cbe12703e129832feb5e7b56c4141476fb423338", + "confirmed_task": "I’m trying to put together a compact entertainment lineup for this week that gives me a nice mix of short narrative games, a few more action-heavy palate cleansers, some quick browser-game breaks, and one cozy book. Please start on Reddit and look through real recommendation threads for at least five story-heavy games that people describe as short or easy to finish, because that’ll set the mood I’m aiming for; open the most useful Reddit threads in separate tabs and leave at least one of the key threads open so I can glance at the discussion myself. 
Then, still using Reddit and that same taste baseline, find at least three FPS shooters that people recommend as good contrast after emotional or story-driven games, so I can see what might work as a reset between heavier experiences. After that, go to Poki and find three mind-challenging browser games I could dip into between the bigger titles, and open each game’s actual Poki page in its own tab so I can visually confirm they’re the right kind of quick break. Finally, head to Goodreads and pull up the page for The Very Secret Society of Irregular Witches, read at least three user reviews, and give me the main takeaways from those reviews with an eye toward whether it matches the cozy, character-driven vibe from the first Reddit step; leave the Goodreads book page open too so I can look at the rating and cover. In the end, give me one concise recommendation bundle with the game and book titles, links where they make sense, and a short note on how each FPS pick, each Poki game, and the book complement the story-heavy shortlist you found first.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Identify at least five story-heavy short game recommendations sourced from Reddit threads and list their titles.", + "verification": "Grader confirms at least five game titles are present and that Reddit recommendation threads were opened, with at least one relevant Reddit thread visibly left open.", + "weight": 0.28 + }, + "R2": { + "requirement": "Identify at least three FPS shooter recommendations sourced from Reddit threads that fit as palate cleansers after the story-heavy games.", + "verification": "Grader confirms at least three FPS titles are listed from Reddit discussions and that the final notes frame them as contrast to the narrative shortlist.", + "weight": 0.2 + }, + "R3": { + "requirement": "Find three mind-challenging browser games on Poki and provide each title with its direct Poki 
game page link.", + "verification": "Grader confirms three Poki game titles and direct Poki URLs are included, and that the actual game pages were opened in separate tabs.", + "weight": 0.2 + }, + "R4": { + "requirement": "Read at least three Goodreads user reviews for The Very Secret Society of Irregular Witches and summarize the main takeaways from those reviews.", + "verification": "Grader confirms the response references takeaways from at least three user reviews and that the Goodreads book page is visibly open.", + "weight": 0.17 + }, + "R5": { + "requirement": "Return one concise recommendation bundle with titles, links where applicable, and brief notes explaining how the FPS picks, Poki games, and the book complement the story-heavy shortlist from step 1.", + "verification": "Grader confirms the final output combines all categories into one bundle and includes explicit complement notes tying later picks back to the story-heavy shortlist.", + "weight": 0.15 + } + }, + "categories": [ + "Games > Video Games Consoles and Accessories", + "Arts & Entertainment > Books and Literature" + ], + "num_categories": 2 + }, + { + "task_id": "8ac68fada21a861a0cf341b10bdef88a7ecd89de", + "confirmed_task": "I’m putting together a small study pack for a 5th grader and want it to feel practical, not overwhelming. Please start on Google and find one printable spelling worksheet that would work for a 5th grader who may have dyslexia, ideally something with a clean layout and readable formatting, and open the actual worksheet page so you can verify it really looks printable; keep the worksheet title and the direct page or download link, and leave that tab open so I can look at it later. Then, using that same age level, go back to Google and find one free, kid-friendly math practice site for 5th grade, and don’t just name the homepage — open the exact page where a 5th grader should begin practicing so I have a real starting point, and keep that in its own tab too. 
After that, use Google to find a video lesson that clearly covers both explicit and recursive formulas for arithmetic sequences as a stretch resource for later, then open the actual video page, start playing it, and tell me the title, URL, and creator or channel so I know it’s the right lesson. Finally, go to Citation Machine and confirm there’s a citation generator there that supports both APA and Harvard styles, because I want one place to cite the worksheet, the math site, and the video; if you can, navigate to where those style options are visible and leave that page open as proof. At the end, give me a short organized resource list with links for the worksheet, the math starting page, the video, and the citation generator site.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Provide one printable spelling worksheet suitable for a 5th grader with dyslexia, including the worksheet title and a direct page or download/print link.", + "verification": "Grader can confirm an open worksheet tab shows a printable worksheet page with readable formatting and that the returned title and link match the visible page.", + "weight": 0.27 + }, + "R2": { + "requirement": "Provide one free, kid-friendly 5th-grade math practice website and include the exact starting page URL for practice.", + "verification": "Grader can confirm the open tab is not just a homepage but a specific 5th-grade practice page appropriate for a child learner, and that the returned URL matches it.", + "weight": 0.24 + }, + "R3": { + "requirement": "Provide one video lesson that explains both explicit and recursive formulas for arithmetic sequences, including the title, URL, and creator/channel name.", + "verification": "Grader can confirm the open video page is playing or paused on the actual lesson and that the visible title and channel/creator match the returned details.", + "weight": 0.24 + }, + "R4": { + "requirement": 
"Identify Citation Machine as a citation generator site that supports both APA and Harvard styles, including the site name and URL.", + "verification": "Grader can confirm the Citation Machine page visibly shows or allows selection of both APA and Harvard citation styles.", + "weight": 0.15 + }, + "R5": { + "requirement": "Return the findings as a short organized resource list with links for the worksheet, math starting page, video, and citation generator.", + "verification": "Grader can confirm the final response contains four clearly labeled entries with the requested titles/details and working links corresponding to the opened pages.", + "weight": 0.1 + } + }, + "categories": [ + "Science and Education > Education" + ], + "num_categories": 1 + }, + { + "task_id": "365c0ba179de85bd5821988800b5706137576c2a", + "confirmed_task": "I want a compact current-events briefing I can skim in a minute or two, like the kind of snapshot a well-informed friend would pull together for me in a browser. Start on The New York Times homepage and grab the five biggest headlines that are visibly featured there right now, along with the section each one belongs to, because I want a broad read on the day before diving into anything niche; please open at least two of those headline stories in separate tabs so you can confirm the section labels and leave the NYT homepage open as a reference. Then use Google News or a normal Google search to find one recent article from a reliable publication about Call of Duty: Black Ops 6, open the actual article page, and give me the publication name plus the main takeaway so the briefing has one entertainment/tech item too. After that, go to Reddit and find the r/Futurology discussion asking what people think the future of the U.S. will be, read through the comment thread on the actual post page, and summarize the main themes people are expressing; keep that Reddit thread open so I can look at the tone myself. 
Finally, use Google to find a reliable source explaining how a U.S. federal government shutdown affects SNAP benefits, open the source page that actually answers it, and summarize clearly whether benefits continue and what exceptions or caveats apply, because I want one practical policy note in the mix. Please return everything as one concise briefing with labeled sections for NYT, Black Ops 6, Reddit sentiment, and SNAP shutdown guidance.", + "website": "https://www.google.com", + "reference_length": 6, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Include exactly five current headlines from The New York Times homepage, each paired with the correct section.", + "verification": "Grader can confirm five NYT homepage headlines were taken from the visible homepage and that at least two corresponding story tabs are open for section verification.", + "weight": 0.3 + }, + "R2": { + "requirement": "Each of the five NYT headlines is paired with the correct section label as shown on the homepage.", + "verification": "Grader can confirm each headline in the final briefing includes a section label that matches the visible NYT homepage or opened story page.", + "weight": 0.15 + }, + "R3": { + "requirement": "Include one recent Call of Duty: Black Ops 6 article from a reliable publication, naming the publication and summarizing the main takeaway.", + "verification": "Grader can see a Google results path to an opened article page from a recognizable publication and match the publication name and takeaway in the final briefing.", + "weight": 0.15 + }, + "R4": { + "requirement": "Summarize the main themes expressed in the comments of the Reddit r/Futurology discussion about the future of the U.S.", + "verification": "Grader can verify the agent opened the actual Reddit thread in r/Futurology and that the final summary reflects multiple recurring comment themes rather than only the post title.", + "weight": 0.2 + }, + "R5": { + "requirement": "Summarize reliable 
information on how a U.S. federal government shutdown affects SNAP benefits, clearly stating whether benefits continue and any exceptions or caveats.", + "verification": "Grader can confirm an opened reliable source page found via Google that directly addresses SNAP during a shutdown and compare it to the final explanation.", + "weight": 0.15 + }, + "R6": { + "requirement": "Return the results as one concise briefing with labeled sections for NYT headlines, Call of Duty: Black Ops 6, r/Futurology sentiment, and SNAP shutdown policy note.", + "verification": "Final response is a single compact briefing organized into the four requested labeled sections.", + "weight": 0.05 + } + }, + "categories": [ + "News & Media Publishers" + ], + "num_categories": 1 + }, + { + "task_id": "73c63095aeed43efb10a74eee7db7459c5ea9f84", + "confirmed_task": "I’m trying to sort out a realistic housing plan in Grand Rapids, Michigan and want to compare a normal rental against cheaper live-in alternatives, with a hotel as a short-stay fallback while I go look at places. Please start on Zillow and search Grand Rapids rentals with the monthly rent set between $1,400 and $2,400, then open one listing that looks like a real option in its own tab so I can see the photos and map, and grab the basics for it like the address, monthly price, and the Zillow listing page. Use that exact monthly rent as the benchmark for whether a trailer or other small living setup would actually save me money, then go to Craigslist for Grand Rapids and find at least three listings that seem suitable for living in and are priced below that Zillow benchmark; open the actual posting pages so you can verify they’re still live and note each title, price, and link. 
After that, because I may need somewhere temporary while I travel to inspect options, go to Booking.com and look up a hotel in Grand Rapids with visible guest reviews, open the property page, and summarize the overall guest review score plus at least three takeaways from recent reviews so I can tell what staying there would really be like. Leave the Zillow tab and the Booking.com hotel page open so I can compare them visually afterward, then give me a concise recommendation on how the rental and cheaper Craigslist alternatives stack up against the hotel, and say which seems better reviewed for a temporary stay.", + "website": "https://www.google.com", + "reference_length": 4, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Identify one Zillow rental listing in Grand Rapids, Michigan within the $1,400 to $2,400 monthly rent range and provide its address, monthly rent, and Zillow property page URL.", + "verification": "Grader can confirm the Zillow listing page is open in a tab and shows a Grand Rapids rental with rent between $1,400 and $2,400, plus the reported address and URL match the visible page.", + "weight": 0.3 + }, + "R2": { + "requirement": "Use the exact Zillow monthly rent from Step 1 as the benchmark and provide at least three Grand Rapids Craigslist listings suitable for living in that are priced below that benchmark, including each title, price, and posting URL.", + "verification": "Grader can confirm each Craigslist posting page is open or accessible, appears to be a live Grand Rapids-area listing suitable for living in, and the visible prices are all below the Zillow rent reported in Step 1.", + "weight": 0.3 + }, + "R3": { + "requirement": "For one Booking.com hotel in Grand Rapids, Michigan, report the overall guest review score and summarize at least three takeaways from recent guest reviews.", + "verification": "Grader can confirm the Booking.com property page is open and shows a review score, and the summarized takeaways are 
grounded in visible recent review content on the page.", + "weight": 0.2 + }, + "R4": { + "requirement": "Return a concise recommendation comparing the Zillow rental, the cheaper Craigslist alternatives, and the Booking.com hotel, and explicitly state which appears better reviewed for a temporary stay.", + "verification": "Grader can confirm the final summary references the gathered Zillow, Craigslist, and Booking.com findings and includes a clear conclusion about the better-reviewed temporary-stay option.", + "weight": 0.2 + } + }, + "categories": [ + "Business and Consumer Services > Real Estate", + "Travel and Tourism > Accommodation and Hotels" + ], + "num_categories": 2 + }, + { + "task_id": "e4be2c73dc00107611cd648772a11fb15c18289b", + "confirmed_task": "I’m trying to get the swirl “Getting and Cleaning Data” course working in RStudio, and the setup seems to be breaking in a few different places, so can you help me trace it in the browser like you would if you were checking this on my machine? Start on GitHub and find the actual repository location for the swirl “Getting and Cleaning Data” course files, then open the real course folder so you can verify the exact folder path I should be pointing to when I install or load that course in R. Please leave that GitHub course page open in its own tab so I can look at the folder structure afterward. Once you’ve confirmed that path, use Google to look up the specific swirl problem people hit when loading Lesson 1, “Manipulating Data with dplyr,” and find a practical fix that makes sense in the context of the course files being in the right place. If the fix mentions checking objects, packages, or column references, then in another tab look up the common reasons R or RStudio throws “object not found” or doesn’t recognize a data frame column name, because I want a short checklist of what to verify next if the lesson still fails. 
After that, also use Google to find the fix for the Excel import error “libxls error: unable to open file,” and make sure you get the correct R code for opening a .xlsx file with the right package and function, since that happened earlier in the same workflow. In the end, give me a concise troubleshooting note in that exact order — folder path first, then the Lesson 1 fix, then the object or column-name checks, then the Excel import fix — and tie the later fixes back to the earlier setup issue so it reads like one clean diagnosis.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Identify the actual GitHub location of the swirl \"Getting and Cleaning Data\" course files and state the specific folder path the user should point to in R.", + "verification": "Grader can confirm the agent opened the GitHub course folder page and reported the exact course folder path consistent with the visible repository structure.", + "weight": 0.3 + }, + "R2": { + "requirement": "Summarize the specific Lesson 1 (\"Manipulating Data with dplyr\") loading issue and provide a practical fix tied to the confirmed course setup context.", + "verification": "Grader can confirm the agent visited a relevant Google result page about the swirl Lesson 1 issue and the final note connects the fix to the earlier course-path verification.", + "weight": 0.25 + }, + "R3": { + "requirement": "Explain common reasons R shows \"object not found\" or fails to recognize a data frame column name, with at least three concrete checks or fixes.", + "verification": "Grader can confirm the agent opened a relevant source page and the final note includes at least three distinct troubleshooting checks such as spelling/case, df$col or proper column reference, package loading, object existence, or environment scope.", + "weight": 0.2 + }, + "R4": { + "requirement": "Provide a fix for the \"libxls error: unable to open file\" issue and 
include correct R code to read a .xlsx file using the proper package and function.", + "verification": "Grader can confirm the agent opened a source page showing the correct Excel-reading approach and the final answer includes valid .xlsx import code with the appropriate package/function.", + "weight": 0.15 + }, + "R5": { + "requirement": "Return the final answer as a concise troubleshooting note in the requested order, with later fixes tied back to the earlier setup issue.", + "verification": "Grader can confirm the response is ordered as folder path, Lesson 1 fix, object/column checks, then Excel import fix, and that it reads as one connected troubleshooting flow rather than isolated notes.", + "weight": 0.1 + } + }, + "categories": [ + "Science and Education > Education", + "Computers Electronics and Technology > Programming and Developer Software" + ], + "num_categories": 2 + }, + { + "task_id": "139b0e467c6e335945c64249c22929516253c1bb", + "confirmed_task": "I’m thinking about signing up for UserTesting for a little side income, but I don’t want to waste time on something sketchy or low-paying, so can you sanity-check it for me in the browser? First, use Google to find at least one independent article or ranking that treats Userlytics as one of the better user-testing platforms, because I want a real comparison point that isn’t coming from Userlytics itself; open the actual ranking page and leave it open in a tab so I can look at the source. Then go to UserTesting’s own site and find what they say contributors can earn for tests, including any rates, ranges, or conditions that affect payment, and keep that page open too. 
After that, go through UserTesting’s contributor help or support pages and pull out at least five specific things a contributor should avoid doing during tests so I can tell how easy it would be to get rejected or rated badly; if the guidance is spread across multiple help pages, open the most relevant ones in separate tabs so I can compare them. Finally, use Google to get to Irreality Labs Inc’s official website, look through what the company says it does, and then use visible public details from the site or linked company profiles to judge whether it looks like a legitimate business and the kind of company that might realistically use UX research or testing platforms like these. Please give me a concise wrap-up with the independent Userlytics source and standing, UserTesting pay details, at least five contributor mistakes to avoid, and your judgment on Irreality Labs Inc, and leave the key pages open so I can verify them myself.", + "website": "https://www.google.com", + "reference_length": 6, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Identify at least one independent, non-Userlytics source that ranks or reviews Userlytics among leading user testing platforms, including the source name and Userlytics’ stated position or standing.", + "verification": "A browser tab is open to a third-party ranking/review page where Userlytics is visibly listed or discussed as a leading platform, and the response names the source and standing shown on that page.", + "weight": 0.18 + }, + "R2": { + "requirement": "Report what UserTesting says contributors can earn, including any stated pay rates, ranges, examples, or conditions for payment from UserTesting’s own site.", + "verification": "A UserTesting page is open showing contributor earnings information, and the response accurately reflects the visible pay details and any stated conditions.", + "weight": 0.22 + }, + "R3": { + "requirement": "List at least five specific actions contributors should avoid during 
UserTesting tests, based on the Contributor Code of Conduct and privacy or contributor guidance.", + "verification": "One or more support.usertesting.com tabs are open to relevant guidance pages, and the response includes five or more avoidable behaviors that are clearly grounded in those pages.", + "weight": 0.24 + }, + "R4": { + "requirement": "Summarize what Irreality Labs Inc does using information from its official website, including its products and/or services.", + "verification": "The official Irreality Labs Inc site is open, and the response describes the company’s offerings in a way that matches visible site content.", + "weight": 0.14 + }, + "R5": { + "requirement": "Provide a legitimacy judgment on Irreality Labs Inc based on verifiable public company details such as address, leadership, registrations, linked profiles, or other visible public-facing business information.", + "verification": "The response cites concrete public details visible on the official site or linked company profiles and uses them to support a legitimacy judgment.", + "weight": 0.12 + }, + "R6": { + "requirement": "Deliver a concise final summary that ties together the independent Userlytics comparison point, UserTesting contributor pay details, at least five things contributors should avoid, and the judgment on Irreality Labs Inc including whether it seems like the kind of company that might use research or testing platforms.", + "verification": "The final response integrates findings from all prior steps into a short, coherent recommendation rather than listing disconnected facts.", + "weight": 0.1 + } + }, + "categories": [ + "Jobs and Career > Jobs and Employment", + "Business and Consumer Services > Business Services" + ], + "num_categories": 2 + }, + { + "task_id": "e1aca0ae8174c6f1847be80c82d8adc63d031b23", + "confirmed_task": "I’m trying to plan a beginner-friendly fitness outing in Fresno sometime this week, and I want to compare a couple of calmer yoga options with 
something more high-energy before I decide. Please start on toweryogafresno.com and pull up the actual class schedule for Tower Yoga Fresno, then note the days and times when their Tower Yoga classes are offered so I can use that as my baseline for what would fit my week; if there’s a schedule page or calendar view, leave it open so I can glance at it myself. Then go to bluemoonyogastudios.com and figure out what kind of yoga studio Blue Moon Yoga is in Fresno, and list the Fresno studio locations shown on the site, because convenience matters if Tower Yoga’s times don’t work for me; please open the Fresno location information in its own tab so I can visually compare the studio names. After that, check fresnofightgirl.com and see what Fight Girl Fitness offers in Fresno, especially the types of classes they have and how a brand-new person is supposed to get started, like whether there’s a trial, membership, booking flow, or intro option, since I might want something more energetic than yoga. 
In the end, give me a short recommendation on which of the three seems easiest for a beginner this week based on the schedule details and how straightforward the getting-started process looks, and keep the most useful pages open in separate tabs so I can compare them.", + "website": "https://www.google.com", + "reference_length": 6, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Report the days and times when Tower Yoga classes are offered from the Tower Yoga Fresno schedule.", + "verification": "Grader confirms the answer matches the visible schedule or calendar page left open on toweryogafresno.com.", + "weight": 0.24 + }, + "R2": { + "requirement": "Describe what Blue Moon Yoga is in Fresno based on the Blue Moon Yoga Studios website.", + "verification": "Grader confirms the description is supported by visible text on the relevant Blue Moon Yoga page.", + "weight": 0.14 + }, + "R3": { + "requirement": "List the names of the Fresno studio locations shown for Blue Moon Yoga on the website.", + "verification": "Grader confirms the listed location names match the Fresno location information visible in the opened Blue Moon tab.", + "weight": 0.16 + }, + "R4": { + "requirement": "Summarize the types of classes offered by Fight Girl Fitness in Fresno.", + "verification": "Grader confirms the class types are supported by visible class or program information on fresnofightgirl.com.", + "weight": 0.16 + }, + "R5": { + "requirement": "Explain how a new person can get started with Fight Girl Fitness, including any memberships, trial, booking, or participation details available on the website.", + "verification": "Grader confirms the getting-started summary matches visible onboarding, membership, booking, or introductory information on the site.", + "weight": 0.16 + }, + "R6": { + "requirement": "Provide a short recommendation comparing Tower Yoga, Blue Moon Yoga, and Fight Girl Fitness, and identify which option seems easiest for a beginner based on 
schedule convenience and onboarding details.", + "verification": "Grader confirms the recommendation references findings from all three sites and is consistent with the extracted schedule, location, and getting-started details.", + "weight": 0.14 + } + }, + "categories": [ + "Health > Nutrition Diets and Fitness" + ], + "num_categories": 1 + }, + { + "task_id": "4b9850333bdd7298442df495aff3832c13b119da", + "confirmed_task": "I’m trying to put together a cozy monthly subscription night for a friend in the UK and want one tidy recommendation I can actually look at in the browser afterward. Please start on Beer52 and figure out which of their beer subscription plans are genuine monthly options that deliver within the UK, because I only want plans that would work for a regular monthly treat here rather than anything one-off or unclear. Open the relevant Beer52 plan pages in separate tabs and leave the best evidence visible so I can compare them myself. Once you know the monthly beer choices, go to snackd.co.uk and find at least three snack subscription boxes that also deliver to the UK, then open each actual product or brand page so you can verify from the site itself that UK delivery or shipping is available and leave those tabs open too, since I want to see the boxes and not just a summary. After that, use Google to work out which UK streaming subscription service includes the movie “Wicked” at no extra cost in the base subscription, and please make sure it’s not just a rental, purchase option, add-on channel, or ad-supported loophole because I want something I could actually pair with the subscription night without paying extra for the film. Open the page that proves where it’s included and keep that tab available. 
Finally, check PrintPigeon and explain, in plain English, how it takes an email or online message and turns it into a posted letter, and tell me the current price for sending one standard UK letter so I can decide whether mailing the invite is worth the extra effort. In the end, give me one coherent summary that pulls together the Beer52 monthly UK options, the three snack alternatives with UK-delivery proof, the streaming service for “Wicked,” and the PrintPigeon explanation and letter price.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Correctly identify the Beer52 subscription plans that are monthly and deliver within the UK.", + "verification": "Grader can confirm qualifying Beer52 plans from open tabs showing plan details and visible evidence of monthly cadence and UK delivery availability.", + "weight": 0.28 + }, + "R2": { + "requirement": "Find at least three snack subscription boxes from snackd.co.uk and provide evidence from each box’s own page or linked site that UK delivery or shipping is available.", + "verification": "Grader can inspect at least three open listing/brand tabs and see visible UK shipping or delivery wording for each snack subscription option.", + "weight": 0.28 + }, + "R3": { + "requirement": "Correctly determine which UK streaming subscription service includes “Wicked” at no extra cost in the base subscription, excluding rental, purchase, add-ons, and ad-supported access.", + "verification": "Grader can confirm from the opened proof page that “Wicked” is included with the named service’s subscription and not presented as a rent/buy/add-on-only option.", + "weight": 0.22 + }, + "R4": { + "requirement": "Accurately explain how PrintPigeon turns an email or online message into a posted letter and state the current price for one standard UK letter.", + "verification": "Grader can verify the explanation and price against visible PrintPigeon site content 
showing the workflow and standard-letter pricing.", + "weight": 0.12 + }, + "R5": { + "requirement": "Produce one coherent final summary combining the Beer52 monthly UK options, three snack alternatives with UK-delivery proof, the qualifying streaming service for “Wicked,” and the PrintPigeon mailing explanation and price.", + "verification": "Final response includes all required components in a single integrated summary with no missing category.", + "weight": 0.1 + } + }, + "categories": [ + "Food and Drink > Beverages", + "Ecommerce & Shopping > Ecommerce and Shopping - Other" + ], + "num_categories": 2 + }, + { + "task_id": "946321e8a9788f485d360f619127a2e7b7e1693a", + "confirmed_task": "I’m planning a cozy Christmas day at home with my toddler and want ideas for both of us that I can actually look at on screen afterward. Could you start on Hobbycraft and find three low-mess Christmas craft activities that feel toddler-friendly, then open the actual project pages in separate tabs so I can compare the photos and note the materials each one needs? After that, use Google to find three more at-home Christmas activity ideas for a toddler that are clearly different from the Hobbycraft ones, because I want a fuller mix of options beyond simple repeats, and open at least one of those source pages so I can see it’s a real activity page. Once you’ve got the toddler plan sorted, switch over to Scratch Magazine and look for Grinch Christmas nail inspiration, then find one Grinch tutorial page I could realistically follow and leave that tutorial page open so I can check the design details myself. From there, go to Amazon and find at least three Grinch-themed nail art ideas or products that use different materials, like stickers or decals, gel polish, brushes, glitter, rhinestones, stamping plates, or similar, and make sure they actually fit the style or techniques shown on the Scratch tutorial. 
Please open the most promising Amazon product pages in their own tabs so I can compare them visually, and then give me a clear summary with the toddler activity options, the Scratch tutorial, and the three nail product ideas with links plus a quick note on how each one matches the Grinch look.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Exactly three low-mess toddler-friendly Christmas craft activities are identified on Hobbycraft, with materials summarized for each.", + "verification": "Grader can confirm three distinct Hobbycraft activity pages were opened or visited and the final response includes each activity name plus its materials list.", + "weight": 0.24 + }, + "R2": { + "requirement": "At least three additional at-home toddler Christmas activity ideas are found via Google and are different from the Hobbycraft activities, each with a short description.", + "verification": "Grader can confirm Google search results were used, at least one non-Hobbycraft source page was opened, and the final response lists three distinct additional ideas that do not duplicate the Hobbycraft ones.", + "weight": 0.22 + }, + "R3": { + "requirement": "Grinch Christmas nail inspiration is found on Scratch Magazine, including one specific Grinch tutorial page title and URL.", + "verification": "Grader can confirm navigation on Scratch Magazine to a Grinch-related nail page and that a tutorial page was left open or clearly identified with title and link.", + "weight": 0.2 + }, + "R4": { + "requirement": "At least three Amazon Grinch-themed nail art ideas or products are found using different material types, such as decals, gel polish, glitter, brushes, rhinestones, or stamping plates.", + "verification": "Grader can confirm three Amazon product pages or listings were opened or visited, with products spanning different materials such as decals, gel, glitter, brushes, rhinestones, or stamping tools.", 
+ "weight": 0.2 + }, + "R5": { + "requirement": "The final summary ties the Amazon products back to the Scratch tutorial style or techniques and includes all toddler activities, the tutorial, and product links.", + "verification": "Grader can confirm the final response contains the three Hobbycraft crafts with materials, three additional Google-sourced toddler ideas with descriptions, one Scratch tutorial title and URL, and three Amazon product ideas with links plus notes explaining how they match the Scratch inspiration.", + "weight": 0.14 + } + }, + "categories": [ + "Hobbies and Leisure > Crafts", + "Community and Society > Holidays and Seasonal Events" + ], + "num_categories": 2 + }, + { + "task_id": "54c07bd8afe7d70cd55b716977d2c29f1b2a91e9", + "confirmed_task": "I’m trying to put together a quick but thoughtful Christmas gift shortlist for a few different people, and I want it to feel balanced instead of random. First, on The New York Times site, find a gift-guide article aimed at hard-to-shop-for people that actually shows prices and includes Amazon purchase links, because I want one practical gift from a credible roundup to use as the anchor for the whole list; open the article itself and use it to pick one practical item, then leave that article tab open so I can look at the recommendations later. Once you’ve got that anchor gift, go to Etsy and search for personalized custom Christmas ornaments that would work as sentimental add-ons, and open three promising listings in separate tabs so I can compare the photos, names, and prices like a real shopper would. After that, head to Duke Cannon’s site and look through the Holiday Collection for two gifts that feel more like stocking-stuffer options for a guy, and open the actual product pages so you can grab the names, prices if shown, and links from the live listings. 
Then round it out on Lookfantastic by browsing men’s Christmas gifting toiletries or body gift sets and finding three options with current prices, mainly so I can compare whether those feel like better value than the Duke Cannon picks; please open the product pages for the three best matches in separate tabs too. In the end, send me a concise shortlist with the NYT article URL, the one practical anchor gift, the three Etsy ornament options, the two Duke Cannon holiday gifts, and the three Lookfantastic men’s gift-set options, all with names, prices when shown, and URLs.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "A qualifying New York Times hard-to-shop-for gift article is identified and the response includes the article URL plus one practical anchor gift taken from that article.", + "verification": "Grader can confirm the NYT page is an actual gift-guide article for hard-to-shop-for people, that prices are shown in the article, and that at least three items in the article include Amazon purchase links; the chosen anchor gift appears within that article.", + "weight": 0.24 + }, + "R2": { + "requirement": "Three different Etsy personalized custom Christmas ornament options are collected with names, prices, and URLs from live listings.", + "verification": "Grader can confirm three separate Etsy listing tabs are open or were opened, and each listing visibly shows a personalized/custom ornament product with a name, price, and distinct URL.", + "weight": 0.22 + }, + "R3": { + "requirement": "Two Duke Cannon Holiday Collection gifts suitable as stocking-stuffer style options for a guy are provided with names, URLs, and prices when shown.", + "verification": "Grader can confirm both items come from Duke Cannon’s Holiday Collection or holiday gift area and that the live product pages show the product names and URLs, with prices captured if visible.", + "weight": 0.18 + }, + "R4": { + 
"requirement": "Three Lookfantastic men’s Christmas gifting toiletries or body gift sets are listed with current prices and URLs.", + "verification": "Grader can confirm three separate Lookfantastic product pages were opened and that each item is a men’s Christmas gifting toiletry/body gift set with a visible current price and URL.", + "weight": 0.18 + }, + "R5": { + "requirement": "The final output is a concise, combined Christmas gift shortlist covering all required categories: one NYT-inspired practical anchor gift, three Etsy ornaments, two Duke Cannon gifts, and three Lookfantastic comparison options.", + "verification": "Grader can confirm the final response includes all nine gift options plus the NYT article URL, and each entry contains the required identifying details in a compact shortlist format.", + "weight": 0.18 + } + }, + "categories": [ + "Ecommerce & Shopping > Ecommerce and Shopping - Other", + "Community and Society > Holidays and Seasonal Events", + "Lifestyle > Gifts and Flowers" + ], + "num_categories": 3 + }, + { + "task_id": "d8fe04d1cf29251d68382cde58f4424e80bad07c", + "confirmed_task": "I’m trying to figure out a shared journaling setup for me and my partner, but I’m pretty cautious about privacy and especially about what outside AI tools can get into. To set a baseline first, please go to Oura’s support site and find the guidance around Oura Membership privacy, specifically anything that explains how to stop an AI agent from accessing membership information, and leave that support article open so I can look at the exact wording myself. Once you’ve got that privacy baseline, head over to Journey Cloud and look through its journaling plans and any pages about shared journals, partner or couples use, or collaborative entries, because I want to know whether it would actually work for two people without feeling too exposed; if there are relevant pricing or plan pages, open the main options in separate tabs so I can compare them visually later. 
After that, check the page about the Huan Dao Meditation app on Formfacade and tell me what the app is and which Eastern spiritual wellness methods it says it uses, since I want one concrete example of the kind of wellness features people sometimes pair with journaling. Then use Google to find Norton Secure VPN’s official product page, open the real Norton page, and pull at least three advertised features from it so I have a simple privacy-tool reference point; please keep the Norton product page open too. In the end, give me a concise recommendation on whether Journey Cloud seems to fit the privacy expectations set by the Oura guidance, while also mentioning Huan Dao as a wellness-feature example and Norton Secure VPN as a privacy comparison point.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Find and summarize Oura support guidance on Oura Membership privacy, including the specific instruction or policy for preventing an AI agent from accessing membership information.", + "verification": "Grader can confirm the browser is on a relevant support.ouraring.com article and that the final answer includes the privacy guidance and explicit AI-agent access prevention detail sourced from that page.", + "weight": 0.28 + }, + "R2": { + "requirement": "Summarize Journey Cloud’s shared journaling options, including any couples, partner, friend, or shared-journal capabilities, and include pricing details from Journey Cloud.", + "verification": "Grader can confirm relevant journey.cloud pricing and feature pages were opened, ideally in separate tabs, and that the final answer reports both sharing-related features and plan pricing.", + "weight": 0.28 + }, + "R3": { + "requirement": "Identify what the Huan Dao Meditation app is and list the Eastern spiritual wellness methods it claims to use.", + "verification": "Grader can confirm the formfacade.com page about Huan Dao Meditation was visited and 
that the final answer accurately states the app description and named methods from that page.", + "weight": 0.14 + }, + "R4": { + "requirement": "Use Google to locate Norton Secure VPN’s official product page, summarize what the product is, and list at least three advertised features from Norton’s page.", + "verification": "Grader can confirm a Google results page was used, the official Norton page was opened and left visible, and the final answer includes at least three features taken from that product page.", + "weight": 0.16 + }, + "R5": { + "requirement": "Provide a final recommendation on whether Journey Cloud fits the user’s privacy expectations, explicitly using Oura’s privacy baseline and incorporating Journey Cloud findings, with Huan Dao as a wellness-feature example and Norton Secure VPN as a privacy comparison point.", + "verification": "Grader can confirm the final synthesis references findings from Oura and Journey Cloud directly, and also mentions Huan Dao and Norton in the recommendation rather than listing them separately without comparison.", + "weight": 0.14 + } + }, + "categories": [ + "Computers Electronics and Technology > Computers Electronics and Technology - Other", + "Health > Health - Other", + "Lifestyle > Lifestyle - Other" + ], + "num_categories": 3 + }, + { + "task_id": "e20742e26a4d6f9c3d62a9d1cef634297bd4204f", + "confirmed_task": "I’m putting together a small Christmas get-together here in the UK and want one tidy plan I can glance at later. Could you start on Sainsbury’s and find three gluten-free Christmas starter ideas that feel properly festive, then open the actual recipe or product pages in separate tabs so I can compare the photos and names, because I’m trying to decide whether the meal should lean more elegant, cosy, or party-food style. 
Once you’ve got that food direction, go to The Kitchn and find their guidance for cooking a spiral-cut ham, then give me the main cooking approach in a short summary that matches the festive theme from the starters so I can picture the full menu. After that, switch to Marks & Spencer and find three stocking-filler gift ideas for the dads coming over, keeping each one under £20, and open the product pages so I can visually compare whether they feel useful or just novelty gifts. Then check Boots for two men’s gift sets that include aftershave or body spray, with prices, so I can compare those against the M&S options as slightly more polished backups. Please leave the most promising Sainsbury’s starter tab and the two best gift pages open at the end, and give me one concise menu-and-gift shortlist with item names, prices where relevant, and a quick note on why each option fits.", + "website": "https://www.google.com", + "reference_length": 7, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Provide three gluten-free Christmas starter ideas from Sainsbury’s with the correct names.", + "verification": "Grader can confirm three distinct Sainsbury’s starter recipe or product pages were opened and the returned names match visible page titles.", + "weight": 0.22 + }, + "R2": { + "requirement": "Use the Sainsbury’s starters to infer a festive food theme and carry that into the menu framing.", + "verification": "Final write-up explicitly links the starter choices to a coherent festive style such as elegant, cosy, or party-food and uses that framing for the main course.", + "weight": 0.12 + }, + "R3": { + "requirement": "Summarize The Kitchn’s recommended method for cooking a spiral-cut ham, including the key preparation and cooking steps.", + "verification": "Returned summary reflects the visible Kitchn article guidance on how to prepare, heat, and finish a spiral-cut ham.", + "weight": 0.2 + }, + "R4": { + "requirement": "Provide three Marks & Spencer 
stocking-filler gift ideas suitable for fathers that each cost under £20, including item name and price.", + "verification": "Grader can confirm each selected M&S product page shows a price below £20 and that the returned names and prices match the visible listings.", + "weight": 0.18 + }, + "R5": { + "requirement": "Provide two Boots men’s gift sets that include aftershave or body spray, including item name and price for each.", + "verification": "Grader can confirm the Boots product pages are gift sets for men and that the visible product details indicate aftershave or body spray is included.", + "weight": 0.14 + }, + "R6": { + "requirement": "Include browser-only proof by leaving open the most promising Sainsbury’s starter tab and the two best gift product pages at the end.", + "verification": "Open tabs at completion include one Sainsbury’s starter page and two selected gift pages from M&S and/or Boots for visual review.", + "weight": 0.06 + }, + "R7": { + "requirement": "Return everything as one concise menu-and-gift shortlist with names, prices where relevant, and a brief note explaining why each option fits.", + "verification": "Final response is a combined shortlist covering starters, ham approach, M&S gifts, and Boots gift sets, with short fit notes and prices where applicable.", + "weight": 0.08 + } + }, + "categories": [ + "Food and Drink > Cooking and Recipes", + "Community and Society > Holidays and Seasonal Events" + ], + "num_categories": 2 + }, + { + "task_id": "69f48a0950d532a2f04ff51abe4bf0e05ec5649e", + "confirmed_task": "I'm planning a trip from London to Seoul around July and want you to help me narrow things down in a realistic booking flow. First, compare the cheapest round-trip economy flight options you can find from London to Seoul for a departure on July 17th and return in early August using the flight sites available, and use what you find to identify the best-value option overall. 
Once you've got that flight shortlist and winner, switch to Booking.com and check two Seoul properties I might use for my whole stay — The Joseon Hotel and The Lotte Hotel. For each one, look at the guest review score and read recent reviews so I can tell whether either place seems reliably good enough for a pre-flight night. In the end, give me a concise recommendation that names the cheapest flight source and fare you found, summarizes whether The Joseon Hotel and The Lotte Hotel are generally very positive, and highlights at least three recent review takeaways for The Lotte Hotel so I can decide if I should book that hotel before the London flight.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Compare round-trip economy flight options from London to Seoul departing July 17th and returning in early August across multiple flight sites, and identify the cheapest option with fare and source.", + "verification": "Grader can confirm flight search results are shown for London to Seoul with the correct dates, economy class, and the cheapest fare is identified with airline/price/source site.", + "weight": 0.25 + }, + "R2": { + "requirement": "Identify the best-value flight option overall across the sites searched and name the cheapest flight source and fare.", + "verification": "Final response explicitly names which site offered the cheapest fare and states the price.", + "weight": 0.15 + }, + "R3": { + "requirement": "Look up The Joseon Hotel on Booking.com and report its guest review score and whether reviews are generally very positive.", + "verification": "Grader can confirm the Booking.com property page for The Joseon Hotel in Seoul is open or was visited, and the review score is reported.", + "weight": 0.2 + }, + "R4": { + "requirement": "Look up The Lotte Hotel on Booking.com, report its guest review score, and highlight at least three recent review takeaways.", + "verification": 
"Grader can confirm the Booking.com property page for The Lotte Hotel in Seoul is open or was visited, the review score is reported, and at least three concrete review takeaways are included.", + "weight": 0.25 + }, + "R5": { + "requirement": "Provide a concise recommendation naming the cheapest flight source and fare, summarizing whether The Joseon Hotel and The Lotte Hotel are generally very positive, and including the three Lotte Hotel review takeaways.", + "verification": "Grader can confirm the final response covers all three components: flight winner, both hotel review assessments, and at least three Lotte Hotel review themes.", + "weight": 0.15 + } + }, + "categories": [ + "Travel and Tourism > Air Travel", + "Travel and Tourism > Accommodation and Hotels" + ], + "num_categories": 2 + }, + { + "task_id": "24c664186e6839e1a0a117041480ff143bf8c91a", + "confirmed_task": "I’m trying to sanity-check whether moving ahead with a Tesla Model 3 lease in Los Angeles is actually manageable month to month, so start on Google and look up current Tesla Model 3 lease pricing for the Los Angeles area, including the lease term, due-at-signing amount, and any discounts, tax credits, or rebates you can find, because I want a realistic baseline instead of just a headline number. Once you’ve got that monthly lease figure, use it as a reference point and go to the Los Angeles Craigslist site, specifically the San Gabriel Valley section, and find at least three trailer listings that look like plausible live-in fallback options under $10,000; open the actual posting pages in separate tabs so I can see the photos and verify the listings are still live, then note each one’s title, price, and location. 
After that, go to Zillow and look for an LA-area rental whose monthly price is in the same ballpark as the Tesla lease payment you found, so I can compare whether paying for housing at that level makes more sense than taking on the car; open the actual Zillow listing page and leave it open so I can check the photos and map myself. In the end, give me a short comparison that includes the Tesla lease deal, the three Craigslist trailer backups, and the Zillow rental option that’s closest in monthly price to the lease.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Find and summarize a current Los Angeles Tesla Model 3 lease offer with monthly payment, lease term, due-at-signing amount, and any available discounts or rebates.", + "verification": "Grader confirms the response includes all four lease elements and that the information is consistent with the Google results or linked source pages viewed during browsing.", + "weight": 0.35 + }, + "R2": { + "requirement": "Identify at least three San Gabriel Valley Craigslist trailer listings under $10,000 that appear suitable for living in and provide each listing’s title, price, and location.", + "verification": "Grader confirms three distinct Craigslist posting pages were opened and that each reported title, price, and location matches the visible listing pages.", + "weight": 0.3 + }, + "R3": { + "requirement": "Open the actual Craigslist posting pages in separate tabs so the listings can be visually checked for photos and live status.", + "verification": "Grader confirms multiple Craigslist tabs are open on individual posting pages rather than only search results, with visible listing content and photos/status indicators.", + "weight": 0.1 + }, + "R4": { + "requirement": "Find one Zillow rental listing in the Los Angeles area with monthly rent close to the Tesla lease payment and provide its address and Zillow property page.", + 
"verification": "Grader confirms the Zillow listing page is open and the reported address and URL match the visible property page, with rent in the same general monthly range as the lease figure.", + "weight": 0.15 + }, + "R5": { + "requirement": "Present a short comparison tying together the Tesla lease baseline, the three Craigslist trailer fallback options, and the Zillow rental option closest in monthly price.", + "verification": "Grader confirms the final response includes all components in a concise comparison and explicitly relates the Zillow rental to the lease payment benchmark.", + "weight": 0.1 + } + }, + "categories": [ + "Vehicles > Makes and Models", + "Finance > Banking Credit and Lending" + ], + "num_categories": 2 + }, + { + "task_id": "44f1e02116715d5fe313996811b358fe25bc3ee4", + "confirmed_task": "I’m trying to put together a quick accessory shortlist for the different cars in our household, and I want to see real product pages rather than just a text summary. Please start on WeatherTech and use the vehicle selector for a 2020 Toyota Highlander to confirm which floor mat option and which cargo liner or cargo mat option actually fit, because the Highlander is the one I’m most likely to buy for first. When you find the cargo liner page, open the actual product listing and leave that tab open so I can look at the photos and fitment details later. After you’ve confirmed the Highlander cargo setup, go to Temu and find a set of car headrest hooks that would work for a Fiat 500, mainly so I can compare whether a cheap organizer add-on is enough for our smaller car instead of doing a full cargo solution. Open the actual Temu listing and note the product name plus whatever compatibility details on the page make it seem usable with a Fiat 500. 
Then use Google to find one suitable LED emblem option for a 2023 Honda Civic, open the actual product result in its own tab, and grab the product name and current price so I have one exterior accessory idea to round out the shortlist. In the end, give me a concise comparison covering the Highlander fitment details, the direct WeatherTech cargo liner link, the Fiat 500 hook listing with compatibility notes, and the Civic LED emblem option with price.", + "website": "https://www.google.com", + "reference_length": 7, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Identify the WeatherTech floor mat product that fits a 2020 Toyota Highlander and report the fitment details.", + "verification": "Grader can confirm the WeatherTech vehicle-selected results or product page shows 2020 Toyota Highlander fitment for the floor mat option.", + "weight": 0.2 + }, + "R2": { + "requirement": "Identify the WeatherTech cargo liner or cargo mat product that fits a 2020 Toyota Highlander and include the direct product page link.", + "verification": "Grader can confirm the open WeatherTech cargo liner tab shows the fitting Highlander cargo product and that a direct link is provided.", + "weight": 0.25 + }, + "R3": { + "requirement": "Find one Temu car headrest hook set that would work with a Fiat 500 and report the product name.", + "verification": "Grader can confirm the Temu listing page is open and the reported product name matches the visible listing title.", + "weight": 0.15 + }, + "R4": { + "requirement": "Report compatibility details from the Temu listing that support why the hook set would work with a Fiat 500.", + "verification": "Grader can verify the cited listing text or specs mention universal fit, seat headrest mounting, dimensions, or other compatibility cues visible on the Temu page.", + "weight": 0.15 + }, + "R5": { + "requirement": "Find one suitable LED emblem option for a 2023 Honda Civic and report its product name.", + "verification": "Grader can 
confirm the selected Google result or opened product page shows an LED emblem option associated with a 2023 Honda Civic.", + "weight": 0.1 + }, + "R6": { + "requirement": "Include the displayed price for the selected 2023 Honda Civic LED emblem option.", + "verification": "Grader can verify the reported price matches the visible price on the Google result or opened product page.", + "weight": 0.05 + }, + "R7": { + "requirement": "Return a concise comparison covering all three vehicles and the requested accessory categories.", + "verification": "Grader can confirm the final response includes Highlander floor and cargo fitment details with cargo link, Fiat 500 hook name with compatibility notes, and Civic LED emblem name with price in a compact comparison format.", + "weight": 0.1 + } + }, + "categories": [ + "Vehicles > Makes and Models", + "Ecommerce & Shopping > Ecommerce and Shopping - Other" + ], + "num_categories": 2 + }, + { + "task_id": "18ddad3e0781d4b8fb2e1998ff836a0b07d0cdce", + "confirmed_task": "I’m in Boston for the next two days on a work trip with my wife, and I want to lock in two different dinners that feel right for each occasion. First, on the Michelin Guide site, please find me one Boston restaurant that’s Michelin-starred or clearly in that polished fine-dining tier for a business dinner, and keep it around a $500 per person ceiling so I know it’s appropriate without going overboard. Then switch to OpenTable and look for a separate restaurant in Boston that feels genuinely romantic for an anniversary dinner, ideally Italian seafood or something very Boston-specific, because I want the second night to feel more personal and celebratory. 
Once you have both, use Google to open the actual restaurant websites or current listing pages in separate tabs and verify that the business-dinner place really looks suitable for client-style dining and that the anniversary place clearly handles special occasions like anniversaries or romantic dinners; while you’re there, check whether each one appears to have availability sometime in the next two days. Please leave the final restaurant pages open so I can look at the photos, ambiance, and booking details myself, and give me a short summary of which place is for the business dinner and which is for the anniversary dinner, why each one fits, the expected price level, the special-occasion evidence you found, and the availability status for the next two days.", + "website": "https://www.google.com", + "reference_length": 4, + "level": "medium", + "rubrics": { + "R1": { + "requirement": "Select one Boston restaurant from the Michelin Guide or a clearly equivalent fine-dining listing that is appropriate for a business dinner and plausibly within the stated $500 per person budget.", + "verification": "Grader can confirm a Michelin Guide restaurant page or equivalent fine-dining page is open showing the Boston restaurant name, cuisine/style, and price indicators consistent with the budget.", + "weight": 0.3 + }, + "R2": { + "requirement": "Select one separate Boston restaurant from OpenTable that is romantic and suitable for an anniversary dinner, matching the preference for Italian seafood or Boston-specific cuisine.", + "verification": "Grader can confirm an OpenTable listing page is open showing the restaurant name, cuisine category, and visual or descriptive cues indicating a romantic/special-occasion setting.", + "weight": 0.25 + }, + "R3": { + "requirement": "Verify that the business-dinner restaurant appears suitable for polished client or business dining, report the expected price level, and report whether it has availability within the next two days.", + 
"verification": "Grader can confirm an official site or current booking/listing page is open for the business restaurant with visible evidence such as fine-dining descriptions, ambiance cues, price indicators, reservation interface, or available time slots/dates within the next two days, and the response states the expected price level.", + "weight": 0.225 + }, + "R4": { + "requirement": "Verify that the anniversary restaurant handles romantic or special-occasion dining, report the expected price level, and report whether it has availability within the next two days.", + "verification": "Grader can confirm an official site or current booking/listing page is open for the anniversary restaurant with visible evidence such as anniversary/private dining language, romantic ambiance cues, price indicators, guest photos, or available reservation times/dates within the next two days, and the response states the expected price level.", + "weight": 0.225 + } + }, + "categories": [ + "Food and Drink > Restaurants and Delivery", + "Travel and Tourism > Travel and Tourism - Other" + ], + "num_categories": 2 + }, + { + "task_id": "753ce2163f6e018ea33423ad4400ba3f759e9df8", + "confirmed_task": "I’m daydreaming about doing a ridiculous-but-fun summer baseball trip where I see exactly one game at all 30 MLB stadiums, and I want you to build the whole thing in a way I could actually use. Start on MLB.com and pull the official summer schedule so we can choose one real game date at each stadium, and please lean toward matchups where I might get to see stars I care about most like Shohei Ohtani, Aaron Judge, and Ronald Acuna Jr. whenever that’s realistically possible. As you’re picking games, open the actual game or team schedule pages in separate tabs for a few representative stops so there’s visible proof the dates are live, and keep the key schedule tabs open so I can glance at them later. 
Once you’ve got the 30 stadium/date choices, use Google Flights to figure out the smartest sequence between stops and compare flights versus driving for each leg, using whatever is cheaper and more practical in summer, because I want this to feel like a real budget-conscious trip instead of fantasy routing. After that, use Booking.com to find one hotel option for each game night that’s reasonably close to the stadium—something like within about 2 miles if possible and not outrageously priced for a solo traveler—and open at least a couple of the actual hotel listing pages with photos/maps so I can visually sanity-check the neighborhoods. Then use Yelp to find at least one must-try local food spot near each stadium, ideally something iconic to that city or ballpark area, and open a few of the restaurant pages so I can see that they’re real places with reviews. Finally, put everything into a CryptPad Document in one organized itinerary with each stadium listed exactly once, the chosen game and matchup, whether it includes Ohtani, Judge, Acuna, or another notable player, the travel leg before it with the cheaper mode and estimated cost, one hotel with estimated nightly price, one food pick, and running totals so I can see what this insane summer would actually cost. 
Leave the finished CryptPad Document open at the end, and if you create any comparison tabs along the way, keep the most useful ones open so I can review them.", + "website": "https://www.google.com", + "reference_length": 6, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "The final itinerary includes all 30 MLB stadiums exactly once, each paired with one official summer game date and matchup sourced from MLB.com.", + "verification": "Grader can confirm 30 unique stadium entries in the CryptPad Document and cross-check representative open MLB.com schedule/game tabs showing selected dates and matchups.", + "weight": 0.22 + }, + "R2": { + "requirement": "Selected games are prioritized for appearances by Shohei Ohtani, Aaron Judge, and Ronald Acuna Jr. where feasible, and each stop includes a star-player note or another notable player when those three are not present.", + "verification": "Grader can inspect the itinerary’s player notes and compare them against open MLB.com schedule/team pages for representative entries involving the named players.", + "weight": 0.14 + }, + "R3": { + "requirement": "The itinerary specifies a complete visit sequence across all 30 stadiums and identifies the cheaper practical travel mode between each consecutive stop using Google Flights comparisons and driving where appropriate.", + "verification": "Grader can review the ordered route in the CryptPad Document and compare representative travel legs against open Google Flights results or documented drive choices tied to the selected dates and cities.", + "weight": 0.22 + }, + "R4": { + "requirement": "Each stadium stop has one accommodation option for the corresponding game night near the stadium, with an estimated nightly price included.", + "verification": "Grader can confirm 30 lodging entries in the itinerary and inspect several open Booking.com hotel listing pages showing price and proximity/map context.", + "weight": 0.14 + }, + "R5": { + "requirement": "Each stadium 
stop includes at least one nearby must-try local food recommendation sourced from Yelp.", + "verification": "Grader can confirm 30 food entries in the itinerary and inspect several open Yelp business pages with ratings/location details near the relevant stadium areas.", + "weight": 0.1 + }, + "R6": { + "requirement": "A finished CryptPad Document is created and left open, containing the complete organized itinerary with stadium, game, player note, travel leg and cost, hotel and cost, food pick, per-stop totals, and overall total trip cost.", + "verification": "Grader can view the open CryptPad Document and verify that all required columns/fields and totals are present in one coherent document or table.", + "weight": 0.18 + } + }, + "categories": [ + "Sports > Baseball", + "Travel and Tourism > Travel and Tourism - Other" + ], + "num_categories": 2 + }, + { + "task_id": "3add0c2ffff8e0b3cacedf2e895d213735702f62", + "confirmed_task": "I’m daydreaming about a huge spring basketball trip where I see one game at every single NBA arena, and I want it planned like something I could actually follow. Please start on NBA.com and use the schedule to pick one spring home game for each of the 30 arenas, leaning toward games where I’d get to see LeBron James, Stephen Curry, or Victor Wembanyama whenever that’s realistically possible, because I’d love a few marquee-player nights mixed into the full set. As you go, open the actual game pages in tabs for the key star-player picks so I can visually confirm the matchups and dates, and keep the schedule pages open where you found the arena dates. Once you’ve got all 30 arena stops, use Google Flights and Google Maps/Travel to figure out the cheapest practical way to move from one city to the next in a sensible route, choosing between flying and driving based on cost and reasonableness so I can see whether this works better as a true road trip in clusters with flights between regions. 
After that, on Booking.com, find one hotel option for each game night that’s reasonably close to the arena and not wildly expensive, and open the actual hotel listing pages for a few representative stops so I can see the map and photos. Then use Yelp to find at least one must-try local food spot near each arena so the trip feels fun and not just logistical, and open a few of the restaurant pages so I can visually check that they’re real places near the venue. Finally, put everything into a CryptPad Document in a clean table or structured list with all 30 stops, including arena, city, selected game date, matchup, whether LeBron/Curry/Wembanyama is featured, the travel leg from the previous stop with the cheapest mode and estimated cost, one nearby hotel with estimated nightly price, one local food recommendation, and a running total estimate for the whole trip, and leave the doc open so I can review it.", + "website": "https://www.google.com", + "reference_length": 6, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "A complete set of 30 spring NBA arena stops is selected from NBA.com, with one spring home game identified for each NBA arena including arena name, city, date, and matchup.", + "verification": "Grader confirms the final itinerary contains 30 distinct NBA arenas and that NBA.com schedule pages or game pages are open or referenced for the selected spring home games.", + "weight": 0.2 + }, + "R2": { + "requirement": "The selected games are prioritized to include LeBron James, Stephen Curry, or Victor Wembanyama when available, and marquee-player game pages are opened in tabs for visual confirmation.", + "verification": "Grader checks that the itinerary marks whether LeBron, Curry, or Wembanyama is featured for each stop and that NBA.com game detail tabs are open for representative star-player selections.", + "weight": 0.15 + }, + "R3": { + "requirement": "An optimized 30-stop route is produced using Google Flights and Google Maps/Travel, with 
each leg assigned the cheapest practical travel mode between driving and flying and an estimated cost.", + "verification": "Grader verifies that each consecutive leg in the itinerary includes a travel mode and estimated cost, and that Google route or fare result pages are visibly used as evidence for representative legs.", + "weight": 0.25 + }, + "R4": { + "requirement": "One accommodation option near each arena is identified for the relevant game night on Booking.com, with estimated nightly pricing and proximity that supports attending the game.", + "verification": "Grader confirms 30 hotel entries are present and that several Booking.com listing pages with map/photo views are open as browser proof.", + "weight": 0.15 + }, + "R5": { + "requirement": "At least one must-try local food recommendation near each arena is identified on Yelp.", + "verification": "Grader checks that every stop has a food recommendation and that sample Yelp business pages are open showing location and reviews near the arena area.", + "weight": 0.1 + }, + "R6": { + "requirement": "A CryptPad Document is created and left open containing the complete integrated itinerary with all 30 stops, including game details, star-player note, travel leg, hotel, food recommendation, per-stop costs, and a summarized total estimated trip cost.", + "verification": "Grader confirms the CryptPad Document is open and includes all required columns or fields for all 30 stops plus a total cost summary.", + "weight": 0.15 + } + }, + "categories": [ + "Sports > Basketball", + "Travel and Tourism > Travel and Tourism - Other" + ], + "num_categories": 2 + }, + { + "task_id": "fc98d55986ef93480fb659db44e070c04f93301a", + "confirmed_task": "I’m daydreaming about doing a big summer baseball trip through Japan where I see exactly one game at every single NPB stadium, and I want it planned like something I could actually book, not just a rough idea. 
Please start on npb.jp and pull the full summer schedule, then identify all 12 NPB stadiums and pick one realistic summer game at each stadium so the dates can fit into one chronological trip. Once you’ve got those game dates, use Google Flights and Google Travel to figure out the cheapest practical route between the cities, mixing flights and trains or other ground transport when that saves money, because I want to keep the total cost under control without making the trip impossible. After that, go to Booking.com and find a place to stay for each game night that’s reasonably close to the stadium, ideally something like a well-rated hotel or business hotel that would be easy after a night game, and open the actual hotel listing so I can see the photos and map. Then use Google Search and Yelp to figure out the best Japanese food plan in each stadium city — I’m hoping for a real food tour feel, so look for standout ramen shops, izakayas, sushi spots, local specialties, and markets where that makes sense. Please open a couple of the most promising food spots in separate tabs for at least a few cities so I can visually compare whether they look worth it. 
Finally, put everything together in a CryptPad Document as one clean chronological itinerary with the stadium, matchup, date, city, travel leg, transport mode, hotel, food plan, and estimated costs for each stop, and leave the doc open at the end so I can review it.", + "website": "https://www.google.com", + "reference_length": 6, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "The agent identifies all 12 NPB stadiums and selects one summer game date and matchup for each stadium from npb.jp.", + "verification": "Grader can confirm that the final itinerary contains 12 distinct stadiums with valid summer dates and matchups consistent with the schedule pages viewed on npb.jp.", + "weight": 0.24 + }, + "R2": { + "requirement": "The agent creates a feasible chronological route connecting all 12 stadium cities using the cheapest practical mix of flights and ground transport.", + "verification": "Grader can confirm that each intercity leg has a stated transport mode and estimated cost, and that the route order aligns with the selected game dates without impossible overlaps.", + "weight": 0.2 + }, + "R3": { + "requirement": "The agent finds one accommodation near each stadium for the corresponding game night and uses actual Booking.com listing pages for the selections.", + "verification": "Grader can confirm 12 lodging entries with hotel names, nightly prices, and proximity rationale, and can verify from open listing pages that real hotel detail pages with photos/maps were used.", + "weight": 0.18 + }, + "R4": { + "requirement": "The agent researches notable Japanese food experiences in each stadium city, including local specialties or food districts where relevant.", + "verification": "Grader can confirm that each city has food context beyond generic dining, such as a local specialty, market, neighborhood, or cuisine angle sourced from Google research.", + "weight": 0.12 + }, + "R5": { + "requirement": "The agent builds a detailed food itinerary for each 
city with named venues such as ramen shops, izakayas, sushi spots, or markets, using Yelp pages where available.", + "verification": "Grader can confirm that each stop includes specific venue names and meal ideas, and that at least some restaurant pages were opened in separate tabs for visual comparison.", + "weight": 0.12 + }, + "R6": { + "requirement": "The agent compiles a complete integrated itinerary in CryptPad Document with games, travel, hotels, food plans, and itemized costs in chronological order.", + "verification": "Grader can confirm the CryptPad Document is open and contains all 12 stops with the required fields combined into one coherent schedule.", + "weight": 0.14 + } + }, + "categories": [ + "Sports > Baseball", + "Travel and Tourism > Travel and Tourism - Other" + ], + "num_categories": 2 + }, + { + "task_id": "56f6e52a7d77ae7831e527f64e6544b1f929904b", + "confirmed_task": "I'm starting to get serious about going on the academic job market, and I want a really thorough browser-based sweep of faculty openings at the top 50 U.S. computer science schools, not just a quick skim of a couple of job boards. Please start by opening CryptPad Sheets and create a spreadsheet called \"CS Faculty Job Search\" with columns for university, department or school, job title, area, rank, deadline, posting link, and a short verification note, because I want a clean tracker I can review later. Then use CSRankings to pull the top 50 U.S. CS schools and keep that page open as the master checklist so I can see exactly which schools were covered. From there, go school by school and actually visit the relevant CS department pages, engineering school faculty hiring pages, and university jobs or careers sites as needed to verify whether there are any tenure-track, tenured, or open-rank faculty jobs in computer science, machine learning, AI, data science, robotics, vision, NLP, or closely related areas. 
If you find a relevant opening, open the actual posting page in its own tab and read it directly rather than relying on search snippets, and if you had to use a department hiring page or university careers page to confirm it, leave that verification page open too so I can inspect the trail myself. If a school seems to have no relevant opening, I still want you to verify that as carefully as possible from the department or university hiring information before moving on, because the absence is useful too. As you go, keep the spreadsheet updated so each row corresponds to a real posting page you currently have open in a tab, with the university, department or school, title, area, rank, deadline if listed, link, and a brief note saying how you verified it. At the end, do a completeness pass against the full top-50 checklist, make sure the important tabs are still open, and add a short summary report with the main patterns you noticed and any recommendations about where the strongest cluster of relevant openings seems to be.", + "website": "https://www.google.com", + "reference_length": 7, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "A CryptPad Sheet titled \"CS Faculty Job Search\" exists and includes the required columns for university, department or school, job title, area, rank, deadline, posting link, and verification note.", + "verification": "Grader can see the spreadsheet open in the browser with the exact title and visible header row containing all required fields.", + "weight": 0.1 + }, + "R2": { + "requirement": "A credible top-50 U.S. 
computer science school list is identified from CSRankings and kept available as the master checklist.", + "verification": "Grader can see an open CSRankings tab showing the rankings source used for the top 50 schools.", + "weight": 0.12 + }, + "R3": { + "requirement": "All 50 schools from the checklist are systematically checked using department, school, and/or university hiring pages as needed.", + "verification": "Spreadsheet completion note and verification notes indicate coverage of all 50 schools, and open tabs/history show school-by-school checking from original institutional sources.", + "weight": 0.24 + }, + "R4": { + "requirement": "Every relevant faculty opening found is verified from the actual posting page and each posting is opened in its own browser tab.", + "verification": "Grader can inspect open tabs for actual posting pages and confirm the recorded links point to live original postings rather than summaries or search results.", + "weight": 0.18 + }, + "R5": { + "requirement": "When a posting or absence required additional confirmation, the corresponding department hiring page or university careers page is also kept open as verification evidence.", + "verification": "Open tabs include supporting department or careers pages for cases where the verification note says those pages were used.", + "weight": 0.1 + }, + "R6": { + "requirement": "The spreadsheet contains one row per verified posting, and each row accurately includes all required fields plus a brief note on how the posting was verified.", + "verification": "Grader can compare spreadsheet rows against open posting tabs and see complete, populated fields with matching links and verification notes.", + "weight": 0.16 + }, + "R7": { + "requirement": "A final completeness review is performed confirming coverage of all 50 schools, consistency between spreadsheet rows and open tabs, and a concise summary of schools with no verified relevant openings plus overall patterns or recommendations.", + 
"verification": "Grader can see a final summary/completion note in the spreadsheet and confirm the important tabs remain open for inspection.", + "weight": 0.1 + } + }, + "categories": [ + "Jobs and Career > Jobs and Employment", + "Science and Education > Universities and Colleges" + ], + "num_categories": 2 + }, + { + "task_id": "40735c71648c0ca0e6291d534685853acf1122c1", + "confirmed_task": "I’m trying to map out a summer concert run and I’d love your help doing it in a real browser so I can actually look at the pages afterward. Please use Google to find the official tour or ticket pages for Zedd, Bad Bunny, and TWICE, and for each artist open the real official tour page in its own tab and pull out every U.S. show happening during the summer months, with the city, venue, date, and the actual ticket or event link. I want at least 10 total concert options across the three artists, and I’d like them organized in a CryptPad Sheets spreadsheet titled Summer Concert Plan so I can compare everything in one place. Once you’ve got the concert list, look across the dates and figure out a realistic sequence where I could attend at least one show from each artist in the same summer without impossible timing. Then use Google Flights to check real flight options for each leg of that route, making sure the dates and airports line up with the concert schedule, and open the flight results you used in tabs so I can review them. After that, use Booking.com to find a hotel in each concert city for the nights I’d need, and open the actual hotel listing pages so I can see the photos, prices, and location. Then use Google Maps or Google Search to find a couple of food options near each venue or hotel, because I want this to feel like a full trip and not just a list of shows. 
Please finish the spreadsheet with the concert options plus the chosen three-concert itinerary, including the verified flights, hotels, and food picks, and keep the official artist pages and the key booking tabs open so I can compare everything visually.", + "website": "https://www.google.com", + "reference_length": 12, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "Zedd's official tour or ticket page is found and opened in a browser tab.", + "verification": "A visible tab shows Zedd's official site or official ticket page with tour/event information.", + "weight": 0.06 + }, + "R2": { + "requirement": "Qualifying U.S. summer Zedd concerts are extracted with city, venue, date, and ticket/event link.", + "verification": "Zedd entries appear in the working notes or spreadsheet with all required fields and correspond to the open official page.", + "weight": 0.08 + }, + "R3": { + "requirement": "Bad Bunny's official tour or ticket page is found and opened in a browser tab.", + "verification": "A visible tab shows Bad Bunny's official site or official ticket page with tour/event information.", + "weight": 0.06 + }, + "R4": { + "requirement": "Qualifying U.S. summer Bad Bunny concerts are extracted with city, venue, date, and ticket/event link.", + "verification": "Bad Bunny entries appear in the working notes or spreadsheet with all required fields and match the open official page.", + "weight": 0.08 + }, + "R5": { + "requirement": "TWICE's official tour or ticket page is found and opened in a browser tab.", + "verification": "A visible tab shows TWICE's official site or official ticket page with tour/event information.", + "weight": 0.06 + }, + "R6": { + "requirement": "Qualifying U.S. 
summer TWICE concerts are extracted with city, venue, date, and ticket/event link.", + "verification": "TWICE entries appear in the working notes or spreadsheet with all required fields and match the open official page.", + "weight": 0.08 + }, + "R7": { + "requirement": "A CryptPad Sheets spreadsheet titled 'Summer Concert Plan' is created with at least 10 total U.S. summer concert entries across the three artists.", + "verification": "The spreadsheet title is visible and the sheet contains at least 10 rows of concert options with artist, city, venue, date, and ticket link columns filled.", + "weight": 0.14 + }, + "R8": { + "requirement": "A feasible same-summer itinerary is selected that includes at least one concert from Zedd, Bad Bunny, and TWICE.", + "verification": "The sheet clearly marks or lists a three-concert route with one selected show per artist and no obvious date conflicts.", + "weight": 0.12 + }, + "R9": { + "requirement": "Real flight options are verified for each leg of the selected itinerary and aligned to the concert dates.", + "verification": "Google Flights results tabs are open and the sheet records airports, airlines, dates, times, and links matching the chosen route.", + "weight": 0.1 + }, + "R10": { + "requirement": "Hotels are verified for each stop in the selected itinerary using Booking.com listing pages.", + "verification": "Booking.com hotel listing tabs are open for each city and the sheet records hotel names, stay dates, and booking links.", + "weight": 0.08 + }, + "R11": { + "requirement": "Food options are included for each stop in the selected itinerary.", + "verification": "The sheet contains nearby dining choices tied to each concert city, venue, or hotel, sourced from Google Search or Maps.", + "weight": 0.05 + }, + "R12": { + "requirement": "The final sheet contains the full verified itinerary and the official artist pages plus key booking tabs remain open for visual review.", + "verification": "The completed sheet shows 
concert options and the chosen travel plan with flights, hotels, and food, while official tour pages and booking/result tabs are still visible in the browser.", + "weight": 0.09 + } + }, + "categories": [ + "Arts & Entertainment > Music", + "Ecommerce & Shopping > Tickets" + ], + "num_categories": 2 + }, + { + "task_id": "ea8661bcf1150ea65b11aa0f2041cd4b1047d90f", + "confirmed_task": "I’m trying to build a serious law school research sheet for myself, and I want to base it on a major ranking rather than random lists. Please start on U.S. News and use its law school rankings to pull the top 20 U.S. law schools, including each school’s city and state, so I have a clean shortlist to work from. Then, for each of those 20 schools, go to the school’s official law school website and open the JD admissions page in its own tab so I can visually compare the official pages later. On each admissions page, verify the current JD application deadline, whether they accept the LSAT, GRE, or both for JD applicants, and the application fee if it’s listed on the admissions or application requirements pages. After that, stay on the official school sites and look for at least one real funding opportunity per school when possible—things like merit scholarships, named scholar programs, public interest fellowships, or other law student funding—so that we end up with at least 20 official funding opportunities total across the 20 schools. When you find one, open the actual program page in a new tab and verify the eligibility requirements and the funding amount or benefit if the school gives one, because I want this based only on official pages I could click through myself. Once you’ve gathered everything, create a CryptPad Sheets spreadsheet titled Top Law Schools and Fellowships and enter one row for each school with the school name, location, application deadline, LSAT/GRE policy, application fee, and the official admissions link. 
Then add the funding opportunities in the same sheet or a clearly labeled second tab with the school name, scholarship or fellowship name, eligibility criteria, funding amount or benefit if listed, and the official program link. Please leave the key admissions tabs and several of the funding tabs open so I can spot-check them in the browser, and finish with a short written summary in the sheet or an adjacent CryptPad Document explaining which schools seem to offer the most generous funding and whether the programs you found lean more toward public interest, leadership, or academic excellence.", + "website": "https://www.google.com", + "reference_length": 10, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "A top-20 list of U.S. law schools is taken from U.S. News and includes school names and locations.", + "verification": "Grader can see U.S. News ranking page used as the source and confirm 20 schools with city/state recorded in the spreadsheet.", + "weight": 0.1 + }, + "R2": { + "requirement": "The official JD admissions page for each of the 20 schools is opened in separate tabs from the schools' official domains.", + "verification": "Browser shows multiple official admissions tabs open, and spreadsheet admissions links point to official law school pages.", + "weight": 0.1 + }, + "R3": { + "requirement": "The JD application deadline is verified and recorded for each of the 20 schools.", + "verification": "Each school row contains a deadline, and spot-checking open admissions tabs confirms the recorded dates or deadline language.", + "weight": 0.09 + }, + "R4": { + "requirement": "The LSAT/GRE policy is verified and recorded for each of the 20 schools.", + "verification": "Each school row includes whether LSAT, GRE, or both are accepted, and spot-checking official admissions pages matches the entries.", + "weight": 0.09 + }, + "R5": { + "requirement": "The JD application fee is verified and recorded for each of the 20 schools when listed on official 
pages.", + "verification": "Each school row includes an application fee or a clearly indicated official absence/unavailability, supported by official admissions or application pages.", + "weight": 0.08 + }, + "R6": { + "requirement": "At least 20 official law student funding opportunities are identified across the selected schools.", + "verification": "Spreadsheet contains 20 or more funding entries tied to official school programs, with school names and program names populated.", + "weight": 0.13 + }, + "R7": { + "requirement": "Each identified funding opportunity has an official program page opened and verified for eligibility requirements and funding amount or benefit if listed.", + "verification": "Representative funding tabs remain open, and funding rows include eligibility details plus amount/benefit information or note that no amount was listed officially.", + "weight": 0.13 + }, + "R8": { + "requirement": "A CryptPad Sheets file titled 'Top Law Schools and Fellowships' is created and populated with all 20 school admissions records.", + "verification": "Grader can see the spreadsheet title and 20 school rows containing school name, location, deadline, LSAT/GRE policy, application fee, and admissions link.", + "weight": 0.1 + }, + "R9": { + "requirement": "The spreadsheet includes at least 20 funding opportunity entries with school name, program name, eligibility, funding amount or benefit if listed, and official link.", + "verification": "Funding sheet or section contains 20 or more complete entries with clickable official links and corresponding details.", + "weight": 0.1 + }, + "R10": { + "requirement": "A final synthesis compares which schools appear most generous and categorizes program emphasis as public interest, leadership, or academic excellence.", + "verification": "A written summary is present in the spreadsheet or adjacent CryptPad Document and references patterns visible in the compiled funding data.", + "weight": 0.08 + } + }, + "categories": [ 
+ "Science and Education > Universities and Colleges", + "Law and Government > Legal" + ], + "num_categories": 2 + }, + { + "task_id": "ef766b69020befdc8e208f47401cb6bce5e9b931", + "confirmed_task": "I’m planning a pretty big house renovation in the Dallas–Fort Worth area and want to build a solid shortlist of contractors before I start calling people, so could you help me research this in the browser and keep the evidence visible? Please start in CryptPad Document and create a spreadsheet-style document titled DFW Renovation Contractors with sections for Landscapers, Plumbers, Electricians, and a Final Summary, because I want everything in one place while I compare options. Then use Google Search to find landscaping companies that clearly serve Dallas or Fort Worth, and for each one open the actual company website or business profile page in its own tab so I can inspect the details later; as you go, verify that the company really does landscaping work, note the rating or review score if one is shown, and confirm from the site or profile that the service area includes Dallas, Fort Worth, or the broader DFW area. I need at least 10 landscapers that meet those checks, and once you’ve verified them, record the company name, service type, location, rating if available, and the direct link to the company page in the document. After that, do the same thing for plumbers serving Dallas or Fort Worth, again making sure each qualifying company has its own tab left open on the actual page and that the document captures the same fields for at least 10 plumbers. Then repeat the process for electricians in the same area, with at least 10 verified entries and each company page still open in a tab so I can visually compare them afterward. 
When all three categories are filled out, do a cleanup pass in the document and across the open tabs to make sure every listed contractor still has a matching tab open, every entry really serves Dallas or Fort Worth, and every row has the required details. Finally, add a short summary explaining which companies seem to have the strongest reputation and the broadest Dallas/Fort Worth coverage based on the ratings, reviews, and service-area evidence you found, and leave the document plus the contractor tabs open so I can review everything on screen.", + "website": "https://www.google.com", + "reference_length": 9, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "A CryptPad document titled 'DFW Renovation Contractors' exists and is organized with sections for Landscapers, Plumbers, Electricians, and Final Summary, including fields for company name, service type, location, rating if available, and company-page link.", + "verification": "Grader can see the document title and the structured sections or tables with the required columns/fields visible in CryptPad Document.", + "weight": 0.08 + }, + "R2": { + "requirement": "At least 10 qualifying landscapers serving Dallas, Fort Worth, or DFW are identified from Google Search and each has an actual company website or business profile page open in its own tab.", + "verification": "Grader can inspect open tabs and confirm there are at least 10 landscaper company/profile pages corresponding to qualifying businesses with visible service and service-area evidence.", + "weight": 0.12 + }, + "R3": { + "requirement": "The document contains complete recorded details for at least 10 verified landscapers: company name, service type, location, rating if available, and direct link to the company page.", + "verification": "Grader can review the Landscapers section in the document and confirm at least 10 entries with all required fields populated and matching the open tabs.", + "weight": 0.12 + }, + "R4": { + "requirement": 
"At least 10 qualifying plumbers serving Dallas, Fort Worth, or DFW are identified from Google Search and each has an actual company website or business profile page open in its own tab.", + "verification": "Grader can inspect open tabs and confirm there are at least 10 plumber company/profile pages corresponding to qualifying businesses with visible service and service-area evidence.", + "weight": 0.12 + }, + "R5": { + "requirement": "The document contains complete recorded details for at least 10 verified plumbers: company name, service type, location, rating if available, and direct link to the company page.", + "verification": "Grader can review the Plumbers section in the document and confirm at least 10 entries with all required fields populated and matching the open tabs.", + "weight": 0.12 + }, + "R6": { + "requirement": "At least 10 qualifying electricians serving Dallas, Fort Worth, or DFW are identified from Google Search and each has an actual company website or business profile page open in its own tab.", + "verification": "Grader can inspect open tabs and confirm there are at least 10 electrician company/profile pages corresponding to qualifying businesses with visible service and service-area evidence.", + "weight": 0.12 + }, + "R7": { + "requirement": "The document contains complete recorded details for at least 10 verified electricians: company name, service type, location, rating if available, and direct link to the company page.", + "verification": "Grader can review the Electricians section in the document and confirm at least 10 entries with all required fields populated and matching the open tabs.", + "weight": 0.12 + }, + "R8": { + "requirement": "A validation pass confirms every listed contractor has a corresponding open tab, clearly serves Dallas or Fort Worth, and the final list still contains at least 10 valid entries in each service type after any replacements.", + "verification": "Grader can compare the document entries against the open 
tabs and confirm that each recorded contractor is backed by a visible page and meets the service-area requirement.", + "weight": 0.1 + }, + "R9": { + "requirement": "The Final Summary identifies which contractors appear strongest based on reputation and Dallas/Fort Worth service coverage using evidence from the verified company pages and profiles.", + "verification": "Grader can read the Final Summary section and see comparative conclusions tied to ratings/reviews and service-area coverage from the collected entries and open tabs.", + "weight": 0.1 + } + }, + "categories": [ + "Home and Garden > Home Improvement and Maintenance", + "Heavy Industry and Engineering > Construction and Maintenance" + ], + "num_categories": 2 + }, + { + "task_id": "7aab821efa9c268801d21ad8cf2ca60a82c699b3", + "confirmed_task": "I'm seriously thinking about applying to MBA programs in the U.S., and I want a solid research sheet I can actually use to decide where to apply and where funding might be strongest. Please start by using U.S. News and then Poets&Quants to build a defensible list of the top 20 full-time MBA programs in the United States, mainly so I have a realistic shortlist based on major rankings rather than just reputation. Once that list looks settled, go to each school's official MBA admissions site and open every program's admissions page in its own tab so I can visually inspect the official pages later, and for each one capture the application deadline, the GMAT/GRE or test-waiver policy, the application fee, the program length, and the admissions URL. After that, use the official business school or university financial aid pages for those same schools to find MBA-specific fellowships, scholarships, or named funding programs, and whenever you find one, open the actual official funding page in its own tab and verify the eligibility rules and the funding amount if the page lists one. 
I want at least 20 schools and at least 20 total funding opportunities across them, so if a school's main admissions page is vague, keep digging on the official school domain until you find the clearest scholarship or fellowship source. Then create a spreadsheet or document titled Top MBA Programs and Fellowships and record, for each school, the school name, MBA program name, application deadline, GMAT/GRE policy, application fee, program length, and admissions link, followed by any associated fellowship or scholarship names, eligibility criteria, funding amount if listed, and the link to the official funding page. Please keep the official admissions tabs and the official funding tabs open so I can compare them side by side afterward, and before you finish, do one pass through the open tabs to make sure every row in the sheet has matching browser proof. At the end, add a short summary telling me which schools seem to offer the biggest funding opportunities and whether the awards you found are mostly merit-based, diversity-focused, leadership-oriented, need-based, or something else, because I want a quick sense of where I should spend the most application effort.", + "website": "https://www.google.com", + "reference_length": 8, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "A defensible final list of 20 top U.S. full-time MBA programs is established using both U.S. 
News and Poets&Quants.", + "verification": "Grader can confirm that the document includes 20 schools and that the selected set is based on evidence gathered from both ranking sites.", + "weight": 0.12 + }, + "R2": { + "requirement": "Official admissions pages are opened in separate tabs for all 20 selected schools and the required admissions details are captured for each school.", + "verification": "Grader can inspect open official admissions tabs and match them to document rows containing deadline, GMAT/GRE policy, application fee, program length, and admissions URL for all 20 schools.", + "weight": 0.22 + }, + "R3": { + "requirement": "MBA-specific funding opportunities are identified on official school domains across the selected schools, with at least 20 total fellowships or scholarships found.", + "verification": "Grader can count at least 20 funding entries in the document and confirm they are tied to official school sources rather than third-party summaries.", + "weight": 0.16 + }, + "R4": { + "requirement": "Each listed fellowship or scholarship is verified on its own official page with eligibility criteria and funding amount recorded when available.", + "verification": "Grader can inspect open official funding tabs and confirm that each listed award has a matching page and includes eligibility details plus funding amount if the page provides one.", + "weight": 0.16 + }, + "R5": { + "requirement": "Coverage is complete, meaning all 20 MBA programs are documented and the total verified funding opportunities reach at least 20 after gap-filling.", + "verification": "Grader can verify final counts in the document and see that missing or unclear schools were supplemented with additional official-source research.", + "weight": 0.1 + }, + "R6": { + "requirement": "A document or spreadsheet titled 'Top MBA Programs and Fellowships' is created and organized with all required school and funding fields.", + "verification": "Grader can open the created file and confirm 
the exact title and the presence of school-level admissions data plus funding-level details and links.", + "weight": 0.1 + }, + "R7": { + "requirement": "Browser-proof is preserved by keeping official admissions and funding pages open for the documented entries.", + "verification": "Grader can inspect the browser state and confirm that official admissions tabs and official funding tabs remain open and correspond to the entries in the document.", + "weight": 0.06 + }, + "R8": { + "requirement": "The final document includes a comparative summary of the largest funding opportunities and the main funding categories observed.", + "verification": "Grader can read the concluding section and confirm it identifies schools with stronger funding and categorizes awards into patterns such as merit, diversity, leadership, need-based, or other.", + "weight": 0.08 + } + }, + "categories": [ + "Science and Education > Universities and Colleges", + "Science and Education > Business Training" + ], + "num_categories": 2 + }, + { + "task_id": "e7ae8abcf742d5ba2ef4eef88d16bbe26df978e2", + "confirmed_task": "I'm trying to get a realistic picture of the top U.S. medical schools before I go too far down the application rabbit hole, and I want this organized in a way I can actually review later. Please start by using U.S. News to pull together a top-20 list of U.S. medical schools, then cross-check that list on Times Higher Education so the schools we keep are broadly supported by major rankings rather than coming from just one source. Once that shortlist looks solid, go to each school's official MD admissions site and open the actual admissions page in its own tab so I can see the real pages later, and from those official pages verify the application deadline, whether the MCAT is required, and a brief program overview with the school name, university, and location. 
After that, stay on the official university or medical school sites and look for at least one funding opportunity, fellowship, scholarship, or special training program tied to medical students for each school where possible — things like merit scholarships, leadership fellowships, research fellowships, or service and specialized pathway programs. When you find a relevant program, open the actual program page in a new tab and verify the eligibility details and any funding amount if it's listed, because I want browser-proof tabs open to the official sources rather than just a summary. Then create a CryptPad document titled Top Medical Schools and Fellowships and fill it in with one structured entry for each of the 20 schools, including the admissions link and the associated program links, and make sure there are at least 20 total funding or special-program entries across the whole document. At the end, leave the document open and also keep a representative set of the admissions and program tabs open so I can visually spot-check them, then add a short summary explaining which schools seem to offer the strongest funding support and whether the programs you found lean more toward research, leadership, or service, along with your top recommendations for where I should focus first.", + "website": "https://www.google.com", + "reference_length": 9, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "A top-20 list of U.S. medical schools is established from U.S. News and used as the initial candidate set.", + "verification": "Visible evidence from U.S. 
News ranking pages and a recorded list of 20 candidate schools reflected in the working notes or final document.", + "weight": 0.08 + }, + "R2": { + "requirement": "The selected schools are cross-validated against Times Higher Education and a final top-20 set is chosen based on major rankings support.", + "verification": "Visible THE ranking page or search results are used to confirm overlap or support for the selected schools, and the final 20-school set is consistent in the document.", + "weight": 0.08 + }, + "R3": { + "requirement": "Official MD admissions pages are opened for all 20 selected schools in separate tabs.", + "verification": "Browser shows official university or medical school admissions tabs for the selected schools, and the document includes official admissions links for each school.", + "weight": 0.14 + }, + "R4": { + "requirement": "Application deadline and MCAT requirement are accurately verified from official sources for all 20 schools.", + "verification": "Each school entry in the document contains a deadline and MCAT requirement sourced from official admissions pages, with values matching the visible source tabs.", + "weight": 0.14 + }, + "R5": { + "requirement": "Program overview information, school name, university, and location are accurately captured for all 20 schools from official pages.", + "verification": "Each school entry includes identifying details and a concise overview that align with the official program or school overview pages.", + "weight": 0.1 + }, + "R6": { + "requirement": "Funding opportunities or special programs relevant to medical students are identified across the selected schools.", + "verification": "The document contains associated scholarships, fellowships, or special programs tied to the schools, sourced from official university pages.", + "weight": 0.1 + }, + "R7": { + "requirement": "At least 20 funding opportunities or special programs are individually verified on official program pages, including 
eligibility and funding information when listed.", + "verification": "There are at least 20 program entries with official links, and representative open tabs show program pages containing eligibility details and funding amounts where available.", + "weight": 0.14 + }, + "R8": { + "requirement": "A CryptPad Document titled 'Top Medical Schools and Fellowships' is created and contains complete structured entries for all 20 schools and their associated programs.", + "verification": "The CryptPad Document title matches exactly, the document is open in the browser, and it includes 20 school records with admissions and program details plus links.", + "weight": 0.14 + }, + "R9": { + "requirement": "The final document includes a synthesis comparing which schools appear to provide the most funding support and whether programs emphasize research, leadership, or service, while confirming the minimum counts and leaving proof tabs open.", + "verification": "The bottom of the CryptPad Document contains a written summary and count confirmation, and the browser still shows the document plus representative admissions and program tabs open.", + "weight": 0.08 + } + }, + "categories": [ + "Science and Education > Universities and Colleges", + "Health > Medicine" + ], + "num_categories": 2 + }, + { + "task_id": "feba3355ecae6838d521294bbca8e50cf99f0a53", + "confirmed_task": "I'm getting serious about applying to computer science PhD programs in machine learning and AI, and I want a realistic shortlist of schools plus specific professors who might actually be good advisor matches. Please start on CSRankings and use it to pull together the top 25 U.S. computer science PhD programs as the base list, because I want something credible and standardized rather than a random blog ranking. 
Then open CryptPad Sheets and create a spreadsheet called \"ML PhD Programs and Advisors\" with columns for university, professor name, research area, whether they appear to be accepting students, the exact evidence or wording you found, professor page link, and lab or research group link if there is one. After that, go school by school through the CS department faculty directories for those top programs and look specifically for faculty working in machine learning, artificial intelligence, data science, NLP, robotics, or computer vision, since those are the areas I'm most likely to apply in. For each promising professor, open the actual faculty profile or personal website in a new tab, and if they have a lab page open that too in another tab so I can visually inspect the pages later; I especially want you to look for signs like \"accepting PhD students,\" \"recruiting,\" application guidance for prospective students, active lab rosters, recent projects, or anything else that suggests they are actively supervising graduate students right now. Please record at least 20 professors across the top schools, but try to spread them across the list instead of clustering everything at just MIT, Stanford, Berkeley, and CMU, because I want a broad application strategy. As you go, keep the relevant professor and lab tabs open so I can review the evidence on screen, and if a professor does not explicitly say they are accepting students, note that clearly rather than guessing. Once the sheet looks complete, do a consistency pass to make sure the links work, the research areas match what is actually on the pages, and the accepting-students status is backed by visible evidence. 
Then add a short summary section in the sheet or a companion doc explaining which universities seem to have the biggest clusters of ML/AI faculty who appear open to advising new PhD students, so I can see where my odds and fit might be strongest.", + "website": "https://www.google.com", + "reference_length": 8, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "A credible top-25 list of U.S. computer science PhD programs is identified on CSRankings and used as the basis for the research.", + "verification": "Grader can see CSRankings open with the ranking view and a corresponding list of 25 U.S. universities reflected in the working materials.", + "weight": 0.12 + }, + "R2": { + "requirement": "A CryptPad Sheets titled \"ML PhD Programs and Advisors\" is created with the required columns: university, professor name, research area, accepting students status, evidence text, professor page link, and lab or research group link.", + "verification": "Grader can see the spreadsheet title and header row visible in CryptPad Sheets.", + "weight": 0.1 + }, + "R3": { + "requirement": "Faculty directories are opened for the target universities and ML/AI-aligned faculty are identified from those directories.", + "verification": "Grader can see multiple university CS faculty directory tabs open and relevant faculty entries visible on those pages.", + "weight": 0.13 + }, + "R4": { + "requirement": "Professor personal pages, faculty profiles, or lab pages are opened in separate tabs and used to verify both research alignment and evidence of active advising or student recruitment.", + "verification": "Grader can see professor and lab tabs open with visible text such as research topics, lab information, or statements about accepting or recruiting students.", + "weight": 0.18 + }, + "R5": { + "requirement": "The spreadsheet contains at least 20 professor entries across the top programs, each with all required fields completed.", + "verification": "Grader can count at least 
20 filled rows in the sheet and confirm each row includes university, professor, research area, accepting status, evidence, and links.", + "weight": 0.2 + }, + "R6": { + "requirement": "The collected set shows broad coverage across the top-25 schools rather than being concentrated in only a small number of universities, and relevant professor or lab pages remain open in tabs.", + "verification": "Grader can see entries spanning a meaningful range of universities in the sheet and multiple supporting tabs still open for inspection.", + "weight": 0.1 + }, + "R7": { + "requirement": "The recorded data is internally consistent and links, research areas, and accepting-status entries are validated against the source pages.", + "verification": "Spot checks of several rows against the open tabs show matching research areas, working links, and accepting-status claims supported by visible evidence text.", + "weight": 0.09 + }, + "R8": { + "requirement": "A final summary identifies which universities appear to have the largest clusters of ML/AI faculty accepting PhD students, along with key patterns or recommendations.", + "verification": "Grader can see a summary section in the sheet or companion doc that names universities, describes cluster strength, and provides concise takeaways.", + "weight": 0.08 + } + }, + "categories": [ + "Science and Education > Universities and Colleges", + "Computers Electronics and Technology > Programming and Developer Software" + ], + "num_categories": 2 + }, + { + "task_id": "d8061d694d7a4276f12e8f15c5d3029ab084e7d1", + "confirmed_task": "I’m helping the same couple plan two separate weddings and I want something I can actually review in the browser afterward. First, use Google to find official venue pages for Napa Valley wedding venues that could realistically handle about 200 guests for a fall wedding, and please open each serious option in its own tab so I can compare them side by side later. 
I need about 10 Napa venues, and for each one please verify from the actual venue site the venue name, where it is in the Napa Valley area, the stated maximum wedding capacity or closest guest-count language you can find, plus a short description of what kind of place it is. Once you’ve verified those, go to CryptPad Sheets and create a spreadsheet called Napa Wedding Venues with columns for venue name, location, maximum guest capacity, venue description, and link to venue page, then fill it in with the Napa venues you confirmed, making sure the rows match the tabs you’ve kept open. After that, switch to Seoul and use Google to find wedding venues or wedding halls specifically in Gangnam-gu, opening each official site or the most authoritative live venue page you can find in its own tab so I can visually review the listings. I’d like around 10 Gangnam options too, and for each one please confirm the Gangnam location, the wedding capacity or at least an approximate size if that’s all the page gives, and what type of venue it is, like hotel, wedding hall, banquet hall, or something similar. 
Keep those Gangnam tabs open as well, and then give me a short comparison report that summarizes the 10 Seoul options by name, location, approximate size or capacity, and venue type, because I’m trying to see how the Napa and Gangnam venue pools compare before we narrow anything down.", + "website": "https://www.google.com", + "reference_length": 8, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "About 10 Napa Valley wedding venues suitable for around 200 guests are identified via Google and each venue page is opened in its own browser tab.", + "verification": "Grader can see multiple Napa venue tabs open from search results and confirm the pages correspond to distinct venue sites relevant to Napa Valley weddings.", + "weight": 0.14 + }, + "R2": { + "requirement": "Each Napa venue is verified from its official page for venue name, location, guest capacity or closest stated guest-count language, brief description, and source suitability.", + "verification": "Open Napa tabs visibly show venue details or event/wedding information that supports the extracted fields.", + "weight": 0.16 + }, + "R3": { + "requirement": "A CryptPad Sheets spreadsheet titled 'Napa Wedding Venues' is created with the required columns: venue name, location, maximum guest capacity, venue description, and link to venue page.", + "verification": "Grader can see the spreadsheet title and header row in CryptPad Sheets with the exact required columns present.", + "weight": 0.1 + }, + "R4": { + "requirement": "The Napa spreadsheet is populated with around 10 verified venue rows, and the entries correspond to the open Napa venue tabs.", + "verification": "Spreadsheet contains about 10 filled rows and the names/links align with the open Napa venue pages.", + "weight": 0.16 + }, + "R5": { + "requirement": "About 10 Gangnam wedding venues or wedding halls are identified and each official or authoritative live venue page is opened in its own browser tab.", + "verification": "Grader can see 
multiple Gangnam venue tabs open and confirm they are distinct venues relevant to weddings in Gangnam-gu.", + "weight": 0.12 + }, + "R6": { + "requirement": "Each Gangnam venue is verified for location, capacity or approximate size, and venue type from the opened pages.", + "verification": "Open Gangnam tabs visibly support the extracted location, size/capacity, and type fields, even if some capacities are approximate.", + "weight": 0.14 + }, + "R7": { + "requirement": "The Gangnam venue set is cross-checked to ensure around 10 distinct suitable venues with sufficient comparison data, and the tabs remain open for review.", + "verification": "Final open-tab set shows distinct Gangnam venues with enough visible information to compare, without obvious duplicates.", + "weight": 0.08 + }, + "R8": { + "requirement": "A short comparison report is produced for the Gangnam venues covering name, location, approximate size or capacity, and venue type.", + "verification": "Final output includes a concise Seoul/Gangnam comparison summary with the required fields for the identified venues.", + "weight": 0.1 + } + }, + "categories": [ + "Community and Society > Community and Society - Other", + "Travel and Tourism > Tourist Attractions", + "Lifestyle > Weddings" + ], + "num_categories": 3 + }, + { + "task_id": "a930fe364d5950d8cdcb74fe1316ecebb7d63009", + "confirmed_task": "I’m trying to get a genuinely rigorous picture of the current market for AI and machine learning postdoc roles, because I want a shortlist I could actually use for applications instead of a shallow search result dump. Please start in CryptPad Sheets and create a spreadsheet named AI Postdoc Opportunities with columns for university or lab, department or institute or PI, exact position title, research area, deadline or start date if shown, posting link, and a short note saying how you verified it. Then use Google to build a coverage list of roughly the top 30 U.S. 
computer science schools along with major AI labs and related institutes, and work through that list carefully using official university job boards, department hiring pages, institute sites, lab pages, faculty group pages, and central hiring portals. As you find anything promising, open the actual posting page in its own tab and read enough of it to confirm it is really a current postdoctoral research opening in AI, machine learning, or a closely related area, not a faculty search, PhD opening, internship, or industry research scientist role. If you need a second official page like a lab hiring page or institute announcement to confirm it, open that too and leave both tabs available so I can inspect the evidence myself. For schools or labs where nothing turns up, please do a second pass across multiple official sources so the absence is at least reasonably verified. As you go, fill the spreadsheet with every verified opening and make sure each row corresponds to a posting tab you still have open. Near the end, keep the spreadsheet visible along with several of the posting tabs and any supporting hiring pages, because I want visual proof of the search trail, and then add a short summary of the main patterns you noticed and which opportunities look strongest or most relevant.", + "website": "https://www.google.com", + "reference_length": 7, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "A spreadsheet titled 'AI Postdoc Opportunities' is created in CryptPad Sheets with the requested columns for capturing openings and verification details.", + "verification": "Grader can see an open spreadsheet with the exact title and visible headers for university/lab, department/institute/PI, position title, research area, deadline/start date, posting link, and verification note.", + "weight": 0.1 + }, + "R2": { + "requirement": "A broad coverage checklist is assembled that includes roughly the top 30 U.S. 
computer science schools plus major AI labs and related institutes.", + "verification": "Grader can see the compiled institution list in the working materials or spreadsheet and confirm it spans top CS universities and major AI labs/institutes rather than only a handful of examples.", + "weight": 0.14 + }, + "R3": { + "requirement": "The search is carried out systematically across the checklist using official departmental, institute, lab, faculty, and university hiring sources.", + "verification": "Browser history/tabs show repeated navigation from Google to official university or lab domains for multiple institutions, demonstrating a broad audit rather than a few isolated searches.", + "weight": 0.18 + }, + "R4": { + "requirement": "Each included opportunity is verified on the actual posting page as a current postdoctoral research position relevant to AI/ML, with non-postdoc or irrelevant roles excluded.", + "verification": "Open tabs show live posting pages whose titles and text indicate postdoctoral roles; supporting tabs are open where needed to confirm relevance or status.", + "weight": 0.22 + }, + "R5": { + "requirement": "Every verified opening is entered in the spreadsheet with all requested fields completed as available, including a direct link and a verification note.", + "verification": "Spreadsheet rows contain populated fields for each verified role, and the posting links correspond to the open tabs used for verification.", + "weight": 0.18 + }, + "R6": { + "requirement": "Institutions with no identified openings receive an additional verification pass using multiple official sources to support the absence conclusion.", + "verification": "For at least several no-opening institutions, tabs or notes show checks across more than one official source such as department, institute, lab, or central jobs pages.", + "weight": 0.1 + }, + "R7": { + "requirement": "The final workspace remains visually inspectable, with the spreadsheet visible, important 
posting/supporting tabs left open, and a short summary of patterns and recommendations added.", + "verification": "Grader can see the spreadsheet open, multiple source tabs still present, and a visible summary section or note capturing takeaways and strongest opportunities.", + "weight": 0.08 + } + }, + "categories": [ + "Jobs and Career > Jobs and Employment", + "Science and Education > Science and Education - Other" + ], + "num_categories": 2 + }, + { + "task_id": "8464610469e9466cd87449671df0c4e761fa7434", + "confirmed_task": "I’m daydreaming about doing a full summer KBO baseball trip through South Korea and I want to make it feel like a real, bookable plan instead of just a rough idea. Please start on koreabaseball.com and pull the current KBO schedule, then identify all 10 active KBO stadiums and choose one actual summer game at each stadium, ideally in a route that won’t make me zigzag all over the country. Once you have those 10 game dates and matchups, use Google Flights and Google Maps or Google Travel to figure out the cheapest practical way to move between each stop, whether that means flights, trains, buses, or driving, because I want the route to be efficient and budget-conscious. After that, go to Booking.com and find one solid place to stay near each stadium for the corresponding game night, aiming for convenient locations and reasonable prices rather than luxury. Then use Google Search to research what each city is especially known for eating, and use Yelp to turn that into a real food plan for every stop with specific restaurants, markets, or street-food areas I could actually visit around the game. As you do this, please open the actual hotel listings in their own tabs so I can compare photos and map locations, and for at least a couple of the food stops, open the real listing pages so I can visually verify they look active and worth visiting. 
When it all comes together, put everything into a CryptPad Document with the stadium, city, game date and matchup, travel leg, lodging, food plan, and estimated costs for each stop, and leave the finished doc open so I can review it.", + "website": "https://www.google.com", + "reference_length": 6, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "Retrieve the current KBO schedule and identify 10 distinct KBO stadiums, selecting one real summer game date and matchup for each.", + "verification": "Grader can confirm the chosen games correspond to visible schedule information on koreabaseball.com and that all 10 stadiums are distinct and summer-dated.", + "weight": 0.22 + }, + "R2": { + "requirement": "Create a complete ordered route covering all 10 selected stadium stops with the cheapest practical transport choice for each leg.", + "verification": "Grader can confirm each travel leg connects consecutive selected cities from the itinerary and includes a transport mode with estimated cost/time derived from Google tools.", + "weight": 0.2 + }, + "R3": { + "requirement": "Find one accommodation option near each stadium for the corresponding game night and visually verify the chosen listings by opening the actual hotel pages.", + "verification": "Grader can confirm 10 lodging selections exist, align with the itinerary dates/cities, and that Booking.com listing pages or tabs show real hotel details, photos, and map context.", + "weight": 0.18 + }, + "R4": { + "requirement": "Research city-specific Korean food specialties or notable food areas for each stadium city.", + "verification": "Grader can confirm each city has at least one locally relevant dish, market, food street, or culinary specialty sourced from Google research rather than generic cuisine labels.", + "weight": 0.12 + }, + "R5": { + "requirement": "Build a detailed food itinerary for each city with named restaurants, markets, and/or street-food stops that fit the trip schedule, including visible 
verification on at least some actual Yelp listing pages.", + "verification": "Grader can confirm each city has concrete food stops tied to the itinerary and that at least a couple of Yelp business pages were opened and appear active.", + "weight": 0.13 + }, + "R6": { + "requirement": "Compile a coherent final itinerary in CryptPad Document that integrates games, stadiums, travel, accommodations, food plans, and estimated costs into one document.", + "verification": "Grader can confirm the CryptPad Document contains all 10 stops in order with the required fields and remains open at the end for review.", + "weight": 0.15 + } + }, + "categories": [ + "Sports > Baseball", + "Travel and Tourism > Travel and Tourism - Other" + ], + "num_categories": 2 + }, + { + "task_id": "8f005e9f09101dd540f1f666063483931e8faa59", + "confirmed_task": "I’m helping a family get settled in Raleigh, North Carolina, and I want to line up both healthcare and childcare in one pass so they have real options to review on screen. Please start with Google and figure out three major health insurance plans that are actually relevant in Raleigh, then use the official insurer sites for Aetna, Blue Cross NC, and UnitedHealthcare to open each plan’s consumer overview page in its own tab and also open the matching provider directory or doctor search page in its own tab, because I want to be able to look at both the plan details and the network search later. Once those networks are confirmed, use the insurer directories and actual doctor profile pages to find at least 10 pediatricians in Raleigh who accept one or more of those plans; for every pediatrician you include, open the real profile or listing page in its own tab and verify both the accepted insurance and the clinic location so I can inspect the pages myself. 
After that, switch to childcare and use Google to find around 10 daycare centers in Raleigh, then open the actual daycare page, official site, or a reputable listing page for each one in its own tab and verify the age range served and the location, because I need to compare realistic options for a family with young kids. Please keep all the pediatrician and daycare tabs open as proof, and at the end give me a structured report with three sections for the insurance plans, the pediatricians who take those plans, and the daycare options, plus a short judgment on which insurance plan seems to have the biggest pediatrician network in Raleigh and which daycare centers look the most highly rated.", + "website": "https://www.google.com", + "reference_length": 9, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "Three major insurers relevant to Raleigh are identified and official overview pages are selected for Aetna, Blue Cross NC, and UnitedHealthcare.", + "verification": "Grader can see official insurer overview tabs open for all three carriers and the report names the three plans/insurers.", + "weight": 0.12 + }, + "R2": { + "requirement": "Aetna overview and provider directory are both opened and directory access for Raleigh pediatricians is verified.", + "verification": "Aetna overview tab and Aetna provider search/directory tab are visibly open, with the directory showing a pediatrician search context for Raleigh or equivalent.", + "weight": 0.08 + }, + "R3": { + "requirement": "Blue Cross NC overview and provider directory are both opened and directory access for Raleigh pediatricians is verified.", + "verification": "Blue Cross NC overview tab and provider search/directory tab are visibly open, with pediatrician search capability shown for Raleigh or equivalent.", + "weight": 0.08 + }, + "R4": { + "requirement": "UnitedHealthcare overview and provider directory are both opened and directory access for Raleigh pediatricians is verified.", + 
"verification": "UnitedHealthcare overview tab and provider search/directory tab are visibly open, with pediatrician search capability shown for Raleigh or equivalent.", + "weight": 0.08 + }, + "R5": { + "requirement": "At least 10 pediatricians in Raleigh who accept one or more of the selected plans are identified and verified.", + "verification": "Final report lists at least 10 pediatricians with accepted insurance and clinic location, and the open tabs show matching doctor profile or listing pages.", + "weight": 0.24 + }, + "R6": { + "requirement": "Every listed pediatrician corresponds to an open tab showing an actual doctor profile or listing page.", + "verification": "Grader can count open pediatrician tabs and match them to the doctors named in the report.", + "weight": 0.1 + }, + "R7": { + "requirement": "Around 10 Raleigh daycare centers are identified with verified location and age range served.", + "verification": "Final report lists about 10 daycare centers, each with location and age range, supported by visible daycare listing or official tabs.", + "weight": 0.16 + }, + "R8": { + "requirement": "Every listed daycare corresponds to an open tab showing the actual daycare page or reputable listing page.", + "verification": "Grader can match each daycare named in the report to an open tab on Care.com, Winnie, or an official site.", + "weight": 0.06 + }, + "R9": { + "requirement": "A final structured report is produced with the three required sections and a brief comparative summary of pediatrician network size and daycare ratings.", + "verification": "Report includes sections for insurance plans, pediatricians, and daycare options, plus a concise conclusion naming the plan with the broadest apparent pediatrician network and the daycare centers that seem most highly rated.", + "weight": 0.08 + } + }, + "categories": [ + "Health > Health - Other", + "Science and Education > Education", + "Lifestyle > Childcare" + ], + "num_categories": 3 + }, + { + 
"task_id": "b421f308e18fc92b84ed676609e62a2b536b28b7", + "confirmed_task": "I'm flying from Pittsburgh to a wedding in Palm Springs on the 3rd of next month and I need help putting together a full trip plan in the browser so I can actually see everything. Please start on Google Flights and search for round-trip flights from Pittsburgh to LAX, picking dates so I land at least 2 days before the wedding and get back to Pittsburgh by the 5th, and prioritize non-stop options if they exist — open the best result in its own tab so I can review it. Then do the same search but for Pittsburgh to Palm Springs International Airport (PSP) instead, because flying directly into Palm Springs might save me the drive entirely, and open that result in its own tab too so I can compare the two side by side. Once you've got both flight options, pull up Google Maps and check the drive time from LAX to Palm Springs, because I only want to drive between 9am and 4pm — so if the LAX flight lands too late to make that window, either adjust the flight date or find me a hotel near LAX on Booking.com for an overnight stay before driving, and leave the hotel page open so I can see the price and location. If PSP ends up being the better option and skips the drive issue entirely, flag that clearly. After that, search for car rental options at whichever airport makes more sense for the dates I'd need, and open at least one rental listing so I can see the vehicle type and daily rate. Then check whether I could squeeze in a stop at either Soban or Holbox on any of the drives between the airport and Palm Springs — look up both on Google Maps, see how far each detour would add, and recommend which one is actually worth it given the 9am–4pm driving constraint. 
Finally, open CryptPad and create a new document where you lay out the full day-by-day itinerary covering the chosen flights, the drive or lack thereof, hotel if needed, car rental, the wedding on the 3rd, and the recommended detour stop, and leave the CryptPad doc open so I can edit it later.", + "website": "https://www.google.com", + "reference_length": 8, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "Search Google Flights for round-trip non-stop flights from Pittsburgh to LAX, arriving at least 2 days before the 3rd and returning by the 5th, and open the best result in its own tab.", + "verification": "Grader can confirm a Google Flights tab is open showing Pittsburgh to LAX results with correct dates, and the selected option is visible with airline, times, and price.", + "weight": 0.15 + }, + "R2": { + "requirement": "Search Google Flights for round-trip flights from Pittsburgh to Palm Springs International Airport (PSP) for the same date constraints, and open the best result in its own tab for comparison.", + "verification": "Grader can confirm a Google Flights tab is open showing Pittsburgh to PSP results with correct dates, and the selected option is visible with airline, times, and price.", + "weight": 0.15 + }, + "R3": { + "requirement": "Look up the drive time from LAX to Palm Springs on Google Maps and assess whether the LAX flight's landing time allows driving within the 9am–4pm window. 
If not, either adjust the flight or find a hotel near LAX on Booking.com with the page left open.", + "verification": "Grader can confirm a Google Maps lookup was performed, the driving window constraint is addressed, and if a hotel is needed, a Booking.com property page is open with price and dates.", + "weight": 0.12 + }, + "R4": { + "requirement": "Explicitly compare the LAX vs PSP flight options and flag which airport makes more sense given price, convenience, and the driving constraint.", + "verification": "Final response clearly states the trade-offs between LAX and PSP and names the recommended airport with reasoning.", + "weight": 0.1 + }, + "R5": { + "requirement": "Find car rental options at the recommended airport for the trip dates, and open at least one rental listing showing vehicle type and daily rate.", + "verification": "Grader can confirm a car rental search was performed and at least one concrete option is visible with provider, vehicle type, and price.", + "weight": 0.1 + }, + "R6": { + "requirement": "Look up both Soban and Holbox on Google Maps, assess detour feasibility on the drives between the airport and Palm Springs within the 9am–4pm window, and recommend one.", + "verification": "Grader can confirm both locations were searched, detour distances/times are reported, and a clear recommendation is made with reasoning tied to the driving constraint.", + "weight": 0.13 + }, + "R7": { + "requirement": "Create a CryptPad document containing the full day-by-day itinerary covering flights, driving, hotel if needed, car rental, the wedding, and the recommended detour, and leave the document open.", + "verification": "Grader can confirm a CryptPad document is open with a structured itinerary that includes all required components.", + "weight": 0.15 + }, + "R8": { + "requirement": "Provide a concise final summary naming the chosen flights, airport, car rental, hotel if applicable, and detour recommendation.", + "verification": "Grader can confirm 
the final response integrates all components into a coherent trip plan that respects the 2-day-early arrival, return by the 5th, and 9am–4pm driving constraints.", + "weight": 0.1 + } + }, + "categories": [ + "Travel and Tourism > Air Travel", + "Travel and Tourism > Accommodation and Hotels", + "Travel and Tourism > Car Rentals" + ], + "num_categories": 3 + }, + { + "task_id": "1fd26abb3743ca1dfdc648af0fcab2c3a2def6e9", + "confirmed_task": "I’m moving from Pittsburgh to San Francisco and want to get a realistic side-by-side view of my options before I decide whether to hire movers, use a container, or just drive a truck myself. Please start on MovingAPT.com and get me a long-distance estimate for a 1-bedroom apartment move from Pittsburgh, PA to San Francisco, CA, and keep the quote page or results open so I can look at what assumptions they used. Then do the same on International Van Lines for the same 1-bedroom move, because I want at least two full-service mover quotes to compare. After that, check PODS for a container option that would make sense for a 1-bedroom apartment on that same route, and then check U-Pack for the equivalent portable moving setup, making note of whether they’re pricing by container count, trailer space, delivery, or monthly rental. Once those are open, go to U-Haul and price out a one-way 15-foot truck from Pittsburgh to San Francisco, then on U-Haul’s site find the MPG or fuel economy info for that truck so we can estimate the real driving cost. Use Google Maps to pull up the driving route from Pittsburgh, PA to San Francisco, CA and record the mileage, and leave the map visible so I can sanity-check the route distance on screen. I also have State Farm renters insurance, so please look on State Farm’s site to see whether my belongings are covered while they’re in transit during a move or whether I’d probably need separate moving coverage or valuation. 
After that, check Trustpilot for MovingAPT, International Van Lines, PODS, U-Pack, and U-Haul, and open each company’s Trustpilot page in its own tab so I can visually compare the ratings and review counts. In the end, pull everything together into one comparison with the estimated total cost for each option, and for U-Haul please calculate the truck rental plus estimated fuel cost using the route mileage and the truck MPG, so I can see which option is cheapest and which seems safest.", + "website": "https://www.google.com", + "reference_length": 10, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "A MovingAPT quote or estimate for a 1-bedroom move from Pittsburgh, PA to San Francisco, CA is obtained, with price and visible assumptions or included services captured.", + "verification": "Grader can confirm a MovingAPT quote/results page is open or was visited, and the final notes include a price plus assumptions/services shown on that page.", + "weight": 0.1 + }, + "R2": { + "requirement": "An International Van Lines quote or estimate for a 1-bedroom move from Pittsburgh, PA to San Francisco, CA is obtained, with price and visible assumptions or included services captured.", + "verification": "Grader can confirm an International Van Lines quote/results page is open or was visited, and the final notes include a price plus assumptions/services shown on that page.", + "weight": 0.1 + }, + "R3": { + "requirement": "PODS pricing for the route is captured with the major fee structure or assumptions visible on the pricing page.", + "verification": "Grader can confirm a PODS pricing page is open or was visited, and the response includes the estimated total plus details such as container size, delivery, transport, storage, or rental assumptions.", + "weight": 0.09 + }, + "R4": { + "requirement": "U-Pack pricing for the route is captured with the major fee structure or assumptions visible on the quote page.", + "verification": "Grader can confirm a U-Pack quote 
page is open or was visited, and the response includes the estimated total plus details such as trailer footage, cube count, transit, or related assumptions.", + "weight": 0.09 + }, + "R5": { + "requirement": "A one-way U-Haul 15-foot truck rental estimate for Pittsburgh to San Francisco is found and recorded with visible pricing details.", + "verification": "Grader can confirm a U-Haul estimate page is open or was visited, and the response includes the 15-foot truck estimate with base rental and visible fees or truck details.", + "weight": 0.1 + }, + "R6": { + "requirement": "The U-Haul 15-foot truck MPG or fuel economy figure used for fuel estimation is correctly captured from U-Haul’s site.", + "verification": "Grader can confirm the U-Haul truck info page or specification page was visited, and the response includes the MPG/fuel economy figure tied to the 15-foot truck.", + "weight": 0.07 + }, + "R7": { + "requirement": "The Pittsburgh to San Francisco driving distance is obtained from Google Maps and recorded for the fuel calculation.", + "verification": "Grader can confirm a Google Maps route is visible or was visited, and the response includes the route mileage used in the calculation.", + "weight": 0.07 + }, + "R8": { + "requirement": "State Farm renters insurance coverage during a move is researched and summarized accurately, including whether separate moving coverage or valuation may be needed.", + "verification": "Grader can confirm State Farm pages were visited, and the response includes a coverage conclusion plus caveats or limitations about property in transit.", + "weight": 0.1 + }, + "R9": { + "requirement": "Trustpilot review information is collected for MovingAPT, International Van Lines, PODS, U-Pack, and U-Haul, with pages opened in separate tabs for visual comparison.", + "verification": "Grader can confirm Trustpilot pages for all five providers were visited or left open in tabs, and the response includes each provider’s rating and review count 
or clear review sentiment.", + "weight": 0.12 + }, + "R10": { + "requirement": "A complete final comparison is produced covering all five moving options, including estimated total costs, U-Haul total with calculated fuel estimate, Trustpilot review data, and State Farm insurance findings.", + "verification": "Grader can confirm the final output includes MovingAPT, International Van Lines, PODS, U-Pack, and U-Haul in one comparison, with U-Haul total derived from rental plus fuel and with review and insurance context included.", + "weight": 0.16 + } + }, + "categories": [ + "Business and Consumer Services > Moving & Relocation" + ], + "num_categories": 1 + }, + { + "task_id": "e53065fe786881377e88667a80ccc2edcb321320", + "confirmed_task": "I’m trying to help my 16-year-old figure out a good summer pre-college option, and I want to do a pretty careful browser-based search across top U.S. universities rather than just rely on a generic list. Please start with Northeast schools first on Google and check places like Harvard, Yale, Princeton, Columbia, Penn, Cornell, Brown, Dartmouth, and MIT for official pre-college or summer programs for high school students. I only want programs that are actually in person on campus, are meant for high school students around age 16, let students take real college-level classes or courses taught by university instructors, and run for less than about 8 weeks total, because I’m trying to find something academically serious but not too long. As you find anything that looks promising, open the actual official program page in its own tab and read it closely to verify those details, then open the official application or admissions page in another tab so I can see what applying would really involve. 
While you do that, create a spreadsheet called Pre-College Summer Programs in CryptPad Sheets and log every verified match with the university name, program name, location, length, whether it offers college credit or clearly college-level classes, the application deadline if it’s listed, the program-page link, and the application-page link. If you check a school and it doesn’t seem to have a qualifying option, add a quick note for that too so we know it was reviewed. After you’ve covered the Northeast, expand to a few other strong top-30 schools like Stanford, UChicago, Duke, Northwestern, Rice, Vanderbilt, WashU, UCLA, or Berkeley and apply the same standards. Please keep the matching program tabs and their application tabs open so I can visually inspect them myself afterward, and when you’re done give me a short summary that highlights the strongest Northeast fits first, with a few especially good non-Northeast options as backups.", + "website": "https://www.google.com", + "reference_length": 8, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "A spreadsheet titled 'Pre-College Summer Programs' is created and used to track both verified matches and schools checked that did not qualify.", + "verification": "Grader can see a spreadsheet with that exact title open in CryptPad Sheets containing entries for matches and non-match notes.", + "weight": 0.1 + }, + "R2": { + "requirement": "The search is systematic across top-30 universities with clear emphasis on Northeast schools before expanding to a few strong non-Northeast options.", + "verification": "Browser history/tabs and spreadsheet entries show Northeast universities were searched first, followed by a smaller set of non-Northeast top universities.", + "weight": 0.12 + }, + "R3": { + "requirement": "Northeast programs included as matches are verified on official university pages as in-person, intended for high school students around age 16, offering real college-level classes or courses taught by 
university instructors, and shorter than about 8 weeks.", + "verification": "Open official program tabs and spreadsheet notes visibly support each required criterion for the Northeast matches.", + "weight": 0.2 + }, + "R4": { + "requirement": "Each qualifying Northeast program has its official application or admissions page opened in a tab and documented with application link and deadline if listed.", + "verification": "For each Northeast match, a corresponding application/admissions tab is open and the spreadsheet contains the application URL plus any visible deadline.", + "weight": 0.12 + }, + "R5": { + "requirement": "A few strong non-Northeast top-30 universities are also searched for matching pre-college programs.", + "verification": "Tabs and spreadsheet entries show searches and checks for several named non-Northeast universities such as Stanford, UChicago, Duke, Northwestern, Rice, Vanderbilt, WashU, UCLA, or Berkeley.", + "weight": 0.08 + }, + "R6": { + "requirement": "Non-Northeast programs included as matches are verified against the same criteria and documented, with official application/admissions pages opened and linked.", + "verification": "Open official tabs and spreadsheet entries for non-Northeast matches show the same eligibility, format, academic rigor, duration, and application details.", + "weight": 0.16 + }, + "R7": { + "requirement": "The spreadsheet contains complete entries for every verified program and quick notes for checked schools that did not fit, and the corresponding program and application tabs remain open for visual inspection.", + "verification": "Spreadsheet rows include university name, program name, location, length, college credit or college-level class info, application deadline if listed, program-page link, and application-page link, while matching browser tabs are still open.", + "weight": 0.14 + }, + "R8": { + "requirement": "A concise final summary highlights the strongest Northeast options first and includes the best 
non-Northeast backups, consistent with the spreadsheet and open tabs.", + "verification": "Final response prioritizes Northeast fits, references only programs documented in the spreadsheet, and matches the visible open tabs.", + "weight": 0.08 + } + }, + "categories": [ + "Science and Education > Education", + "Science and Education > Universities and Colleges" + ], + "num_categories": 2 + }, + { + "task_id": "3bfacd06345631511177bf106f9040300e5875da", + "confirmed_task": "I’m helping someone look into treatment options for lung cancer in the U.S., and I want a solid browser-based shortlist I can actually review myself afterward. Please go to ClinicalTrials.gov and search for interventional lung cancer studies in the United States that are currently recruiting, then use the site filters so we’re only looking at active recruiting trials that are really relevant. As you find good candidates, open the official ClinicalTrials.gov record for each one in its own tab and keep those tabs open so I can compare them later. I need at least 15 distinct trials, and for each one please verify on the actual trial page that it’s recruiting, note the study phase, identify the treatment or intervention type, and capture the U.S. locations where it’s available. Once you’ve gathered the set, create a CryptPad Documents file titled “Lung Cancer Clinical Trials” and record one entry per trial with the trial name, treatment type, trial phase, recruiting status, locations, and the official ClinicalTrials.gov link. After that, add a short summary telling me which treatment approaches seem to come up most often and which cities or hospitals show up most frequently across the location lists. 
Before you finish, do one last pass to make sure the document has at least 15 complete entries and that each entry still matches an open official trial tab so I have visual proof to review.", + "website": "https://www.google.com", + "reference_length": 8, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "ClinicalTrials.gov is searched with filters that limit results to U.S.-based interventional lung cancer studies that are currently recruiting.", + "verification": "Grader can see the ClinicalTrials.gov results page with relevant search terms and visible recruiting/interventional/U.S. filtering applied.", + "weight": 0.12 + }, + "R2": { + "requirement": "At least 15 distinct official ClinicalTrials.gov study records are opened in separate tabs and left open.", + "verification": "Browser shows 15 or more open tabs corresponding to individual ClinicalTrials.gov trial pages, not just search results.", + "weight": 0.14 + }, + "R3": { + "requirement": "Each selected trial has its official trial name and recruiting status captured from the official study page, with recruiting status verified as active recruiting.", + "verification": "Document entries match visible trial titles and recruiting status on the open ClinicalTrials.gov tabs.", + "weight": 0.14 + }, + "R4": { + "requirement": "Each selected trial includes the study phase and treatment or intervention type taken from the official record.", + "verification": "For sampled entries, the phase and intervention details in the document match the corresponding fields on the open ClinicalTrials.gov pages.", + "weight": 0.12 + }, + "R5": { + "requirement": "Each selected trial includes U.S. 
recruiting locations from the official record.", + "verification": "For sampled entries, the listed cities or hospitals in the document match the locations section on the corresponding ClinicalTrials.gov pages.", + "weight": 0.12 + }, + "R6": { + "requirement": "The final set is validated so every included study is lung cancer related, interventional, currently recruiting, U.S.-based, and complete for all required fields, with any invalid studies replaced.", + "verification": "Final document contains only qualifying studies, and any replacements correspond to open official tabs that satisfy the criteria.", + "weight": 0.14 + }, + "R7": { + "requirement": "A CryptPad Document titled 'Lung Cancer Clinical Trials' is created and contains at least 15 entries with trial name, treatment type, trial phase, recruiting status, locations, and official ClinicalTrials.gov link.", + "verification": "CryptPad Document title is visible and the body contains 15 or more complete entries with all required fields and links.", + "weight": 0.12 + }, + "R8": { + "requirement": "The document includes a summary of the most common treatment approaches and the most frequent cities or hospitals, and the open official tabs remain available for visual cross-checking.", + "verification": "A summary section is visible in the CryptPad Document, and the browser still shows the official ClinicalTrials.gov tabs open.", + "weight": 0.1 + } + }, + "categories": [ + "Health > Medicine", + "Science and Education > Science and Education - Other" + ], + "num_categories": 2 + }, + { + "task_id": "5d157ce3b5a1d2ecbd01bc29e8b2c0a309971c33", + "confirmed_task": "I’m helping a friend who’s moving to Stanford for work and wants a realistic shortlist of apartments they could actually consider, so please use Apartments.com to search around Stanford University in Stanford/Palo Alto and keep it to places that look like about a 20-minute commute or less to campus. 
The budget is pretty specific: for a 1-bedroom, stay under $3,500 a month, and for a 2-bedroom that would work for roommates, stay under $6,000 a month. As you find matches, open the actual listing page for each apartment in its own tab so I can visually compare them later, and make sure each one really shows the rent and bedroom count on the listing itself before you keep it. For commute time, use Google Maps or the map/location details from the listing to estimate how long it would take to get to Stanford University, and only keep the ones that are still roughly within that 20-minute window. I’d like around 20 solid options if possible. Then create a CryptPad Sheets spreadsheet titled Stanford Apartment Options and log each one with the building name or street address, monthly rent, number of bedrooms, estimated commute time to Stanford University, and the direct listing link. Once the sheet is filled out, add a short note in the sheet about which nearby neighborhoods seem to have the most within-budget options, and leave the spreadsheet open along with the apartment tabs so I can look through the listings myself.", + "website": "https://www.google.com", + "reference_length": 7, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "A relevant Apartments.com search near Stanford University/Palo Alto is performed using criteria aligned with 1-bedroom under $3,500 and 2-bedroom under $6,000.", + "verification": "Grader can see Apartments.com search results or filters reflecting the Stanford/Palo Alto area and the stated bedroom and price constraints.", + "weight": 0.14 + }, + "R2": { + "requirement": "Promising apartment listings are opened in separate browser tabs from the search results.", + "verification": "Browser shows multiple open listing tabs corresponding to apartment result pages rather than only the search page.", + "weight": 0.11 + }, + "R3": { + "requirement": "Each included apartment is verified directly on its listing page for building 
name/address, rent, and bedroom count, and only qualifying listings are retained.", + "verification": "Open listing pages visibly display rent and bedroom information matching what is later recorded in the spreadsheet.", + "weight": 0.18 + }, + "R4": { + "requirement": "Each included apartment has an approximate commute time to Stanford University checked and is kept only if it is about 20 minutes or less.", + "verification": "Google Maps pages, map snippets, or recorded commute values show commute checks tied to the retained listings.", + "weight": 0.16 + }, + "R5": { + "requirement": "Around 20 qualifying apartments are collected, and each retained listing still corresponds to an open live listing tab.", + "verification": "There are approximately 20 entries retained and the browser still shows the associated apartment tabs open for visual confirmation.", + "weight": 0.17 + }, + "R6": { + "requirement": "A CryptPad Sheets spreadsheet titled 'Stanford Apartment Options' is created and includes for each listing the building name/address, monthly rent, bedroom count, estimated commute time, and listing URL.", + "verification": "Open CryptPad Sheets shows the specified title and rows with all required columns populated for the collected apartments.", + "weight": 0.16 + }, + "R7": { + "requirement": "The spreadsheet includes a short summary identifying which nearby neighborhoods appear to have the most within-budget listings, and the sheet remains open for review.", + "verification": "A visible note or summary section in the sheet names neighborhoods with the strongest concentration of qualifying listings, and the sheet is left open.", + "weight": 0.08 + } + }, + "categories": [ + "Business and Consumer Services > Real Estate" + ], + "num_categories": 1 + }, + { + "task_id": "f23a062af7be0d5a28f1dcb1f06cc79a89dd04d6", + "confirmed_task": "I’m helping a professor who works in natural language processing put together a serious funding list, and I want this to be 
something they can actually review in the browser afterward. Please start in CryptPad Sheets and create a spreadsheet called NLP Grant Opportunities so we have a clean place to track everything. Then use Google to search for active funding opportunities on official funder sites that are relevant to artificial intelligence, machine learning, computational linguistics, or NLP, focusing on opportunities that university faculty, professors, principal investigators, or academic researchers can apply for. As you find promising results, open the official opportunity page in its own tab, read enough of the page to confirm the call is still active and that academic applicants are eligible, and then record the program name, funding organization, research area or topic, award amount if the page lists one, application deadline, and the official link in the sheet. I need at least 20 distinct verified opportunities, and every row in the sheet should match an official grant page tab that stays open so I can visually review them one by one later. Once you’ve built the list, use the collected set to add a short summary in the sheet about what kinds of funders seem to support the most AI/NLP research—like federal agencies, foundations, nonprofits, or industry-backed research programs—and include your quick take on the strongest opportunities. 
Please leave the spreadsheet open at the end with the official grant tabs still open too.", + "website": "https://www.google.com", + "reference_length": 7, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "A CryptPad Sheets document titled 'NLP Grant Opportunities' is created and used as the main workspace.", + "verification": "Grader can see an open CryptPad Sheets document with the exact title visible in the header/tab.", + "weight": 0.08 + }, + "R2": { + "requirement": "The agent performs broad Google searches that target official funding sources relevant to AI, ML, computational linguistics, or NLP and academic eligibility.", + "verification": "Browser history or open search result pages show multiple relevant Google queries and results pointing to official funder domains.", + "weight": 0.12 + }, + "R3": { + "requirement": "Each included opportunity is verified on an official opportunity page as currently active.", + "verification": "Open tabs show official grant pages with visible status indicators, current cycle language, open call text, or deadlines that demonstrate the opportunity is active.", + "weight": 0.18 + }, + "R4": { + "requirement": "Each included opportunity is verified as open to academic researchers, professors, universities, principal investigators, or equivalent academic applicants.", + "verification": "Official pages or eligibility sections in open tabs visibly mention universities, faculty, academic institutions, PIs, or similar eligible applicant categories.", + "weight": 0.16 + }, + "R5": { + "requirement": "At least 20 distinct verified grant opportunities are collected, and each one corresponds to its own open official opportunity tab.", + "verification": "Spreadsheet contains at least 20 distinct rows and the browser shows a matching set of official grant tabs left open for review.", + "weight": 0.2 + }, + "R6": { + "requirement": "For each verified grant, the spreadsheet includes program name, funding organization, research 
area or topic, award amount if listed, application deadline, and official opportunity link.", + "verification": "Rows in the spreadsheet visibly contain all required fields, with links present and award cells filled when the official page lists an amount.", + "weight": 0.18 + }, + "R7": { + "requirement": "The spreadsheet includes a summary identifying which types of organizations appear to fund the most AI/NLP research and gives brief recommendations on strong opportunities.", + "verification": "A visible summary section in the sheet describes funder patterns such as federal agencies, foundations, nonprofits, or industry-backed programs and includes recommendation language.", + "weight": 0.08 + } + }, + "categories": [ + "Science and Education > Grants Scholarships and Financial Aid", + "Computers Electronics and Technology > Programming and Developer Software" + ], + "num_categories": 2 + }, + { + "task_id": "47b251d71185920165b7645139ead965cd47441a", + "confirmed_task": "I'm seriously thinking about boarding school for my child for high school, and I want a solid college-prep shortlist I can actually look through myself afterward. Please start on Google and use credible ranking or review sources to identify about 15 to 20 of the strongest U.S. boarding schools with strong academic reputations, then for each school open the actual admissions page in its own tab so I can compare them side by side. As you go, please make sure each school really does offer boarding and is clearly a college-preparatory high school, not just a day school or a specialty program. Then create a CryptPad Sheets spreadsheet called Top Boarding Schools and log each verified school with the school name, city and state, annual boarding tuition or total boarding cost, application deadline if the admissions site lists one, and the direct admissions page link. 
I also want browser-proof here, so please leave every admissions tab open for the schools you include, and if a tuition or deadline is buried on a separate tuition or apply page, open that page long enough to verify it before recording the number and then keep the admissions tab available. Once the sheet is filled out with around 15 to 20 strong options, add a short summary in the sheet or a companion CryptPad Document about the typical tuition range and where these schools are concentrated geographically, and finish with a brief recommendation note about the most compelling options so I have a practical starting point.", + "website": "https://www.google.com", + "reference_length": 7, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "A credible initial pool of top U.S. college-preparatory boarding schools is identified from authoritative Google search results or ranking sources.", + "verification": "Grader can confirm relevant Google results and/or opened source pages showing recognized rankings, reviews, or roundup lists that support the candidate pool.", + "weight": 0.14 + }, + "R2": { + "requirement": "Admissions pages are opened in separate tabs for about 15 to 20 promising schools.", + "verification": "Browser shows roughly 15 to 20 school-domain tabs open on admissions pages, one per included school.", + "weight": 0.14 + }, + "R3": { + "requirement": "Each included school is verified to offer boarding and to be a college-preparatory high school.", + "verification": "Visible content on school admissions, residential life, academics, or about pages confirms boarding availability and college-preparatory secondary education for each included school.", + "weight": 0.18 + }, + "R4": { + "requirement": "The required fields are accurately extracted for each verified school: school name, location, annual boarding tuition or total boarding cost, application deadline if listed, and admissions page link.", + "verification": "Spreadsheet entries match the 
visible information on the school sites, including cost and deadline values where available and direct admissions URLs.", + "weight": 0.22 + }, + "R5": { + "requirement": "A CryptPad Sheets file titled Top Boarding Schools is created and populated in a clear structured format.", + "verification": "CryptPad Sheets shows a spreadsheet with the correct title and a usable table containing the collected school data.", + "weight": 0.12 + }, + "R6": { + "requirement": "A final synthesis is added summarizing the typical tuition range, geographic distribution, and brief recommendations.", + "verification": "The sheet or companion CryptPad Document contains a concise written summary discussing tuition patterns, regional concentration, and standout schools.", + "weight": 0.12 + }, + "R7": { + "requirement": "The final set includes about 15 to 20 strong schools and each documented school corresponds to an open admissions tab left available for review.", + "verification": "The number of spreadsheet rows aligns with the number of open admissions tabs, and the tabs remain open on the relevant school admissions pages at task end.", + "weight": 0.08 + } + }, + "categories": [ + "Science and Education > Education" + ], + "num_categories": 1 + }, + { + "task_id": "940d8aaa7700347c9fd9a0508e5de2e07c23cdb5", + "confirmed_task": "I’m helping a friend figure out housing near MIT in Cambridge, so could you use a real browser to look for apartments that are roughly within a 20-minute commute to MIT and keep this organized for me? Start with Google to find solid rental sites that actually have Cambridge and nearby Boston-area listings, then use places like Apartments.com and any other major listing sources you find to search for either 1-bedroom apartments under $3,000 a month or 2-bedroom apartments under $5,000 a month for a roommate setup. 
As you find listings that seem to fit, open each actual listing page in its own tab so I can visually compare the photos, addresses, and details later, and only keep tabs open for listings that really match the bedroom and budget limits. For every listing you keep, verify the rent, bedroom count, and building name or address on the listing page itself, then use Google Maps to check the commute to MIT and keep only the ones that look to be about 20 minutes or less. After that, create a spreadsheet in CryptPad Sheets called MIT Apartment Options and record about 20 good options with the building or address, monthly rent, number of bedrooms, estimated commute time to MIT, and the direct listing link. Please make sure every row in the sheet corresponds to a listing tab that is still open on the actual apartment page, because I want to be able to click around and inspect them afterward. Once the sheet is filled out, add a short note summarizing which neighborhoods seem to have the most within-budget options so I can see where the best concentration of listings is.", + "website": "https://www.google.com", + "reference_length": 7, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "Identify suitable apartment listing sources via Google and begin a search focused on rentals near MIT in Cambridge and nearby neighborhoods.", + "verification": "Browser history or visible search results show Google used to locate apartment marketplaces relevant to Cambridge/Boston rentals near MIT.", + "weight": 0.1 + }, + "R2": { + "requirement": "Open candidate apartment listings from rental sites in separate tabs using the specified bedroom and price constraints.", + "verification": "Multiple apartment listing tabs are visibly open from rental marketplace sites, and the listings reflect searches for 1-bedroom under $3,000 or 2-bedroom under $5,000.", + "weight": 0.2 + }, + "R3": { + "requirement": "Verify each kept listing’s rent, bedroom count, building/address, and direct 
listing URL from the actual listing page, removing non-qualifying options.", + "verification": "Open tabs show listing pages with visible rent and bedroom details, and only qualifying listings remain represented in the working set.", + "weight": 0.2 + }, + "R4": { + "requirement": "Check commute times to MIT in Google Maps and keep only listings that are roughly within a 20-minute commute.", + "verification": "Google Maps routes are used for listing addresses, and the final set reflects commute times at or around 20 minutes or less.", + "weight": 0.18 + }, + "R5": { + "requirement": "Create a CryptPad Sheets spreadsheet titled 'MIT Apartment Options' with about 20 qualifying listings and the required columns.", + "verification": "A CryptPad Sheets file with the exact title is visible and contains approximately 20 rows of apartment entries with building/address, rent, bedrooms, commute time, and listing link.", + "weight": 0.18 + }, + "R6": { + "requirement": "Ensure each spreadsheet entry corresponds to a currently open tab on that apartment’s actual listing page.", + "verification": "The browser shows open listing tabs matching the entries in the spreadsheet, allowing direct visual cross-checking between rows and tabs.", + "weight": 0.08 + }, + "R7": { + "requirement": "Provide a brief summary of which neighborhoods appear to have the most qualifying within-budget listings.", + "verification": "A visible note or text summary identifies neighborhoods with the highest concentration of qualifying options based on the collected sheet entries.", + "weight": 0.06 + } + }, + "categories": [ + "Business and Consumer Services > Real Estate" + ], + "num_categories": 1 + }, + { + "task_id": "a5724e1c94ac221f0a53765c51f625b7bc3cc58e", + "confirmed_task": "I’m helping a university researcher who works on AI safety and alignment, and I need a solid list of current funding options they could realistically apply for. 
Please start on Google and search for active grant programs from official funder sites that support research in artificial intelligence, AI safety, alignment, trustworthy machine learning, or responsible AI, and focus on opportunities where academic researchers or university-based investigators are eligible. As you find promising ones, open the official opportunity page for each grant in its own tab so I can visually compare them later, and only keep it if the page itself makes it clear the opportunity is still active or open rather than archived or closed. I need at least 15 verified opportunities, and for each one please pull the program name, funding organization, research topic or focus area, award size if the page lists one, the application deadline, and the official URL. Once you’ve gathered enough, go to CryptPad Documents and create a document titled AI Safety Grant Opportunities, then put the grants into a clean table or structured list that matches the open tabs one-for-one. After that, add a short summary at the end explaining what kinds of organizations seem to fund AI safety-related research most often—like government agencies, private foundations, nonprofits, industry labs, or academic consortia—because I want to see where the strongest funding patterns are. 
Please leave the CryptPad Document open at the end, and keep the official grant tabs open too so I can spot-check the pages myself.", + "website": "https://www.google.com", + "reference_length": 9, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "Searches broadly on Google and identifies plausible AI safety, alignment, responsible AI, AI, or machine learning research funding opportunities from official sources.", + "verification": "Browser history or visible search results show Google queries and candidate results leading to official funding pages.", + "weight": 0.12 + }, + "R2": { + "requirement": "Opens official grant or opportunity pages in separate tabs for candidate opportunities.", + "verification": "Multiple browser tabs are open on official funding domains, each showing a distinct opportunity page.", + "weight": 0.1 + }, + "R3": { + "requirement": "Confirms that retained opportunities are active or open rather than expired, archived, or clearly closed.", + "verification": "Visible page text on retained tabs indicates active status, open call language, current cycle information, or upcoming deadlines.", + "weight": 0.14 + }, + "R4": { + "requirement": "Verifies that academic researchers, universities, or academic institutions are eligible for each included opportunity.", + "verification": "Eligibility sections on retained tabs mention universities, faculty, investigators, academic institutions, or equivalent academic participation.", + "weight": 0.14 + }, + "R5": { + "requirement": "Collects at least 15 verified grant opportunities relevant to AI safety, alignment, AI, machine learning, or responsible AI research.", + "verification": "The final CryptPad Document contains 15 or more distinct entries, each corresponding to a verified official grant tab.", + "weight": 0.16 + }, + "R6": { + "requirement": "For each included opportunity, records program name, funding organization, research topic or focus area, award size if listed, application 
deadline, and official link.", + "verification": "Each row or entry in the CryptPad Document includes all required fields, with award size marked only when available on the source page.", + "weight": 0.14 + }, + "R7": { + "requirement": "Creates a CryptPad Document titled 'AI Safety Grant Opportunities' containing the compiled grant records.", + "verification": "An open CryptPad Document with the exact title is visible and includes the compiled opportunities.", + "weight": 0.08 + }, + "R8": { + "requirement": "Ensures each final document entry corresponds to an open tab with the official grant page.", + "verification": "The number and identity of listed opportunities can be matched against open official tabs still visible in the browser.", + "weight": 0.06 + }, + "R9": { + "requirement": "Adds a concluding summary identifying which types of organizations most frequently fund AI safety-related research among the collected opportunities and highlights key takeaways or recommendations.", + "verification": "The CryptPad Document ends with a written summary discussing organization categories and observed funding patterns.", + "weight": 0.06 + } + }, + "categories": [ + "Science and Education > Grants Scholarships and Financial Aid", + "Computers Electronics and Technology > Computers Electronics and Technology - Other" + ], + "num_categories": 2 + }, + { + "task_id": "e96aa77ab19737990cfa7a4da23533f2b0a0de92", + "confirmed_task": "I’m trying to put together a solid shortlist of the best hospitals in Texas for cardiac surgery for a family reference, so could you research this in a real browser and keep it grounded in actual hospital program pages and recognizable rankings? Start on Google and look for authoritative sources that would help identify strong Texas heart surgery centers, like U.S. 
News, Healthgrades, Leapfrog, CMS-related quality pages, or major hospital recognition pages, because I want the final list to be based on visible quality signals rather than guesswork. Then use U.S. News and those other quality indicators to narrow it down to the top 10 Texas hospitals for cardiac care or heart surgery. For each hospital you choose, open the actual cardiac surgery, heart surgery, or heart and vascular program page in its own tab and make sure the page clearly shows they offer advanced heart surgery services like CABG, valve repair or replacement, aortic surgery, or similar procedures. After that, create a CryptPad Sheets spreadsheet titled Top Texas Cardiac Hospitals and enter one row per hospital with the hospital name, city, a short description of the cardiac surgery program, whether it appears in rankings or quality indicators, and the direct link to the program page. Please leave all 10 hospital program tabs open so I can visually compare them later, and also keep the spreadsheet open in another tab. Once the sheet is filled out, add a short written summary in the sheet or a companion CryptPad Document explaining which Texas cities seem to have the strongest cardiac surgery centers based on how many top hospitals show up there and how prominent they are, so I can quickly see the main patterns and your top recommendations.", + "website": "https://www.google.com", + "reference_length": 7, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "Authoritative ranking or quality indicator sources relevant to Texas cardiac care hospitals are identified and used as the basis for selection.", + "verification": "Grader can confirm from browser history, open tabs, or notes that Google results and recognized sources such as U.S. 
News, Healthgrades, Leapfrog, CMS-related pages, or comparable quality sources were consulted.", + "weight": 0.14 + }, + "R2": { + "requirement": "A final set of exactly 10 Texas hospitals is selected based on visible ranking presence or quality indicators for cardiac care or heart surgery.", + "verification": "Grader can count exactly 10 hospitals in the final sheet and see that each one has some ranking or quality-indicator notation tied to the researched sources.", + "weight": 0.18 + }, + "R3": { + "requirement": "Each selected hospital has its cardiac surgery, heart surgery, or heart and vascular program page opened in a separate browser tab.", + "verification": "Grader can visually confirm 10 distinct hospital program tabs are open, each corresponding to one hospital listed in the spreadsheet.", + "weight": 0.14 + }, + "R4": { + "requirement": "For each selected hospital, the agent verifies that advanced heart surgery services are offered.", + "verification": "Grader can inspect the open hospital pages and see explicit references to advanced cardiac surgery services such as CABG, valve procedures, aortic surgery, or equivalent surgical offerings.", + "weight": 0.16 + }, + "R5": { + "requirement": "For each of the 10 hospitals, the required fields are accurately captured: hospital name, city, cardiac program description, ranking or quality appearance, and program page link.", + "verification": "Grader can compare the spreadsheet rows against the open hospital tabs and ranking sources to confirm all five fields are present and consistent for all 10 entries.", + "weight": 0.18 + }, + "R6": { + "requirement": "A CryptPad Sheets spreadsheet titled 'Top Texas Cardiac Hospitals' is created and populated with the 10 hospital records.", + "verification": "Grader can see an open CryptPad Sheets tab with the exact title and a structured table containing 10 rows of hospital data.", + "weight": 0.12 + }, + "R7": { + "requirement": "The final output includes a concise 
summary identifying which Texas cities appear to have the strongest cardiac surgery centers and the spreadsheet and hospital tabs remain open for visual review.", + "verification": "Grader can see the summary text in the sheet or companion CryptPad Document and confirm the spreadsheet tab plus the hospital program tabs are still open.", + "weight": 0.08 + } + }, + "categories": [ + "Health > Medicine", + "Health > Health - Other" + ], + "num_categories": 2 + }, + { + "task_id": "ca5c6ddf8b347ee0935c6044fe65cd182e4fb26c", + "confirmed_task": "I’m trying to piece together a pretty complicated trip and want your help doing it in the browser so I can actually see the options. On Google Flights, please start with an early-December flight from Pittsburgh to Hawaii, using a real Hawaii destination like Honolulu if that gives the best deal, because I want to break up the trip with a few days there before heading to Australia; I’d prefer a morning or late-night departure from Pittsburgh if possible, and since there are no direct flights, find me a reasonable connecting itinerary and open the best option in its own tab so I can look at the timing and price. Once you’ve got those Hawaii dates, go to Booking.com and find a good-value resort for 2 adults in Hawaii for about 3 nights that fits those flight dates, ideally somewhere well-reviewed near the beach with a private room and free cancellation if available, and open the actual property page with photos and map view so I can judge whether it feels worth it. After that, go back to Google Flights and look for a Hawaii-to-Sydney flight that leaves in the morning or at night, not the afternoon, using the Hawaii stay you picked to set the departure date; find a reasonable option and keep that result open in a separate tab too. 
Then on Booking.com, find me a 1-week stay in Sydney for 2 adults that’s close to the Sydney Opera House, ideally walkable or clearly nearby on the map, and open the listing page plus the map so I can verify the location myself. Once that’s set, use Google Flights again to find a Sydney-to-Tokyo flight after the Sydney stay, with only morning or night departures, and pick a reasonable option that keeps the trip flowing logically. Finally, put everything into a CryptPad Document with the flights, hotels, dates, times, airports, nightly or total lodging costs, and a full trip total, and leave the doc open along with the key tabs for the Hawaii resort and Sydney hotel so I can review the visual details.", + "website": "https://www.google.com", + "reference_length": 6, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "Identify at least one viable early-December Pittsburgh-to-Hawaii connecting flight itinerary, including airline(s), departure and arrival airports, dates, times, and total price, with preference given to morning or late-night departure from Pittsburgh when available.", + "verification": "Grader can confirm a Google Flights results/details tab is open showing a PIT to Hawaii itinerary with connection(s), visible dates, times, airports, airline(s), and fare.", + "weight": 0.18 + }, + "R2": { + "requirement": "Select a Hawaii resort for 2 adults for about 3 nights that aligns with the chosen Hawaii stopover dates, including property name, occupancy/room details, nightly or total cost, and location/value characteristics.", + "verification": "Grader can confirm a Booking.com property page is open with matching dates, 2-adult occupancy, resort details, visible price, and map or photo evidence.", + "weight": 0.16 + }, + "R3": { + "requirement": "Find at least one Hawaii-to-Sydney flight option that departs in the morning or at night and avoids afternoon departure, with departure airport, date, departure time, arrival time, airline(s), route, and 
total price.", + "verification": "Grader can confirm a Google Flights tab is open for Hawaii to Sydney showing the selected itinerary and visible departure time outside the afternoon window.", + "weight": 0.18 + }, + "R4": { + "requirement": "Select accommodation in Sydney for 2 adults for one week that is close to the Sydney Opera House, including property name, room/occupancy details, dates, total cost, and clear proximity information.", + "verification": "Grader can confirm a Booking.com listing and map view are open showing the property location relative to the Sydney Opera House, along with dates and pricing.", + "weight": 0.16 + }, + "R5": { + "requirement": "Find at least one reasonable Sydney-to-Tokyo flight option after the Sydney stay with morning or night departure only, including date, departure time, arrival time, airline(s), route, and total price.", + "verification": "Grader can confirm a Google Flights result is shown for Sydney to Tokyo with a visible departure time that is morning or night, plus fare and route details.", + "weight": 0.14 + }, + "R6": { + "requirement": "Compile a complete multi-city itinerary in CryptPad Document covering Pittsburgh to Hawaii, Hawaii stay, Hawaii to Sydney, Sydney stay, and Sydney to Tokyo, with dates in chronological order, itemized costs, and an overall total.", + "verification": "Grader can confirm a CryptPad Document is open containing all five trip components with dates, times, airports/properties, prices, and a summed total.", + "weight": 0.18 + } + }, + "categories": [ + "Travel and Tourism > Air Travel", + "Travel and Tourism > Accommodation and Hotels", + "Travel and Tourism > Car Rentals" + ], + "num_categories": 3 + }, + { + "task_id": "9ad01a4a4bda2e8df7489c9831931b044c646a20", + "confirmed_task": "I’m trying to get a realistic shortlist of shoulder surgeons in Chicago because I may need surgery for a rotator cuff or labrum issue, and I want something more trustworthy than random review sites. 
Please start on Google and search for Chicago orthopedic surgeons who clearly specialize in shoulder surgery, especially rotator cuff repair or labrum repair, and use official hospital or orthopedic practice profile pages as the main sources. As you find strong candidates, open each surgeon’s official profile in its own tab so I can visually compare them later, and only keep people whose actual profile page clearly mentions shoulder surgery, shoulder conditions, rotator cuff repair, labrum repair, sports medicine with shoulder focus, or similar shoulder-specific procedures. While you work, create a CryptPad Sheets spreadsheet called Top Shoulder Surgeons Chicago and track the finalists there with columns for surgeon name, hospital or medical center affiliation, specialty focus, the exact confirmation that shoulder surgery is listed, and the link to the profile page. From the verified candidates, narrow it to the top 10 Chicago surgeons who seem especially strong for shoulder surgery based on what you can see on their official profiles, like shoulder specialization, fellowship training, leadership roles, sports medicine focus, or detailed shoulder procedure listings. 
Please leave all 10 profile tabs open so I can inspect the pages myself, then finish the sheet with a short summary of which hospitals or orthopedic centers show up most often among the 10 specialists.", + "website": "https://www.google.com", + "reference_length": 6, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "A CryptPad Sheets spreadsheet titled 'Top Shoulder Surgeons Chicago' is created and used as the workspace.", + "verification": "Grader can see a spreadsheet with the exact title open in CryptPad Sheets.", + "weight": 0.1 + }, + "R2": { + "requirement": "Research is conducted on Google using official hospital or orthopedic practice sources to build a relevant Chicago candidate pool for shoulder surgery, rotator cuff repair, or labrum repair.", + "verification": "Browser history/tabs show Google searches and resulting official physician or hospital profile pages relevant to Chicago shoulder specialists.", + "weight": 0.15 + }, + "R3": { + "requirement": "Official profile pages are opened in separate tabs and each selected surgeon is verified from the page itself as performing shoulder surgery or treating shoulder-specific conditions/procedures.", + "verification": "Open tabs display official surgeon profile pages, and visible page text confirms shoulder surgery, shoulder conditions, rotator cuff repair, labrum repair, or equivalent shoulder-focused treatment.", + "weight": 0.25 + }, + "R4": { + "requirement": "Exactly 10 Chicago surgeons are selected as the top specialists based on evidence visible on their official profiles.", + "verification": "Spreadsheet contains exactly 10 surgeon entries, each corresponding to a Chicago-based surgeon supported by an official profile tab.", + "weight": 0.2 + }, + "R5": { + "requirement": "Each of the 10 spreadsheet entries includes surgeon name, hospital or medical center affiliation, specialty focus, explicit confirmation that shoulder surgery is listed, and the profile link.", + "verification": "Each 
row in the spreadsheet has all required fields populated with usable links and shoulder-specific confirmation text.", + "weight": 0.2 + }, + "R6": { + "requirement": "Every surgeon listed in the spreadsheet has a corresponding official profile tab left open, and the sheet includes a brief summary of which hospitals or orthopedic centers appear most frequently among the top 10.", + "verification": "There are 10 matching open profile tabs for the 10 listed surgeons, and the spreadsheet contains a written frequency summary of recurring hospitals or orthopedic centers.", + "weight": 0.1 + } + }, + "categories": [ + "Health > Medicine", + "Health > Health - Other" + ], + "num_categories": 2 + }, + { + "task_id": "e1b99d1777a0aa911745b7ca02ba94ef10d7d45b", + "confirmed_task": "I’m helping a high school student who’s pretty serious about engineering and wants an on-campus summer program that actually feels academic, not just a general camp, so could you use Google to find university-hosted engineering summer programs for high school students that are in person, take place on the university campus, run for less than 6 weeks, and involve college-level or clearly advanced coursework. As you find promising ones, open each actual university program page in its own tab and verify from the page itself that it really is for high school students, that it’s in person, and that the length fits; I want at least 12 that genuinely match. Then create a CryptPad Sheets spreadsheet called Engineering Summer Programs and record, for each one, the university, program name, program length, subject focus, application deadline if it’s listed, and the direct link to the program page. Please keep the tabs for all qualifying programs open so I can visually compare the pages afterward, and if a page has photos or campus details visible, open the actual listing rather than a summary page so I can see that it’s a real campus-based program. 
Once the sheet is filled in, add a short note at the bottom about which U.S. regions seem to have the most engineering summer programs based on the set you found, just so I have a quick sense of where the strongest concentration is.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "A broad but relevant candidate pool of university-hosted engineering summer programs for high school students is gathered through Google Search.", + "verification": "Grader can confirm Google was used to surface multiple university program candidates relevant to the stated constraints before verification on university sites.", + "weight": 0.14 + }, + "R2": { + "requirement": "At least 12 programs are verified on actual university program pages as meeting all required constraints: high school audience, engineering-focused, in person on a university campus, less than 6 weeks long, and involving college-level or clearly advanced coursework.", + "verification": "Grader can confirm at least 12 open university tabs or corresponding entries tied to live program pages showing the qualifying details on-page, including evidence of college-level or advanced academic content.", + "weight": 0.34 + }, + "R3": { + "requirement": "For each qualifying program, the university, program name, program length, subject focus, application deadline if listed, and direct program link are accurately extracted.", + "verification": "Grader can compare spreadsheet entries against the open program tabs and confirm the required fields are present and consistent with the source pages.", + "weight": 0.24 + }, + "R4": { + "requirement": "A CryptPad Sheets spreadsheet titled 'Engineering Summer Programs' is created and populated with at least 12 qualifying program entries in a clear structured format.", + "verification": "Grader can see a CryptPad Sheet with the exact title and at least 12 rows of program data organized into usable columns.", + 
"weight": 0.18 + }, + "R5": { + "requirement": "The spreadsheet includes a brief summary stating which U.S. regions appear to have the most engineering summer programs based on the identified set.", + "verification": "Grader can see a written summary note in the sheet that references regional distribution derived from the collected programs.", + "weight": 0.1 + } + }, + "categories": [ + "Science and Education > Education", + "Science and Education > Universities and Colleges", + "Heavy Industry and Engineering > Heavy Industry and Engineering - Other" + ], + "num_categories": 3 + }, + { + "task_id": "bacbe73cdb06541360047d8c90677f7d569172bd", + "confirmed_task": "I want to do market research on the most popular cafes in Singapore. Analyse the menus of the top 10 cafes in singapore (by Google reviews/ratings), and make sure we include at least 1 from the North/South/East/West/Central regions of Singapore. Keep the relevant pages of each cafe open, and summarise their pricing, menu offerings, unique selling points, making sure to reference which tab is opened for each cafe. For each cafe, also help me figure out how long it would take to get to it from Tampines MRT, and include this in your final summary.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "The top 10 most popular cafes in Singapore are identified using Google reviews/ratings, with evidence of their popularity (e.g. 
star ratings, review counts).", + "verification": "Grader can confirm that 10 cafes are listed with Google review ratings or review counts as evidence of ranking.", + "weight": 0.12 + }, + "R2": { + "requirement": "At least 1 cafe from each of the 5 Singapore regions (North, South, East, West, Central) is included in the selection.", + "verification": "Grader can confirm at least 5 distinct regions are represented with at least 1 cafe each, and the region assignment is geographically accurate.", + "weight": 0.14 + }, + "R3": { + "requirement": "The relevant menu or information page for each of the 10 cafes is kept open in a separate tab, with each tab clearly referenced in the summary.", + "verification": "Grader can see 10 open tabs corresponding to the 10 cafes, and the summary text references which tab belongs to which cafe.", + "weight": 0.16 + }, + "R4": { + "requirement": "Pricing information is summarised for each cafe, including specific menu item prices or price ranges.", + "verification": "Grader can confirm each cafe entry includes concrete pricing data (not just vague descriptors) sourced from the open menu pages.", + "weight": 0.16 + }, + "R5": { + "requirement": "Menu offerings and unique selling points are summarised for each cafe.", + "verification": "Grader can confirm each cafe entry includes a description of menu highlights and at least one unique selling point or differentiator.", + "weight": 0.16 + }, + "R6": { + "requirement": "Travel time from Tampines MRT to each of the 10 cafes is calculated and included in the final summary.", + "verification": "Grader can confirm each cafe entry includes an estimated travel time from Tampines MRT with the transport mode indicated (e.g. 
public transit, driving).", + "weight": 0.14 + }, + "R7": { + "requirement": "A final structured summary combines all information (cafe name, region, pricing, menu highlights, USPs, tab reference, travel time) in a clear format.", + "verification": "Grader can see a complete summary or table that consolidates all required fields for all 10 cafes in an organised, readable format.", + "weight": 0.12 + } + }, + "categories": [ + "Food and Drink > Restaurants and Delivery", + "Food and Drink > Beverages" + ], + "num_categories": 2 + }, + { + "task_id": "543918a53f9196e0f77783e1dc4a9db90ebc6eb9", + "confirmed_task": "I want to develop the best banana bread recipe. Look up the top 10 recipes online (by engagement, popularity, reviews) and compare the recipes (e.g. composition of ingredients, additions, cooking method), identifying and highlighting similarities and unique points that make each recipe good. Keep the most unique or highly reviewed 3 recipes in open tabs so I can reference them, and make sure at least one has a YouTube video (also keep this video open and start playing it). 
Then, from these three, create the best recipe you can combining aspects of these and provide me with step by step instructions.", + "website": "https://www.google.com", + "reference_length": 5, + "level": "hard", + "rubrics": { + "R1": { + "requirement": "The top 10 banana bread recipes online are identified and ranked by engagement, popularity, or reviews, with evidence of their ranking.", + "verification": "Grader can confirm 10 recipes are listed with source URLs and evidence of popularity such as review counts, star ratings, or engagement metrics.", + "weight": 0.12 + }, + "R2": { + "requirement": "Ingredient compositions, additions, and cooking methods are extracted and compared across all 10 recipes, identifying similarities and unique differentiators.", + "verification": "Grader can confirm a comparison analysis is provided that highlights common base ingredients across recipes and calls out unique additions or techniques for each.", + "weight": 0.2 + }, + "R3": { + "requirement": "The 3 most unique or highly reviewed recipes are selected and kept open in separate tabs for reference.", + "verification": "Grader can see 3 recipe tabs open and the selection rationale (uniqueness or review quality) is explained.", + "weight": 0.16 + }, + "R4": { + "requirement": "At least one of the 3 selected recipes has an associated YouTube video that is opened in a tab and started playing.", + "verification": "Grader can confirm a YouTube video tab is open and playing for at least one of the selected recipes.", + "weight": 0.14 + }, + "R5": { + "requirement": "A combined best banana bread recipe is created drawing from the strengths of the top 3 selected recipes, with a complete ingredient list and step-by-step instructions.", + "verification": "Grader can see a full recipe with numbered steps and a complete ingredient list, with clear attribution of which elements were drawn from which source recipes.", + "weight": 0.22 + }, + "R6": { + "requirement": "The combined recipe 
explains why specific elements were chosen from each source recipe.", + "verification": "Grader can confirm the final recipe includes reasoning for ingredient or method choices tied back to the comparison analysis.", + "weight": 0.16 + } + }, + "categories": [ + "Food and Drink > Cooking and Recipes" + ], + "num_categories": 1 + }, +{ + "confirmed_task": "I’m trying to buy a genuinely good budget suitcase for a future trip, and I don’t want a fake bargain that only looks good because one eBay listing has stars. Start on eBay with suitcase listings under $100, but then broaden into a real shopping decision: find 10 total live suitcase options under $100 from public product or listing pages across eBay and other major public retailers or marketplaces if needed, with at least 4 of the options coming from eBay. Keep this focused on actual suitcases rather than backpacks or duffels, and prefer recognizable luggage brands when they fit the budget, but include lesser-known brands too if the value looks strong. For each option, compare the visible price, estimated shipping if shown, size category, dimensions, weight, hard-side versus soft-side, wheel configuration, visible customer rating and review count, and return policy or seller/retailer protection if that is shown; if any field is missing, just mark it as not shown. I also want you to sanity-check quality instead of trusting one marketplace blindly, so for the most promising 5 options, look for the same model or a closely matching product page on another public site to verify specs or to see whether the price and review story still look good. As you work, keep the strongest 3 candidate product pages open, plus at least 2 comparison or evidence tabs that helped you rule things in or out, so I can inspect the reasoning visually. 
At the end, recommend exactly 1 best overall suitcase under $100, exactly 2 runner-up picks for different priorities like lowest cost or best reviews, and briefly explain the tradeoffs that made you choose them.", + "level": "hard", + "reference_length": 7, + "rubrics": { + "R1": { + "requirement": "The browsing session identifies 10 total live suitcase options under $100 from public product or listing pages, with at least 4 of those options coming from eBay.", + "verification": "Count the recorded options and verify that all are suitcases priced under $100 on public pages, and that at least 4 are sourced from eBay pages.", + "weight": 0.18 + }, + "R2": { + "requirement": "Each of the 10 options includes the requested comparison fields: visible price, estimated shipping if shown, size category, dimensions, weight, hard-side versus soft-side, wheel configuration, visible customer rating and review count, and return policy or seller/retailer protection, with missing fields marked as not shown.", + "verification": "Inspect the final comparison output and confirm that every option has all requested fields populated or explicitly marked not shown where unavailable.", + "weight": 0.22 + }, + "R3": { + "requirement": "The set stays focused on actual suitcases rather than backpacks or duffels, and it prefers recognizable luggage brands when they fit the budget while still allowing lesser-known brands when the value appears strong.", + "verification": "Review the 10 selected products and confirm they are suitcases; check that the mix includes recognizable luggage brands where available within budget rather than only generic items.", + "weight": 0.1 + }, + "R4": { + "requirement": "For the most promising 5 options, the session finds the same model or a closely matching product page on another public site to verify specs or compare whether the price and review story still hold up.", + "verification": "Confirm that 5 shortlisted options each have an additional public 
cross-site verification page or closely matching page used for spec, price, or review sanity-checking.", + "weight": 0.18 + }, + "R5": { + "requirement": "The browser is left with the strongest 3 candidate product pages open, plus at least 2 comparison or evidence tabs that were used to support or reject options.", + "verification": "Check the open tabs at the end for 3 finalist product pages and at least 2 additional evidence or comparison pages relevant to the decision.", + "weight": 0.12 + }, + "R6": { + "requirement": "The final recommendation names exactly 1 best overall suitcase under $100 and exactly 2 runner-up picks for different priorities such as lowest cost or best reviews.", + "verification": "Inspect the final summary and confirm there is one clearly labeled best overall pick and two clearly labeled runner-up picks with distinct priorities.", + "weight": 0.1 + }, + "R7": { + "requirement": "The final summary briefly explains the tradeoffs behind the best overall pick and the two runner-up picks using the compared evidence.", + "verification": "Check that the concluding recommendation includes concise reasoning tied to factors like total cost, features, review strength, protection/returns, or spec verification.", + "weight": 0.1 + } + }, + "task_id": "a89f1d8d1b3d2b74d784c82e09c2511e7e1587fd", + "website": "https://www.ebay.com" + }, + { + "confirmed_task": "I want to file a Virginia Department of Education public-records request, but before we submit anything I want you to help me figure out what request is actually worth filing and how to phrase it so it is specific, lower-cost, and more likely to succeed. Start by checking the VDOE FOIA page and the VDOE NextRequest portal so we understand the real submission options, response timeline, fee rules, and the main exemptions or privacy limits that could matter. 
Then search the public VDOE site to see whether likely records are already posted, because I do not want to waste a FOIA request on material that is already public. After that, identify 4 plausible VDOE record-request angles that an ordinary requester could credibly pursue from public clues on the site, and for each one note what page suggests the records exist, what office or records custodian seems most relevant, what narrowing choices could reduce fees, and any obvious exemption or FERPA/privacy problem. From those 4, choose the 2 strongest candidates and draft a complete FOIA request for each with a clear date range, reasonably specific record description, preferred electronic delivery, and a request for a cost estimate before any chargeable work proceeds. Keep the most useful evidence tabs open, including the VDOE FOIA page, the NextRequest new-request page, and the key public pages that support the 2 finalist request ideas. Finally, carry the better of the 2 drafts through the VDOE NextRequest workflow to the final submission or confirmation step if that can be done on public pages, and leave that page open. End with a concise comparison memo explaining which draft you chose, why it looks like the best balance of usefulness versus fee/exemption risk, what response timeline VDOE states, and what follow-up I should expect next. 
Put each of the drafts as seperate documents in CryptoPad Documents for me to review.", + "level": "hard", + "reference_length": 9, + "rubrics": { + "R1": { + "requirement": "The session verifies the live VDOE FOIA rules and VDOE NextRequest workflow, including submission options, stated response timeline, fee rules, and relevant exemption or privacy limits.", + "verification": "Browser evidence shows the VDOE FOIA page and the VDOE NextRequest portal were opened, and the final memo accurately summarizes those live rules and workflow details.", + "weight": 0.17 + }, + "R2": { + "requirement": "The session checks the public VDOE site for whether likely records are already available before filing, rather than assuming FOIA is necessary.", + "verification": "Opened public VDOE pages are used as evidence of record categories or already-published materials, and the memo distinguishes between what appears public versus what still seems FOIA-worthy.", + "weight": 0.14 + }, + "R3": { + "requirement": "Exactly 4 plausible VDOE record-request angles are identified, and for each one the work notes the public clue that the records exist, the most relevant office or custodian, narrowing choices that could reduce fees, and any obvious exemption or FERPA/privacy concern.", + "verification": "The final output lists 4 distinct request angles with all four requested elements for each, supported by the public pages visited.", + "weight": 0.2 + }, + "R4": { + "requirement": "From those 4 angles, exactly 2 strongest candidates are selected and each is turned into a complete draft FOIA request with a clear date range, specific record description, preferred electronic delivery, and a request for a cost estimate before charges proceed, and each of the 2 draft requests is placed in its own CryptoPad Document for review.", + "verification": "The final output contains 2 full draft requests and each includes all four requested drafting elements; the final deliverables include 2 separate 
CryptoPad Documents containing those full draft requests.", + "weight": 0.19 + }, + "R5": { + "requirement": "The most useful evidence tabs are kept open, including the VDOE FOIA page, the NextRequest new-request page, and the key public pages supporting the 2 finalist request ideas.", + "verification": "The browser state at the end includes those required pages open and visibly available for review.", + "weight": 0.12 + }, + "R6": { + "requirement": "The better of the 2 drafts is carried through the VDOE NextRequest workflow to the final submission or confirmation step if reachable on public pages, and the task ends with a concise comparison memo explaining the chosen draft, the usefulness-versus-fee/exemption tradeoff, VDOE’s stated timeline, and expected follow-up.", + "verification": "A final portal step or confirmation page is left open when reachable, and the closing memo covers all four requested comparison points.", + "weight": 0.18 + } + }, + "task_id": "48a6cb73b8d54934ae1ad3d50abdf17a4dcd6b42", + "website": "https://vaedu.nextrequest.com" + }, + { + "confirmed_task": "I’m trying to decide whether Silver Cross is actually a strong baby brand to buy from for a future first-child stroller setup, not just whether the marketing sounds nice, so please do a serious browser-based comparison that would help me make the call. Start by identifying one current full-size stroller or travel-system option from Silver Cross that seems representative of the brand, then compare it against at least 5 competing premium or upper-midrange brands that people realistically cross-shop, using public product pages plus independent reviews and owner feedback. 
For Silver Cross specifically, I want you to gather at least 6 concrete reputation or quality claims from independent sources, with at least 3 positives and at least 2 meaningful drawbacks, and make sure they are specific things like build quality, ride quality, fold, weight, fabric quality, durability, customer service, or long-term value rather than vague praise. Then build a side-by-side comparison across the 6 total brands for the factors a real buyer would care about: approximate price, stroller weight, fold or portability, newborn compatibility, car-seat or travel-system ecosystem, warranty or after-sales support, and any clearly stated safety or recall context from public pages; if a field is not shown, record it as not shown. As you work, keep the most useful evidence tabs open, including the Silver Cross product page, at least 3 independent review pages discussing Silver Cross, at least 2 competitor product pages, and at least 1 public safety or recall-check page so I can verify the reasoning afterward. 
After that, synthesize everything into one clear recommendation that answers three questions: whether Silver Cross seems like a genuinely good brand overall, what kind of buyer it fits best versus who should skip it, and whether I’d be better off buying Silver Cross or one of the compared alternatives for a practical everyday stroller setup.", + "level": "hard", + "reference_length": 6, + "rubrics": { + "R1": { + "requirement": "The browsing session identifies 1 current representative Silver Cross full-size stroller or travel-system option and compares it against at least 5 competing premium or upper-midrange brands, for 6 total brands.", + "verification": "Final comparison includes the Silver Cross model plus at least 5 named competing brands/models drawn from public product pages.", + "weight": 0.18 + }, + "R2": { + "requirement": "For Silver Cross, at least 6 concrete reputation or quality claims are gathered from independent sources, including at least 3 positives and at least 2 meaningful drawbacks.", + "verification": "Final synthesis cites independent review or owner-feedback pages for the Silver Cross claims, and the claims are specific attributes such as build quality, ride quality, fold, weight, fabric quality, durability, customer service, or value.", + "weight": 0.22 + }, + "R3": { + "requirement": "A side-by-side comparison is produced across the 6 total brands for approximate price, stroller weight, fold or portability, newborn compatibility, car-seat or travel-system ecosystem, warranty or after-sales support, and any clearly stated safety or recall context from public pages, using 'not shown' where needed.", + "verification": "The final comparison covers all requested factors for all 6 brands and uses 'not shown' for missing fields rather than omitting them.", + "weight": 0.22 + }, + "R4": { + "requirement": "Useful browser evidence is kept open, including the Silver Cross product page, at least 3 independent review pages discussing Silver Cross, at 
least 2 competitor product pages, and at least 1 public safety or recall-check page.", + "verification": "Open tabs at the end visibly include the requested categories and counts of evidence pages.", + "weight": 0.14 + }, + "R5": { + "requirement": "The final recommendation explicitly answers whether Silver Cross seems like a genuinely good brand overall, what kind of buyer it fits best versus who should skip it, and whether Silver Cross or one of the compared alternatives is the better choice for a practical everyday stroller setup.", + "verification": "Closing synthesis addresses all three decision questions directly and ties the recommendation to the gathered comparison evidence.", + "weight": 0.16 + }, + "R6": { + "requirement": "The work relies on public product pages plus independent reviews and owner feedback rather than brand marketing alone.", + "verification": "The reasoning incorporates both official product information and non-brand public sources, and the Silver Cross brand assessment is not based solely on Silver Cross pages.", + "weight": 0.08 + } + }, + "task_id": "a8a95699fe40ca1439fb714dcba8e01b022b3e6a", + "website": "https://www.silvercrossbaby.com" + }, + { + "confirmed_task": "I’m considering booking easyJet for a future Europe trip, but before I do I want a serious browser-based risk check rather than just a couple of random anecdotes. Please research easyJet complaints across public sources and build me a practical picture of what goes wrong most often and whether the airline’s published support and passenger-rights information actually covers those situations. Start by finding at least 18 distinct recent customer complaints from at least 4 different public sources, and group them into the biggest recurring themes like delays, cancellations, refunds, baggage, check-in, customer service, or anything else that clearly shows up. For each complaint you keep, note the topic, where it was posted, and a short plain-English summary of what happened. 
Then open easyJet’s own public help pages for the main complaint themes you found and compare what the airline says customers should do in those situations, including contact/help options and any refund, disruption, or baggage guidance that is publicly available. After that, check at least 3 independent public passenger-rights or consumer-guidance sources so I can see what travelers may actually be entitled to when flights are disrupted or problems occur. Keep the most useful evidence tabs open from both the complaint sources and the official/help pages so I can inspect them myself. Finally, give me one organized decision memo that covers: the top recurring complaint themes, how often each theme appeared in your 18-plus examples, what easyJet’s own pages say about those issues, where outside guidance seems to support or contradict the practical customer experience, and your bottom-line judgment on whether easyJet looks acceptable for a budget-conscious traveler, risky unless the fare savings are large, or worth avoiding for certain trip types.", + "level": "hard", + "reference_length": 3, + "rubrics": { + "R1": { + "requirement": "The final memo uses at least 18 distinct easyJet customer complaints drawn from at least 4 different public sources.", + "verification": "Count the complaint examples and confirm the cited/publicly named sources total 4 or more distinct sources.", + "weight": 0.18 + }, + "R2": { + "requirement": "Each retained complaint includes the complaint topic, where it was posted, and a short plain-English summary of what happened.", + "verification": "Review the complaint list and confirm every example has all three requested fields.", + "weight": 0.16 + }, + "R3": { + "requirement": "The complaints are grouped into recurring themes, and the final memo reports the top recurring complaint themes with how often each theme appeared in the 18-plus examples.", + "verification": "Check that the memo contains named complaint categories and a frequency 
count for each major theme based on the collected examples.", + "weight": 0.18 + }, + "R4": { + "requirement": "EasyJet’s own public help pages are opened and used for the main complaint themes found, including public contact/help information and any publicly available refund, disruption, or baggage guidance relevant to those themes.", + "verification": "Confirm the final memo references easyJet help content for the major themes and that relevant official pages were opened during the session.", + "weight": 0.16 + }, + "R5": { + "requirement": "At least 3 independent public passenger-rights or consumer-guidance sources are checked and incorporated into the analysis of what travelers may be entitled to when problems occur.", + "verification": "Count the independent non-easyJet guidance sources cited or summarized in the memo and confirm there are at least 3.", + "weight": 0.12 + }, + "R6": { + "requirement": "The most useful evidence tabs are kept open from both complaint sources and official/help pages so the user can inspect the evidence directly.", + "verification": "Inspect the final browser state and confirm that representative complaint-source tabs and easyJet official/help tabs remain open.", + "weight": 0.08 + }, + "R7": { + "requirement": "The final organized decision memo gives a bottom-line judgment on whether easyJet looks acceptable for a budget-conscious traveler, risky unless fare savings are large, or worth avoiding for certain trip types, and it explains that judgment using the complaint patterns and policy/guidance comparison.", + "verification": "Check that the memo ends with one of the requested decision outcomes and that the reasoning explicitly ties back to complaint themes, easyJet’s published pages, and outside guidance.", + "weight": 0.12 + } + }, + "task_id": "a24232e8f6e7e654f09be4e219e692af4fec62a5", + "website": "https://www.easyjet.com" + }, + { + "confirmed_task": "I’m planning a future downtown Pittsburgh stay and I’ll be driving, so I 
don’t just want to know whether the Fairmont Pittsburgh has parking — I want to know whether it’s actually a smart hotel choice once parking and total convenience are factored in. Start with Fairmont Pittsburgh and then compare it against 7 other well-reviewed upscale or upper-upscale hotels in downtown Pittsburgh or the immediately adjacent core, so I end up with 8 total hotels. For each hotel, use the official property site first and record whether parking is on-site, whether it’s valet or self-parking, whether the parking is free or paid, the stated nightly fee if shown, and any publicly stated details like in/out privileges, oversized vehicle limits, EV charging, or nearby garage arrangements; if an official page leaves something unclear, cross-check one public booking or listing page and mark anything still unavailable as \"not shown\" instead of guessing. Then use maps and hotel photo pages to sanity-check the arrival setup for each place — things like whether the entrance and garage situation look straightforward, whether it seems attached or off-site, and how walkable the hotel is to at least 3 downtown anchors: Market Square, PPG Paints Arena, and PNC Park. After that, compare the 8 hotels on driver convenience and likely all-in nightly cost, using one sample future 2-night stay price from a public booking flow or listing page for each hotel when available, and make me a ranked top-5 shortlist for a driver who wants a nice stay without getting killed on parking or ending up with awkward car logistics. 
Keep the Fairmont parking/location page open, keep open the exact parking or hotel details pages for the 3 strongest alternatives, and finish with a concise decision memo that tells me whether Fairmont Pittsburgh looks competitive, overpriced once parking is added, or worth it for the location and overall experience.", + "level": "hard", + "reference_length": 5, + "rubrics": { + "R1": { + "requirement": "The work compares exactly 8 total hotels: Fairmont Pittsburgh plus 7 other well-reviewed upscale or upper-upscale hotels in downtown Pittsburgh or the immediately adjacent core.", + "verification": "The final comparison clearly lists 8 hotels by name, with Fairmont Pittsburgh included, and each hotel is within the stated geographic and quality scope.", + "weight": 0.17 + }, + "R2": { + "requirement": "For each of the 8 hotels, the result records whether parking is on-site, whether it is valet or self-parking, whether it is free or paid, the stated nightly fee if shown, and any publicly stated details such as in/out privileges, oversized vehicle limits, EV charging, or nearby garage arrangements, using the official property site first and marking unresolved items as \"not shown\" if needed.", + "verification": "Each hotel entry contains the requested parking fields, shows that the official property site was used first, and uses \"not shown\" rather than unsupported guesses for missing details.", + "weight": 0.23 + }, + "R3": { + "requirement": "When official pages are unclear, one public booking or listing page is used to cross-check the missing parking or stay details for the affected hotels.", + "verification": "At least the hotels with incomplete official information show a secondary public source check, and the final notes distinguish between official details and cross-checked listing details.", + "weight": 0.12 + }, + "R4": { + "requirement": "Each of the 8 hotels is sanity-checked with maps and hotel photo pages for arrival setup and walkability, including how 
straightforward the entrance/garage situation appears and how walkable the hotel is to Market Square, PPG Paints Arena, and PNC Park.", + "verification": "Every hotel entry includes arrival/logistics observations plus walkability notes covering all 3 named downtown anchors.", + "weight": 0.16 + }, + "R5": { + "requirement": "The comparison includes likely all-in nightly cost for each of the 8 hotels, using one sample future 2-night stay price from a public booking flow or listing page when available, so parking can be evaluated alongside room cost.", + "verification": "Each hotel has a sample stay price or a clearly marked \"not shown\" if unavailable, and the final comparison uses those figures alongside parking terms rather than treating parking in isolation.", + "weight": 0.14 + }, + "R6": { + "requirement": "A ranked top-5 shortlist is produced for a driver who wants a nice stay without excessive parking cost or awkward car logistics, and it explicitly states whether Fairmont Pittsburgh looks competitive, overpriced once parking is added, or worth it for the location and experience.", + "verification": "The final output contains a ranked top 5 and an explicit Fairmont conclusion in one of the requested forms, supported by the comparison findings.", + "weight": 0.12 + }, + "R7": { + "requirement": "The browser is left with the Fairmont parking/location page open and the exact parking or hotel details pages for the 3 strongest alternative hotels also open.", + "verification": "Open tabs at the end visibly include the Fairmont evidence page plus 3 alternative evidence pages that correspond to the final recommendation.", + "weight": 0.06 + } + }, + "task_id": "4523f5c4b7d3a82e73209a447340dd7d46c53907", + "website": "https://www.fairmont.com" + }, + { + "confirmed_task": "I’m trying to turn a dialogue-heavy script into a polished text-to-speech demo with distinct voices for each identifiable character, and before I start paying for anything I want a serious 
browser-based casting and platform check. Start with ElevenLabs’ public site and use only public pages to figure out whether it looks practical for a multi-character script workflow: review the relevant docs or help pages, pricing or plan pages, and any public voice-library or sample pages that help show how character voice selection would work. Then build me a casting shortlist of exactly 12 candidate ElevenLabs voices total, spread across 6 character slots of your choice that would make sense for a typical script cast—for example narrator, lead male, lead female, older character, younger character, and one wildcard or villain role—with 2 candidate voices per slot. For each of the 12, capture the voice name, apparent gender or style if shown, accent or language if shown, the reason it fits that slot, and mark anything missing as not shown. Keep the most useful public voice pages open so I can compare them later.\n\nAfter that, compare ElevenLabs against exactly 3 other public text-to-speech platforms that look relevant for multi-speaker character work. On public pages only, check whether each one appears to support the kinds of things I’d care about for this project: multiple voices, dialogue or long-form narration suitability, emotional or style control if shown, script or project organization features if shown, and pricing or usage limits if shown. I do not need private trials or sign-ins; if something is unclear, just say not shown. Keep the key pricing or feature pages open for the strongest alternatives.\n\nFinally, put everything into one organized decision memo or document I could actually use before opening an account: a short recommendation on whether ElevenLabs is the best choice for this kind of script, the 6 role slots with the 12-voice shortlist, a side-by-side comparison of the 4 total platforms, the biggest risks or unknowns from public information, and a simple first-pass voice assignment recommendation for the 6 roles. 
Leave the finished memo open at the end, and also leave open the most useful ElevenLabs voice pages, the most relevant ElevenLabs pricing/docs pages, and the best comparison pages from the alternative platforms.", + "level": "hard", + "reference_length": 6, + "rubrics": { + "R1": { + "requirement": "The final memo concludes whether ElevenLabs appears practical for a multi-character script workflow using public evidence from its docs/help pages, pricing/plan pages, and relevant public voice or sample pages.", + "verification": "Check that the memo includes a clear recommendation about ElevenLabs and cites or is visibly grounded in opened public ElevenLabs documentation/help, pricing, and voice/sample pages.", + "weight": 0.18 + }, + "R2": { + "requirement": "The memo includes exactly 6 character slots and exactly 12 total ElevenLabs voice candidates, with 2 candidate voices per slot.", + "verification": "Count the role slots and voice entries in the memo and confirm the 6-by-2 structure is followed exactly.", + "weight": 0.2 + }, + "R3": { + "requirement": "Each of the 12 ElevenLabs voice candidates includes the voice name, apparent gender or style if shown, accent or language if shown, a fit rationale for the slot, and 'not shown' where information is missing.", + "verification": "Review each voice entry in the memo and confirm all requested fields are present, using 'not shown' where needed rather than inventing details.", + "weight": 0.16 + }, + "R4": { + "requirement": "The most useful public ElevenLabs voice pages are kept open so the shortlisted voices can be compared later.", + "verification": "Inspect the remaining browser tabs and confirm relevant public ElevenLabs voice pages for shortlisted candidates are still open.", + "weight": 0.12 + }, + "R5": { + "requirement": "ElevenLabs is compared against exactly 3 other public text-to-speech platforms, for 4 total platforms, on the requested dimensions: multiple voices, dialogue or long-form narration 
suitability, emotional or style control if shown, script or project organization features if shown, and pricing or usage limits if shown.", + "verification": "Check the comparison section of the memo and confirm there are exactly 4 platforms total and that each requested comparison dimension is addressed or marked not shown.", + "weight": 0.18 + }, + "R6": { + "requirement": "The key pricing or feature pages for the strongest alternative platforms, along with the most relevant ElevenLabs pricing/docs pages, are left open as browser evidence.", + "verification": "Inspect the open tabs and confirm that the memo’s recommended or strongest comparison pages remain open for ElevenLabs and the alternative platforms.", + "weight": 0.08 + }, + "R7": { + "requirement": "The finished memo is left open and includes a short overall recommendation, the 6-role casting shortlist, the 4-platform comparison, the biggest risks or unknowns from public information, and a simple first-pass voice assignment recommendation for the 6 roles.", + "verification": "Open the final memo and confirm all five requested sections are present and populated.", + "weight": 0.08 + } + }, + "task_id": "86cc69e23296a471c4e9e3da30d63ff54f31665f", + "website": "https://elevenlabs.io" + }, + { + "confirmed_task": "I want help turning a cheap-flights idea into a real decision for a future 7-night holiday in late November from the UK. Please start on Skyscanner and do a broad search from the UK, using a UK-wide departure search if the site supports it or else checking 4 major departure airports: London, Manchester, Birmingham, and Edinburgh. Scan exactly 16 destination candidates for a 7-night round-trip economy trip in late November, then narrow them to exactly 4 destinations that look like genuinely good holiday options, not just random cheap fares. 
For each of the 4 finalists, keep the Skyscanner results page open and record the destination city and airport, the departure airport used, the lowest displayed round-trip price, and any obvious routing drawback shown on the results page. Please treat a fare of £250 or less as strong value, and if none of the best options for a finalist are under £250, note that clearly instead of forcing it. Then sanity-check each finalist on public pages by opening 2 lodging options per destination and using map/photos to judge whether a full week there looks realistic on a moderate budget, with a target of no more than about £140 per night if possible; if that is not available, say so. Also open 1 reliable weather or climate page per finalist so I can see what late November is likely to feel like. After that, compare the 4 finalists side by side on flight cost, lodging realism, likely weather, and the kind of trip each suits best—beach, city break, nature, or mixed sightseeing. Finish by recommending exactly 1 best overall choice, 1 best budget pick, and 1 best warm-weather option, and leave the 4 Skyscanner tabs plus the most useful hotel and weather tabs open so I can review the evidence.", + "level": "hard", + "reference_length": 7, + "rubrics": { + "R1": { + "requirement": "The session performs a broad Skyscanner discovery sweep covering exactly 16 destination candidates for a 7-night late-November round-trip economy trip from the UK, using a UK-wide departure search if supported or else the 4 specified departure airports: London, Manchester, Birmingham, and Edinburgh.", + "verification": "The final output states that exactly 16 candidates were scanned and indicates whether the search used a UK-wide departure option or the 4 named airports, with browser evidence showing a broad discovery process rather than only four destinations.", + "weight": 0.18 + }, + "R2": { + "requirement": "Exactly 4 finalist destinations are selected, and for each finalist the result includes the 
destination city and airport, the departure airport used, the lowest displayed round-trip price, and any obvious routing drawback shown on the Skyscanner results page.", + "verification": "The final comparison lists exactly four finalists with all requested flight fields, and the corresponding four Skyscanner results pages remain open as evidence.", + "weight": 0.24 + }, + "R3": { + "requirement": "The flight assessment explicitly applies the stated value threshold by identifying whether each finalist has a fare of £250 or less, or clearly noting when the best option for that finalist is above £250.", + "verification": "Each of the four finalist entries includes a clear value note tied to the £250 threshold, with no missing assessment.", + "weight": 0.12 + }, + "R4": { + "requirement": "Each of the 4 finalists is sanity-checked with exactly 2 public lodging options that provide enough map and/or photo context to judge whether a full week there looks realistic on a moderate budget, using the target of about £140 per night if possible and stating when that target is not achievable.", + "verification": "The final output includes two lodging checks for each finalist with nightly-price context and a clear note on whether the roughly £140 per night target appears feasible, supported by relevant hotel or lodging tabs.", + "weight": 0.17 + }, + "R5": { + "requirement": "Each of the 4 finalists is also checked against exactly 1 reliable public weather or climate page for late November.", + "verification": "The final comparison includes late-November weather or climate context for all four finalists, with one supporting weather or climate source referenced for each and useful tabs left open.", + "weight": 0.11 + }, + "R6": { + "requirement": "The 4 finalists are compared side by side on the four requested decision factors: flight cost, lodging realism, likely weather, and whether the destination is best suited to beach, city break, nature, or mixed sightseeing.", + 
"verification": "The final synthesis presents a structured four-way comparison that covers all four factors for every finalist, not just separate destination notes.", + "weight": 0.1 + }, + "R7": { + "requirement": "The session ends with exactly 1 best overall choice, 1 best budget pick, and 1 best warm-weather option, and leaves open the 4 Skyscanner finalist tabs plus the most useful hotel and weather tabs for review.", + "verification": "The final recommendation names all three requested picks, and the browser state shows the four finalist flight tabs and selected lodging/weather evidence tabs still open.", + "weight": 0.08 + } + }, + "task_id": "45408d6942b6c26bd37b4675e183993602b940bb", + "website": "https://www.skyscanner.net" + }, + { + "confirmed_task": "I need a serious browser-based weekly briefing on Brazil’s finance and macroeconomic news, not just a quick skim of one portal. Please use public pages to find and compare at least 18 finance-related items published within the past 7 days from a mix of major Brazilian news outlets and official sources such as the central bank, finance ministry, statistics agency, stock exchange, or major banks’ research pages whenever relevant. For each item, capture the headline, publication date, source, link, and a short English summary or translation of the key point. Deduplicate overlapping coverage, then narrow the set to the 10 most important developments and group them into the major themes driving the week, such as inflation, rates, fiscal policy, currency, trade, markets, commodities, regulation, or corporate finance. For each of those 10, explain why it matters inside Brazil and whether it could plausibly affect the United States through trade, commodities, capital flows, inflation, supply chains, multinational earnings, or broader market sentiment; if there is no clear U.S. angle, say so. 
Keep the most useful evidence tabs open, including at least 6 source pages that show the strongest or most representative stories and at least 2 official-source pages used for verification. Then produce one organized briefing document in CryptoPad Documents with three parts: a source log of the 18+ items, a ranked top-10 summary in English, and a final section that highlights the 3 to 5 developments most likely to matter to a U.S.-based investor, policymaker, or business reader. ", + "level": "hard", + "reference_length": 6, + "rubrics": { + "R1": { + "requirement": "A briefing document is created and organized into the three requested parts: a source log of at least 18 finance-related items from the past 7 days, a ranked top-10 summary in English, and a final section highlighting the 3 to 5 developments most likely to matter to a U.S.-based investor, policymaker, or business reader, and the briefing is created in CryptoPad Documents and left open for review.", + "verification": "Check that the final document exists, is left accessible, and contains all three sections with the requested counts and structure; the open deliverable is a CryptoPad Document containing the requested three-part briefing structure.", + "weight": 0.2 + }, + "R2": { + "requirement": "The source log includes at least 18 qualifying items from the past 7 days drawn from a mix of major Brazilian news outlets and official sources, and each item records the headline, publication date, source, link, and a short English summary or translation.", + "verification": "Review the source log entries for count, date window, source diversity, and the presence of all required fields for each item.", + "weight": 0.2 + }, + "R3": { + "requirement": "Overlapping coverage is deduplicated and the research is narrowed to exactly 10 most important developments, grouped into the major themes requested such as inflation, rates, fiscal policy, currency, trade, markets, commodities, regulation, or corporate finance.", 
+ "verification": "Inspect the ranked top-10 section to confirm there are exactly 10 developments, that duplicates are consolidated, and that the items are grouped into coherent major themes.", + "weight": 0.16 + }, + "R4": { + "requirement": "Each of the 10 most important developments includes an English explanation of why it matters inside Brazil and whether it could plausibly affect the United States through trade, commodities, capital flows, inflation, supply chains, multinational earnings, or broader market sentiment, with 'no clear U.S. angle' stated when appropriate.", + "verification": "Check each top-10 entry for both a Brazil significance note and a U.S.-impact assessment aligned to the requested channels or an explicit statement that no clear U.S. angle exists.", + "weight": 0.18 + }, + "R5": { + "requirement": "The most useful evidence tabs are kept open, including at least 6 source pages showing the strongest or most representative stories and at least 2 official-source pages used for verification.", + "verification": "Confirm that the browser still has open tabs meeting the requested minimums for representative source pages and official verification pages.", + "weight": 0.12 + }, + "R6": { + "requirement": "The final section clearly identifies 3 to 5 developments most likely to matter to a U.S.-based investor, policymaker, or business reader and explains why those were prioritized over the other stories.", + "verification": "Review the final section for a shortlist of 3 to 5 items with explicit prioritization reasoning tied to U.S.-based readers.", + "weight": 0.14 + } + }, + "task_id": "985d8f98bc40053112f164b8d6dd010db1dcfd10", + "website": "https://www.globo.com" + }, + { + "confirmed_task": "I’m trying to figure out the best 5-day camper-van road trip in Iceland for a future trip, and I don’t want a generic blog itinerary. 
Please use public sources to compare the most realistic 5-day options first, then build one final route that focuses on the famous highlights but is still reasonable to drive in a camper van. Start by checking a campground source like Tjalda plus maps and a few reputable Iceland travel pages so you can decide whether this should be a South Coast-heavy loop, a Golden Circle plus South Coast plan, or another clearly better 5-day route. Once you’ve chosen the best route, make me a day-by-day itinerary for exactly 5 days with the driving sequence, estimated driving time for each day, and the main stops and attractions in order. Include at least 12 total sightseeing stops across the trip, but keep the plan realistic rather than cramming everything in. For each night, verify one legal campground option on a public page and note the location plus any obvious camper-relevant details like showers, power, season, or check-in rules; if a detail is missing, say not shown. Also flag any day that looks especially weather- or road-sensitive and give 2 backup substitutions for the whole trip in case conditions are poor. Please keep the key evidence visible in the browser by leaving open the final map view for the chosen route, the 5 campground pages you relied on, and 3 especially useful attraction or planning pages with photos or visitor info so I can sanity-check the plan afterward. 
End with a concise recommendation explaining why this is the best 5-day camper-van version of Iceland rather than trying to overreach into an unrealistic Ring Road sprint.", + "level": "hard", + "reference_length": 21, + "rubrics": { + "R1": { + "requirement": "The final plan chooses one clear 5-day Iceland camper-van route after comparing the most realistic route shapes and explains why that route was selected.", + "verification": "Check that the final answer explicitly compares plausible 5-day route options and then names one chosen route with a short justification tied to camper-van practicality and famous highlights.", + "weight": 0.18 + }, + "R2": { + "requirement": "The itinerary covers exactly 5 days and gives the driving sequence, estimated driving time for each day, and the main stops and attractions in order.", + "verification": "Check that there are exactly 5 day sections and that each includes route order plus an estimated driving time and ordered stops.", + "weight": 0.2 + }, + "R3": { + "requirement": "The itinerary includes at least 12 total sightseeing stops across the trip while keeping the plan realistic.", + "verification": "Count the named sightseeing stops across all 5 days and confirm there are 12 or more, with no sign that the route contradicts the stated driving feasibility.", + "weight": 0.14 + }, + "R4": { + "requirement": "Each of the 5 nights includes one verified legal campground option from a public page, with location and any obvious camper-relevant details such as showers, power, season, or check-in rules; missing details are marked not shown.", + "verification": "Check that there are 5 overnight campground entries, each tied to a public campground page and containing the requested details or an explicit not shown note.", + "weight": 0.2 + }, + "R5": { + "requirement": "The plan flags any especially weather- or road-sensitive day and provides 2 backup substitutions for the trip in case conditions are poor.", + "verification": "Check 
that at least one sensitive day is identified when applicable and that exactly 2 backup substitutions are included and clearly connected to the itinerary.", + "weight": 0.12 + }, + "R6": { + "requirement": "Key browser evidence is left open: the final map view for the chosen route, the 5 campground pages used, and 3 useful attraction or planning pages with photos or visitor information.", + "verification": "Check open tabs or end-state evidence for 1 route map, 5 campground pages, and 3 attraction/planning pages that match the itinerary.", + "weight": 0.16 + } + }, + "task_id": "a5a6aae50363919e5500634aa0f97211c3d06feb", + "website": "https://tjalda.is" + }, + { + "confirmed_task": "I’m seriously thinking about buying a robot vacuum soon, and I don’t just want three random YouTube picks — I want a browser-based shortlist I could actually use to choose the right one for 2026. Please start on YouTube and find 6 strong video reviews or roundups about the best robot vacuums for 2026, with at least 3 from channels that appear to do hands-on testing rather than generic affiliate slideshows. From those videos, pull out the models that come up most often and build a candidate set of exactly 9 robot vacuums across 3 price tiers: 3 budget, 3 mid-range, and 3 premium. Then verify each candidate on its official product page and on at least one major retailer listing so I can see that it’s a real, current product with live pricing or 'not shown' if the price isn’t visible. For each of the 9, compare the things I’d actually care about for a normal home with mixed floors and some hair pickup: vacuuming performance, mopping approach if it has one, obstacle avoidance, auto-empty or dock features, battery/runtime claims, and whether replacement parts or consumables are easy to find on public pages. After that, narrow the 9 down to a final shortlist of 4 models: best overall, best value, best for pet hair, and best premium splurge. 
Keep the most useful evidence visible in the browser by leaving open the 3 strongest YouTube review tabs, the 4 final product pages, and at least 2 retailer pages for the finalists so I can sanity-check prices and photos myself. Can you start by playing the YouTube video for the best rated one? End with a concise recommendation that tells me which single model you’d buy if you were optimizing for value, and which one you’d buy if money were less important.", + "level": "hard", + "reference_length": 3, + "rubrics": { + "R1": { + "requirement": "The browsing session identifies exactly 6 YouTube video reviews or roundups about the best robot vacuums for 2026, with at least 3 coming from channels that appear to do hands-on testing rather than generic affiliate-style compilations.", + "verification": "Check that 6 distinct YouTube videos were selected and that the notes or final synthesis explicitly indicate which 3 or more were judged to be hands-on-testing sources.", + "weight": 0.16 + }, + "R2": { + "requirement": "A candidate set of exactly 9 robot vacuums is built from the video findings, split into 3 budget, 3 mid-range, and 3 premium models.", + "verification": "Check that the final comparison includes exactly 9 models and that each is assigned to one of the three requested price tiers with 3 models per tier.", + "weight": 0.17 + }, + "R3": { + "requirement": "Each of the 9 candidates is verified on its official product page and on at least one major retailer listing, with live pricing captured where visible or marked 'not shown' when missing.", + "verification": "Check that every one of the 9 models has both an official-page reference and a retailer-page reference, and that the pricing field is filled with a visible price or the literal fallback 'not shown'.", + "weight": 0.19 + }, + "R4": { + "requirement": "For each of the 9 robot vacuums, the comparison covers vacuuming performance, mopping approach if present, obstacle avoidance, auto-empty or dock features, 
battery/runtime claims, and replacement-parts or consumables availability on public pages.", + "verification": "Check that all 9 entries include all six requested comparison dimensions, allowing 'not shown' only where a public page does not provide the information.", + "weight": 0.19 + }, + "R5": { + "requirement": "The 9-model comparison is narrowed to a final shortlist of exactly 4 models labeled best overall, best value, best for pet hair, and best premium splurge.", + "verification": "Check that the final output contains exactly 4 finalists and that each is explicitly assigned to one of the four requested recommendation categories.", + "weight": 0.14 + }, + "R6": { + "requirement": "Useful browser evidence is left open at the end: the 3 strongest YouTube review tabs, the 4 final product pages, and at least 2 retailer pages for the finalists, and the best-rated selected YouTube review video is started playing at the beginning of the task.", + "verification": "Confirm that the specified tabs remain open and correspond to the chosen 3 review videos, 4 finalist official product pages, and 2 or more retailer listings for those finalists; one of the chosen YouTube evidence tabs is visibly the selected best-rated review and is playing.", + "weight": 0.15 + } + }, + "task_id": "148dc4d3bebc57698033c4189935c3e5be4f38c1", + "website": "https://www.youtube.com" + }, + { + "confirmed_task": "I want to stop treating chess like a vague someday hobby and build myself a real beginner on-ramp that I could actually follow over the next month. Please use Lichess as the core hands-on tool, but don’t stay trapped on one site if a broader public-web comparison would help me learn more intelligently. 
Start by finding and comparing at least 8 beginner-friendly chess learning resources across public pages, with at least 3 of them coming from Lichess features or study pages and the rest coming from other reputable public resources like beginner lesson hubs, video-based instruction, or structured practice pages. For each one, figure out what it actually teaches a true novice, how interactive it is, and whether it seems more useful for rules, tactics, opening principles, endgames, or guided practice. Then on Lichess, actually complete the most foundational beginner material needed to understand legal moves, check and checkmate, basic tactics, and simple game play, and keep the key lesson or practice tabs open so I can see exactly what was used. After that, play exactly 3 practice games on public chess tools: 2 against the computer at beginner-friendly strength and 1 full game against a human if a public no-signup route is available, otherwise use a third computer game and note that the human option was not shown. While doing that, pay attention to the kinds of mistakes a novice is likely to make and use that to decide what I should practice first. Finally, give me one organized beginner study plan for my first 4 weeks that includes exactly 12 concrete practice items: 4 interactive lesson modules, 3 puzzle or tactics activities, 2 annotated game-watching resources, 2 practice-game habits to use every time I play, and 1 very simple opening principle guide. 
Recommend the best 5 tabs for me to keep open as my starter kit, leave those useful pages open, and explain why those 5 made the cut.", + "level": "hard", + "reference_length": 8, + "rubrics": { + "R1": { + "requirement": "At least 8 beginner-friendly chess learning resources are compared on public pages, with at least 3 from Lichess and the rest from other reputable public resources, and each resource is evaluated for what it teaches and how useful it is for a true novice.", + "verification": "The final response lists 8 or more named resources, identifies which are from Lichess, and records for each the learning focus such as rules, tactics, opening principles, endgames, or guided practice, plus a brief usefulness judgment.", + "weight": 0.18 + }, + "R2": { + "requirement": "The browsing session actually uses Lichess as the core hands-on tool by completing the foundational beginner material needed for legal moves, check and checkmate, basic tactics, and simple game play, with the key lesson or practice tabs kept open.", + "verification": "The final response names the Lichess lessons or practice sections used for those four beginner topics and indicates that the key Lichess lesson/practice pages remain open.", + "weight": 0.2 + }, + "R3": { + "requirement": "Exactly 3 practice games are played on public chess tools: 2 against the computer at beginner-friendly strength and 1 full human game if a public no-signup option is available, otherwise a third computer game with the limitation noted.", + "verification": "The final response reports exactly 3 practice games, specifies opponent type and beginner-friendly computer use, and clearly notes whether the human game was completed or replaced because a public no-signup route was not shown.", + "weight": 0.17 + }, + "R4": { + "requirement": "The practice-game phase is used diagnostically by identifying the kinds of novice mistakes that showed up and using those observations to decide what should be practiced first.", + 
"verification": "The final response includes specific mistake patterns observed from the games and ties them to prioritized next-step practice recommendations.", + "weight": 0.14 + }, + "R5": { + "requirement": "One organized 4-week beginner study plan is produced with exactly 12 concrete practice items: 4 interactive lesson modules, 3 puzzle or tactics activities, 2 annotated game-watching resources, 2 practice-game habits, and 1 simple opening principle guide.", + "verification": "The final plan is structured over 4 weeks and contains exactly 12 items in the requested category counts with no missing or extra items.", + "weight": 0.19 + }, + "R6": { + "requirement": "The session ends with the best 5 tabs kept open as a starter kit, and the final response explains why those 5 pages were chosen.", + "verification": "The final response identifies exactly 5 kept-open tabs and gives a reason for each that matches its role in the beginner learning path.", + "weight": 0.12 + } + }, + "task_id": "407381583456934981fbca0f1b91e4fa8a0883b2", + "website": "https://lichess.org" + }, + { + "confirmed_task": "I want to buy a starter tool kit soon, but I don’t want to get fooled by inflated piece counts or end up with a case full of filler. Please do a serious browser-based comparison of 12 all-in-one tool kits from at least 4 major brands across Lowe’s and other big public retailers, with a target budget of about $75 to $200, aimed at someone who needs one kit that can realistically handle apartment setup, basic home fixes, furniture assembly, and light car or bike work. For each kit, use the actual product pages and any available manuals or contents lists to note the real included tools, not just the headline piece count, and call out obvious gaps like missing adjustable wrench, locking pliers, socket depth variety, precision screwdrivers, hex keys, tape measure, or utility knife. 
Also compare warranty terms, case organization, whether replacement tools are easy to find in that brand ecosystem, current listed price, and whether the page suggests shipping or store pickup. Keep the strongest 4 product tabs open, plus at least 2 warranty or official brand-support pages that helped verify the choice. Then recommend exactly 3 finalists: best overall, best budget, and best compact kit, with a short explanation of who each is for. After that, go to Lowe’s and find the best Lowe’s-listed option that matches your top recommendation as closely as possible, add it to the cart, and leave the Lowe’s cart open along with the chosen product page and the key comparison tabs so I can review everything before deciding whether to check out.", + "level": "hard", + "reference_length": 5, + "rubrics": { + "R1": { + "requirement": "The session compares exactly 12 all-in-one tool kits from at least 4 major brands using Lowe’s and other major public retailer product pages, within the stated target budget range of about $75 to $200.", + "verification": "Evidence in open product tabs and the final synthesis shows 12 kits total, at least 4 brands, and recorded listed prices within or near the requested budget frame.", + "weight": 0.18 + }, + "R2": { + "requirement": "For each of the 12 kits, the comparison records the real included tools from product pages and/or manuals or contents lists, and explicitly calls out obvious gaps such as missing adjustable wrench, locking pliers, socket depth variety, precision screwdrivers, hex keys, tape measure, or utility knife.", + "verification": "The final comparison contains per-kit contents notes and gap callouts grounded in official listings or manuals/contents pages.", + "weight": 0.22 + }, + "R3": { + "requirement": "The comparison also covers warranty terms, case organization, replacement-tool ecosystem, current listed price, and whether shipping or store pickup appears available for the kits reviewed.", + "verification": 
"The final synthesis includes those fields for the compared kits, with supporting retailer and brand-support pages open where used.", + "weight": 0.17 + }, + "R4": { + "requirement": "Exactly 3 finalists are recommended: best overall, best budget, and best compact kit, each with a short explanation of who it is for.", + "verification": "The final recommendation section contains exactly three labeled finalists matching the requested categories and explanations.", + "weight": 0.14 + }, + "R5": { + "requirement": "The strongest 4 product tabs remain open, along with at least 2 warranty or official brand-support pages that were used to verify the choice.", + "verification": "Open browser tabs include four finalist product pages and two relevant warranty/support pages tied to the comparison.", + "weight": 0.14 + }, + "R6": { + "requirement": "A Lowe’s-listed option that matches the top recommendation as closely as possible is identified, added to the Lowe’s cart, and the Lowe’s cart and chosen Lowe’s product page are left open at the end.", + "verification": "The final browser state shows the selected Lowe’s product page and a Lowe’s cart containing the chosen kit.", + "weight": 0.15 + } + }, + "task_id": "5bca23145f1a3e0eaf5d207c4fe3eb4275707ecc", + "website": "https://www.lowes.com" + }, + { + "confirmed_task": "I’m thinking seriously about applying to Indiana University for graduate school, but I don’t want a one-program skim or a dead-end application portal. Please do a thorough public-web sweep across Indiana University’s official graduate program and department pages and help me narrow this down into a real shortlist I could use. I want you to identify 12 distinct IU graduate programs that are still publicly described on official pages, spread across at least 4 different schools or departments and, if possible, more than one IU campus. 
For each of the 12 programs, capture the degree type, the school or department, the campus, the main focus or specialization area, the published admissions requirements, the application deadline or note 'not shown' if it isn’t clearly posted, and any obvious funding or assistantship information if it is publicly listed. Then go one level deeper on the 6 strongest options by opening their program pages plus the most relevant admissions or deadline pages in separate tabs, and also check faculty, research, or curriculum pages so I can see what makes each program meaningfully different. After that, recommend the best 3 programs for a broadly motivated applicant who wants a strong academic fit, clear admissions information, and a realistic path to applying in the next cycle. In your final summary, explain why those 3 rose to the top, note any missing or ambiguous information that would require a direct inquiry, and leave the most useful official evidence tabs open for the final 6 programs so I can review them myself.", + "level": "hard", + "reference_length": 8, + "rubrics": { + "R1": { + "requirement": "Exactly 12 distinct Indiana University graduate programs are identified from official public IU pages, covering at least 4 different schools or departments and, where publicly available, more than one IU campus.", + "verification": "Check the final comparison to confirm there are 12 unique programs, each tied to an official IU source, and that the set spans at least 4 schools or departments; campus labels are recorded where shown.", + "weight": 0.18 + }, + "R2": { + "requirement": "For each of the 12 programs, the summary includes the degree type, school or department, campus, main focus or specialization area, published admissions requirements, application deadline or 'not shown,' and any publicly listed funding or assistantship information.", + "verification": "Review the final program-by-program summary and confirm that each of the requested fields is filled in or 
marked 'not shown' when absent on the public pages.", + "weight": 0.22 + }, + "R3": { + "requirement": "The 6 strongest options are investigated more deeply using separate official program pages plus the most relevant admissions or deadline pages, with faculty, research, or curriculum pages also consulted for each of those 6 programs.", + "verification": "Confirm that browser evidence exists for all 6 deeper-review programs and that each one has both core program/admissions evidence and at least one faculty, research, or curriculum page informing the comparison.", + "weight": 0.17 + }, + "R4": { + "requirement": "The final comparison clearly explains what meaningfully differentiates the 6 deeper-review programs using faculty, research, or curriculum evidence rather than only repeating generic catalog descriptions.", + "verification": "Inspect the write-up for the 6 deeper-review programs and confirm that each has at least one concrete differentiator grounded in the consulted official pages.", + "weight": 0.15 + }, + "R5": { + "requirement": "A final recommendation names exactly 3 programs as the best options for a broadly motivated applicant seeking strong academic fit, clear admissions information, and a realistic path to applying in the next cycle.", + "verification": "Check that exactly 3 programs are recommended and that the reasoning for each explicitly addresses academic fit, clarity of admissions information, and application realism.", + "weight": 0.14 + }, + "R6": { + "requirement": "The final summary explicitly notes any missing, unclear, or ambiguous information that would require direct follow-up with the program or admissions office.", + "verification": "Confirm that the final summary contains a dedicated note on missing or ambiguous items, tied to the relevant programs rather than omitted silently.", + "weight": 0.07 + }, + "R7": { + "requirement": "The most useful official evidence tabs for the final 6 programs are left open at the end so the user can 
review the program and admissions/deadline pages directly.", + "verification": "Check the final browser state for open official IU tabs corresponding to the 6 deeper-review programs, including their key program and admissions/deadline evidence pages.", + "weight": 0.07 + } + }, + "task_id": "1d20e51c4aa23eb68d37f1eaea56431140ae53c9", + "website": "https://iugraduate2025.cas.myliaison.com" + }, + { + "confirmed_task": "I’m organizing a 16-player golf day and I don’t just want a quick guess at the teams — I want you to use the browser to help me build a fair setup I could actually use. Please research reputable public golf sources to compare at least 6 different pages covering 4-person competition formats, handicap-allocation methods, and common scoring or tie-break rules for casual tournaments or outings. Focus on formats that could work cleanly for exactly four teams of four, and compare things like whether the format rewards low handicaps too heavily, how much handicap adjustment is usually recommended, and whether the scoring is simple enough for one-day recreational play. Then choose the single best format for this group and explain why it fits better than the alternatives. After that, use this exact player list to create exactly 4 teams of 4: Cody: 1, C: 3, Greg: 4, Johnny: 4, Dave: 4, Sal: 4, Lane: 5, Steve: 10, Christian: 11, Carl: 11, Rawley: 11, Gary: 13, Albert: 14, Eric: 16, Doric: 17, Marcus: 18. For the final team recommendation, show each team’s players, total handicap, average handicap, and handicap spread, and make the teams as balanced as practical for the chosen format. Also recommend one clear scoring method, one handicap allowance rule, and one tie-break procedure based on the public sources you found. Keep the most useful rules and format-comparison pages open in separate tabs — including the chosen format source and at least 2 credible alternative-format pages — so I can review the evidence afterward. 
After this, can you help me find a public golf course near Cleveland where this would be possible? Find 5 possible candidates, then open them up in separate tabs. When this is done, write up a CryptoPad document with the teams and the possible golf course candidates, ranked by feasibility and quality.", + "level": "hard", + "reference_length": 15, + "rubrics": { + "R1": { + "requirement": "At least 6 reputable public golf pages are researched and compared for 4-person competition formats, handicap-allocation methods, and common scoring or tie-break rules relevant to a 16-player outing.", + "verification": "The final output cites or clearly uses findings from 6 or more public golf rules, format, or tournament-guidance pages.", + "weight": 0.12 + }, + "R2": { + "requirement": "The format comparison focuses on options that work cleanly for exactly four teams of four and discusses fairness across handicap levels, handicap-adjustment approach, and scoring simplicity for one-day recreational play.", + "verification": "The comparison explicitly addresses format fit for four teams of four and covers the requested fairness, handicap, and simplicity tradeoffs.", + "weight": 0.14 + }, + "R3": { + "requirement": "One single best format is chosen for this exact group and is explained as a better fit than the alternatives reviewed.", + "verification": "The final recommendation names one format and gives a specific why-this-one explanation tied to the compared alternatives.", + "weight": 0.12 + }, + "R4": { + "requirement": "Using the exact provided player list, the final recommendation creates exactly 4 teams of 4 and shows each team’s players, total handicap, average handicap, and handicap spread, with teams balanced as practically as possible for the chosen format.", + "verification": "All 16 named players appear exactly once across 4 teams, and each team entry includes the required handicap metrics.", + "weight": 0.18 + }, + "R5": { + "requirement": "The final recommendation 
includes one clear scoring method, one handicap allowance rule, and one tie-break procedure based on the public-source research, and the key format-comparison pages remain open, including the chosen format source and at least 2 credible alternatives.", + "verification": "The closing recommendation specifies the requested rules and the browser keeps open the chosen-format page plus at least 2 alternative-format evidence pages.", + "weight": 0.14 + }, + "R6": { + "requirement": "The browsing session identifies exactly 5 public golf course candidates near Cleveland where the outing looks plausible, opens those candidate pages in separate tabs, and compares them for feasibility and quality.", + "verification": "There are 5 distinct public golf course candidates near Cleveland in the final output, and their pages remain open in separate tabs with comparative feasibility/quality notes.", + "weight": 0.14 + }, + "R7": { + "requirement": "A CryptoPad Document is created that includes the final team setup plus the 5 golf-course candidates ranked by feasibility and quality.", + "verification": "The open CryptoPad Document contains both the team recommendations and a ranked course-candidate section using the requested feasibility/quality framing.", + "weight": 0.16 + } + }, + "task_id": "591e86ed962faf5eddbee560c99b020b1c835aaf", + "website": "https://ww3.unipark.de" + }, + { + "confirmed_task": "I’m trying to choose a consumer DNA test primarily for health predisposition insights, not just ancestry, and I want you to do a serious browser-based comparison so I can make one confident purchase instead of guessing from Amazon listings. Start by identifying 6 to 8 consumer DNA kits that publicly advertise health-related genetic reporting or health predisposition information, using official company product pages first and expanding beyond Amazon if that gives a clearer picture. 
For each kit, check the official product page and the company’s help, FAQ, or report-description pages to record exactly what kind of health information is included, such as predisposition reports, carrier status, wellness traits, pharmacogenomics, or anything else relevant; if something is not shown clearly, mark it as not shown rather than assuming. Then verify the medical framing on public authoritative pages where useful, like FDA or major health-system guidance pages, so the comparison distinguishes between true consumer risk reports and broader wellness-style claims. I also want the practical decision details that matter before buying: list price, whether there is a subscription or paid add-on, sample type, stated turnaround time, whether raw data download is offered, whether data deletion is described, and whether the company says customer data may be used for research or shared with partners. Open and keep the most important evidence tabs available as you work: at least 4 official kit pages, at least 3 privacy or data-control pages from different companies, and 2 authoritative public guidance pages that help interpret what these tests can and cannot tell me. After comparing everything, narrow it to the best 3 options for someone whose top priority is meaningful health predisposition reporting with reasonable privacy controls, and explain the tradeoffs between them. End by recommending exactly 1 kit as the best overall choice, 1 runner-up for stronger privacy sensitivity, and 1 budget-conscious alternative, citing which health-predisposition-related features each one actually offers and clearly noting any important limitations or not-shown details. Leave the final recommendation pages and the most useful comparison evidence tabs open. 
Create a presentation for me to read your analysis in on CryptoPad Presentations.", + "level": "hard", + "reference_length": 9, + "rubrics": { + "R1": { + "requirement": "The browsing session identifies 6 to 8 consumer DNA kits that publicly advertise health-related genetic reporting or health predisposition information, using official company product pages as the core evidence.", + "verification": "Check that the final output names 6 to 8 kits and that each is supported by an official public product page or equivalent official kit page visited during the session.", + "weight": 0.18 + }, + "R2": { + "requirement": "For each of the 6 to 8 kits, the comparison records what health-related information is included, such as predisposition reports, carrier status, wellness traits, pharmacogenomics, or other relevant report categories, with 'not shown' used where the public pages do not make this clear.", + "verification": "Review the final comparison and confirm every kit has report-scope details filled in from official product/help/report-description pages, with no unsupported assumptions where details are missing.", + "weight": 0.2 + }, + "R3": { + "requirement": "The final comparison includes the practical pre-purchase details explicitly requested for each kit: list price, subscription or paid add-on status, sample type, stated turnaround time, raw data download availability, whether data deletion is described, and whether customer data may be used for research or shared with partners.", + "verification": "Check that each kit entry contains all requested decision fields or 'not shown' where needed, based on public company pages such as product, FAQ, terms, or privacy pages.", + "weight": 0.2 + }, + "R4": { + "requirement": "The session uses authoritative public guidance where useful to distinguish more meaningful health predisposition reporting from broader wellness-style claims, including 2 authoritative public guidance pages kept as evidence.", + "verification": 
"Confirm that 2 authoritative public guidance pages were opened and that the final write-up uses them to explain limits or interpretation differences relevant to consumer genetic health testing.", + "weight": 0.12 + }, + "R5": { + "requirement": "Key browser evidence remains visible: at least 4 official kit pages, at least 3 privacy or data-control pages from different companies, and 2 authoritative public guidance pages are opened and kept available.", + "verification": "Inspect the open tabs or recorded evidence set and confirm the required counts and source types are present and correspond to the products discussed.", + "weight": 0.12 + }, + "R6": { + "requirement": "The work narrows the field to the best 3 options for someone prioritizing meaningful health predisposition reporting with reasonable privacy controls, and explains the tradeoffs between those 3 options.", + "verification": "Check that exactly 3 finalists are presented and that each finalist includes a clear tradeoff explanation tied to report scope, privacy controls, and other compared factors.", + "weight": 0.1 + }, + "R7": { + "requirement": "The final recommendation names exactly 1 best overall kit, 1 runner-up for stronger privacy sensitivity, and 1 budget-conscious alternative, and cites which health-predisposition-related features each one actually offers while noting important limitations or not-shown details, and a CryptoPad Presentation is also created so the analysis can be read through as a presentation.", + "verification": "Confirm the conclusion contains exactly these 3 recommendation roles and that each recommendation is justified with feature-specific evidence plus any relevant caveats; an open CryptoPad Presentation summarizing the comparison and recommendation is available for review.", + "weight": 0.08 + } + }, + "task_id": "6365c47591545e1a214a0eae70d5bb7421b21ed8", + "website": "https://www.amazon.com" + }, + { + "confirmed_task": "I’m trying to buy a Nvidia GeForce RTX 3060 Ti 
in the UK without getting burned on a sketchy used listing, and I don’t want just the single cheapest result. Please do a proper browser-based buying sweep and help me figure out the best-value option under a hard cap of £500. Start on eBay UK and find 12 to 15 live RTX 3060 Ti listings that look plausibly fully functional, then narrow that set by checking the title, condition notes, photos, seller feedback, return policy, shipping cost, and whether the listing clearly appears to be for an actual Nvidia GeForce RTX 3060 Ti rather than a different card, broken card, parts-only listing, or ambiguous rebadge. Keep the strongest 4 eBay listing tabs open so I can review them later. After that, check at least 4 public UK alternatives such as refurbished or second-hand GPU pages from major retailers or marketplaces like CeX, Overclockers UK, Scan, Facebook Marketplace listings if publicly viewable, or other public UK computer parts sites, so we can see whether eBay is genuinely the best value or just the cheapest-looking option. Compare all viable options on total price, condition, seller trust, return protection, and any obvious risk flags from the photos or description. Then give me a final ranked shortlist of exactly 5 options, with one clear best pick and one safer-but-not-cheapest pick, and explain why. Leave the final shortlist pages open, including the best 3 eBay options and the 2 most relevant non-eBay comparison pages. 
Fill out a CryptoPad Spreadsheet with all the options in an easily parsable manner.", + "level": "hard", + "reference_length": 27, + "rubrics": { + "R1": { + "requirement": "The browsing session identifies 12 to 15 live eBay UK RTX 3060 Ti listings under the user’s £500 cap and evaluates whether each is plausibly a fully functional Nvidia GeForce RTX 3060 Ti rather than a broken, parts-only, wrong-model, or ambiguous listing.", + "verification": "Check that the final response references 12 to 15 distinct eBay UK listings and records screening judgments tied to listing title, condition, or description details showing they were actively evaluated against the stated functional and model constraints.", + "weight": 0.2 + }, + "R2": { + "requirement": "For the eBay sweep, the task checks the title, condition notes, photos, seller feedback, return policy, shipping cost, and listing clarity for the candidate listings.", + "verification": "Confirm the comparison notes explicitly cover those requested fields for the viable eBay candidates, with no major requested factor omitted from the assessment.", + "weight": 0.16 + }, + "R3": { + "requirement": "The agent keeps the strongest 4 eBay listing tabs open for later review.", + "verification": "Verify that 4 eBay listing pages remain open at the end and that they correspond to listings described as the strongest or most viable eBay options.", + "weight": 0.12 + }, + "R4": { + "requirement": "The session checks at least 4 public UK non-eBay alternatives, such as refurbished or second-hand GPU pages from major retailers or marketplaces, to test whether eBay is actually the best value.", + "verification": "Confirm that at least 4 distinct public UK comparison pages outside eBay were opened and used in the analysis, and that they are relevant alternatives for buying an RTX 3060 Ti or closely comparable option within the same buying decision.", + "weight": 0.16 + }, + "R5": { + "requirement": "All viable options are compared on 
total price, condition, seller trust, return protection, and any obvious risk flags from the photos or description.", + "verification": "Check that the final synthesis compares viable options using each of those requested dimensions, including explicit mention of risk flags where present or a clear note when none are obvious.", + "weight": 0.16 + }, + "R6": { + "requirement": "The final output gives a ranked shortlist of exactly 5 options, including one clear best pick and one safer-but-not-cheapest pick, with explanations.", + "verification": "Verify that exactly 5 options are ranked in the final answer and that the response explicitly identifies both the overall best pick and the safer-but-not-cheapest pick with reasons tied to the comparison criteria.", + "weight": 0.12 + }, + "R7": { + "requirement": "The final shortlist pages are left open, including the best 3 eBay options and the 2 most relevant non-eBay comparison pages, and all compared options are also recorded in a CryptoPad Spreadsheet in an easily parsable format.", + "verification": "Confirm that 5 final evidence tabs remain open at the end: 3 eBay shortlist pages and 2 non-eBay comparison pages that were actually used to support the recommendation; an open CryptoPad Spreadsheet captures the compared options in a parsable comparison format.", + "weight": 0.08 + } + }, + "task_id": "7e31dc76193fea7e052767ce6ccd65ecb9f2628d", + "website": "https://apply-worker-ad097d28ee25.herokuapp.com" + }, + { + "confirmed_task": "I need help doing a serious browser-based gift search for a USC Trojans fan, not just grabbing the first sale item I see. Start on Fanatics and use its USC Trojans sale section as the anchor, but then widen naturally to other public retailers that carry licensed USC gear if that gives us better options. I want a final shortlist of exactly 12 gift options total, spread across 4 recipient types: 3 gifts for a student, 3 for an alum, 3 for a parent, and 3 for a general fan. 
Keep the budget practical by making sure each recipient group includes one option under $25, one from $25 to $60, and one over $60, using “not shown” if a size or variant price is unclear. For every gift, capture the product name, current listed price, whether it is marked as a sale item, the retailer, product type, any notable quality signal from the page such as brand or material, estimated shipping timing if publicly shown, and the return-policy summary from that retailer’s public pages. While you work, keep key product tabs open for at least the best 2 options in each recipient group, plus the return-policy or shipping page for each retailer you actually use, so I can review the evidence. At the end, recommend exactly 4 winners: best budget gift overall, best apparel gift, best home or office gift, and best premium gift, with a short explanation of why each one beat the alternatives.", + "level": "hard", + "reference_length": 3, + "rubrics": { + "R1": { + "requirement": "The final shortlist contains exactly 12 USC Trojans gift options total, with 3 gifts each for a student, an alum, a parent, and a general fan.", + "verification": "Check the final organized results and confirm there are exactly 12 entries and that each of the 4 recipient types has exactly 3 gifts assigned.", + "weight": 0.18 + }, + "R2": { + "requirement": "Within each recipient group, the 3 gifts follow the requested budget spread: one under $25, one from $25 to $60, and one over $60.", + "verification": "Review the recorded prices for all 12 gifts and confirm that each recipient group matches the exact three-tier budget structure.", + "weight": 0.18 + }, + "R3": { + "requirement": "Each gift entry includes the requested shopping details: product name, current listed price, whether it is marked as a sale item, retailer, product type, notable quality signal, estimated shipping timing if publicly shown, and the retailer’s return-policy summary, using 'not shown' where needed.", + "verification": 
"Inspect each shortlist entry and confirm all required fields are present, with 'not shown' used only when the public page does not provide the field.", + "weight": 0.2 + }, + "R4": { + "requirement": "The browsing session starts from Fanatics’ USC Trojans sale context and then broadens to other public retailers only as needed to build a stronger USC gift shortlist.", + "verification": "Confirm that Fanatics USC sale listings were used as an anchor source and that any additional retailers included are public USC merchandise pages relevant to the gift comparison.", + "weight": 0.12 + }, + "R5": { + "requirement": "Key browser evidence is left open: at least the best 2 product tabs for each of the 4 recipient groups, plus the return-policy or shipping page for each retailer actually used.", + "verification": "Count the open tabs at the end and confirm there are product pages left open for 8 shortlisted gifts total across the recipient groups, along with policy or shipping pages for each participating retailer.", + "weight": 0.14 + }, + "R6": { + "requirement": "The final recommendation identifies exactly 4 winners: best budget gift overall, best apparel gift, best home or office gift, and best premium gift, each with a short reason it beat the alternatives.", + "verification": "Check the closing synthesis and confirm there are exactly 4 named winners matching the requested categories, each with a comparative justification.", + "weight": 0.18 + } + }, + "task_id": "9dd06f1da59b2d84a9852960561309e02b343476", + "website": "https://www.fanatics.com" + }, + { + "confirmed_task": "I’m trying to book a future trip to Manila and I don’t want to be fooled by one headline fare that turns ugly once the details matter. Please use Google Flights to do a serious Bay Area-to-Manila comparison for an evening departure, keeping the original spirit of max 1 stop, but broaden it into a real booking decision. 
Check 3 departure airports — SFO, OAK, and SJC — and compare 4 evening departure dates in the same future travel window, so I end up with 12 total candidate searches. For each search, identify the cheapest itinerary Google Flights shows that still respects the evening-departure preference and the maximum-1-stop rule, and note the price, airline, departure time, arrival time, layover airport, and total trip duration. Then narrow those 12 down to the 5 strongest options overall, not just the 5 absolute cheapest, by weighing price against total travel time and whether the layover looks reasonable. For those 5 finalists, open the airline’s own public booking or fare-details pages when possible and verify the practical catch points: whether the fare appears bookable there, the basic carry-on and first checked-bag situation if shown, and any obvious change/refund restrictions or fare-class limitations shown on public pages; if something is not shown, say 'not shown.' Keep the most useful Google Flights result tabs open for the final 5 options, plus at least 2 airline fare or policy pages that help explain the tradeoffs. 
At the end, give me one clear recommendation for the best value option, one backup that is the absolute cheapest, and one backup that is the least painful in total travel time, with a short explanation of why each earned that slot.", + "level": "hard", + "reference_length": 7, + "rubrics": { + "R1": { + "requirement": "The session compares exactly 12 total candidate searches covering 3 departure airports — SFO, OAK, and SJC — across 4 evening departure dates in the same future travel window, all with a maximum of 1 stop.", + "verification": "Final output lists 12 candidate search results and each one is traceable to a Google Flights search matching one of the 3 airports and one of the 4 dates, with the max-1-stop and evening-departure framing applied.", + "weight": 0.18 + }, + "R2": { + "requirement": "For each of the 12 candidate searches, the cheapest qualifying Google Flights itinerary is recorded with price, airline, departure time, arrival time, layover airport, and total trip duration.", + "verification": "All 12 entries include those six fields, using 'not shown' only where a field is genuinely unavailable on the public page.", + "weight": 0.18 + }, + "R3": { + "requirement": "The 12 candidates are narrowed to exactly 5 strongest options overall using a stated comparison of price, total travel time, and layover reasonableness rather than price alone.", + "verification": "Final synthesis explicitly identifies 5 finalists and explains the comparison logic in terms of the three requested tradeoff factors.", + "weight": 0.18 + }, + "R4": { + "requirement": "For each of the 5 finalists, public airline booking or fare-details pages are checked when possible to verify whether the fare appears bookable there and to capture the basic carry-on situation, first checked-bag situation, and any obvious change/refund restrictions or fare-class limitations shown; missing details may be marked 'not shown.'", + "verification": "Each of the 5 finalists includes those 
verification notes, and the notes are grounded in airline public pages when available rather than only in Google Flights.", + "weight": 0.18 + }, + "R5": { + "requirement": "The browser evidence is left in a useful state: the most useful Google Flights result tabs remain open for the final 5 options, plus at least 2 airline fare or policy pages that explain the tradeoffs.", + "verification": "Open tabs at the end visibly include Google Flights pages for the 5 finalists and at least 2 relevant airline fare or policy pages.", + "weight": 0.14 + }, + "R6": { + "requirement": "The final recommendation gives exactly 3 named outcomes: one best value option, one backup that is the absolute cheapest, and one backup that is the least painful in total travel time, each with a short explanation.", + "verification": "Final response clearly labels all 3 outcomes and explains why each earned its slot based on the comparison work.", + "weight": 0.14 + } + }, + "task_id": "8956c9188720c23c45bbb0a5028ce5d61ddb6648", + "website": "https://www.google.com" + }, + { + "confirmed_task": "I’m getting a poetry packet ready and I don’t just want one isolated guideline check — I want a real submission plan built around Margie / The American Journal of Poetry as one of my targets. Please start by finding the official submission page for Margie / The American Journal of Poetry, confirm the submission method, any fee, and whether there’s an active reading period or any stated window, and keep that official page open. Then build me a serious shortlist of 10 poetry journals total, including Margie / The American Journal of Poetry plus 9 comparable U.S. literary journals that are publicly open enough to evaluate from their websites. 
For each of the 10 journals, use the official journal site or official submissions page to capture the submission method, fee or no-fee status, reading period or 'not shown', simultaneous-submission policy if stated, and any obvious packet limits like number of poems or page count if stated. After that, open publicly viewable recent-poem, archive, or current-issue pages for at least 6 of the 10 journals so you can compare aesthetic fit rather than just logistics, and keep the strongest evidence tabs open. I want you to synthesize all of this into one browser-based final recommendation that ranks the 10 journals into three buckets for this packet: top 4 best first-wave submissions, next 3 worth sending if the first wave doesn’t land, and bottom 3 lower-priority or situational targets. In the final synthesis, explain the ranking using both logistics and editorial fit, call out which journals seem best for a more literary/traditional voice versus a more experimental voice when that can be inferred from public samples, and clearly mark anything as 'not shown' when the site doesn’t say. Leave the key official guidelines pages and the most useful sample-poem pages open at the end so I can review the evidence myself.", + "level": "hard", + "reference_length": 3, + "rubrics": { + "R1": { + "requirement": "The browsing session finds and uses the official submission page for Margie / The American Journal of Poetry, confirms the submission method, any fee, and the active reading period or stated submission window if shown, and leaves that official page open.", + "verification": "Check that the final result explicitly records those three APJ fields from an official page and that an APJ official guidelines tab remains open.", + "weight": 0.18 + }, + "R2": { + "requirement": "A total of exactly 10 poetry journals are evaluated, consisting of Margie / The American Journal of Poetry plus 9 comparable U.S. 
literary journals that can be assessed from public pages.", + "verification": "Count the journals in the final comparison and confirm the set includes APJ plus 9 others, with no duplicates.", + "weight": 0.16 + }, + "R3": { + "requirement": "For each of the 10 journals, the final comparison records the submission method, fee or no-fee status, reading period or 'not shown', simultaneous-submission policy if stated, and any obvious packet limits such as number of poems or page count if stated, using official journal or official submissions pages.", + "verification": "Inspect the completed comparison and confirm all requested fields are present for all 10 journals, with 'not shown' used where needed and the information sourced from official pages.", + "weight": 0.22 + }, + "R4": { + "requirement": "Publicly viewable recent-poem, archive, or current-issue pages are opened for at least 6 of the 10 journals so aesthetic fit can be compared, and the strongest evidence tabs are kept open.", + "verification": "Check that at least 6 journals have public sample-work evidence consulted and that multiple relevant sample-poem or archive tabs remain open at the end.", + "weight": 0.14 + }, + "R5": { + "requirement": "The final synthesis ranks all 10 journals into exactly three buckets: top 4 best first-wave submissions, next 3 worth sending if the first wave does not land, and bottom 3 lower-priority or situational targets.", + "verification": "Confirm the final recommendation contains all 10 journals assigned once each into the specified 4/3/3 bucket structure.", + "weight": 0.16 + }, + "R6": { + "requirement": "The ranking explanation explicitly uses both logistics and editorial fit, and it calls out which journals seem better suited to a more literary/traditional voice versus a more experimental voice whenever that can be inferred from public samples.", + "verification": "Review the written rationale for the rankings and confirm it discusses both operational submission factors 
and sample-based fit, including the traditional-versus-experimental distinction where inferable.", + "weight": 0.14 + } + }, + "task_id": "2ab5a542b36b238ca0b84b812d7846e38e99aa13", + "website": "https://www.google.com" + }, + { + "confirmed_task": "I’m seriously thinking about getting a dog, but I want a thorough browser-based reality check on whether dog ownership is likely to make me healthier, sicker, or a mix of both depending on the situation. Please treat this as a real decision, not just a quick fact check. Start by finding current public guidance from authoritative sources like the CDC, NIH or NLM/PubMed, major allergy organizations, and major medical systems, and build a balanced evidence base across at least 12 public pages total. I want this broken into five explicit questions: zoonotic or household infection risk, allergies and asthma risk, whether childhood exposure to dogs seems protective or harmful, claimed adult health benefits such as mental-health or cardiovascular effects, and what practical steps actually reduce risk if someone does own a dog. For each question, compare what the stronger sources say, note where the evidence looks consistent versus mixed, and distinguish clearly between evidence for healthy adults, young children, older adults, pregnant people, and immunocompromised households whenever a source addresses those groups. Also check at least 3 public discussion threads or Q&A-style pages, including Reddit if useful, to capture what people commonly believe or worry about, and explicitly compare those claims against the stronger medical or research sources. Keep the most useful evidence tabs open, including at least 2 official guidance pages, at least 2 research-review or PubMed-style pages, and 2 discussion pages that show common public concerns. 
Finish with one organized decision memo on-page, not a spreadsheet, that gives me: a bottom-line verdict on whether owning a dog increases, decreases, or mixes illness risk overall; a short section on who should be most cautious; a short section on who is probably overestimating the risk; and a practical checklist of at least 8 steps that would make dog ownership safer if I decide to go ahead.", + "level": "hard", + "reference_length": 6, + "rubrics": { + "R1": { + "requirement": "The browsing session uses at least 12 public pages total and includes current authoritative sources from categories explicitly requested in the prompt: official guidance, research-review or PubMed-style material, and public discussion or Q&A-style pages.", + "verification": "Count the pages used and confirm the source mix includes authoritative guidance pages, research-review/PubMed-style pages, and at least 3 discussion or Q&A-style pages.", + "weight": 0.16 + }, + "R2": { + "requirement": "The final memo is organized around the 5 explicit questions requested: zoonotic or household infection risk, allergies and asthma risk, childhood exposure effects, claimed adult health benefits, and practical risk-reduction steps.", + "verification": "Inspect the final memo and confirm that all 5 named sections are present and substantively completed.", + "weight": 0.16 + }, + "R3": { + "requirement": "For each of the 5 questions, the memo compares what the stronger sources say and clearly identifies where the evidence appears consistent versus mixed.", + "verification": "Check each of the 5 sections for comparative synthesis across multiple sources and an explicit statement of whether the evidence is consistent, mixed, or uncertain.", + "weight": 0.18 + }, + "R4": { + "requirement": "The memo explicitly distinguishes evidence for the household groups named in the prompt whenever sources address them: healthy adults, young children, older adults, pregnant people, and immunocompromised households.", + 
"verification": "Review the memo for group-specific treatment and confirm that these five groups are addressed where supported by the gathered sources rather than collapsed into one generic conclusion.", + "weight": 0.16 + }, + "R5": { + "requirement": "At least 3 public discussion threads or Q&A-style pages are checked, and their common claims or worries are explicitly compared against stronger medical or research sources.", + "verification": "Confirm that at least 3 discussion/Q&A pages were consulted and that the final memo directly contrasts those public beliefs with higher-authority evidence.", + "weight": 0.12 + }, + "R6": { + "requirement": "Useful browser evidence is left visible at the end, including at least 2 official guidance pages, at least 2 research-review or PubMed-style pages, and 2 discussion pages showing common concerns.", + "verification": "Inspect the remaining open tabs and confirm the required counts and categories match the prompt.", + "weight": 0.1 + }, + "R7": { + "requirement": "The final on-page decision memo includes all requested deliverables: a bottom-line verdict on whether owning a dog increases, decreases, or mixes illness risk overall; a section on who should be most cautious; a section on who is probably overestimating the risk; and a practical checklist of at least 8 steps to make dog ownership safer.", + "verification": "Review the final memo for the requested verdict, the two audience-specific sections, and a checklist with at least 8 concrete safety steps.", + "weight": 0.12 + } + }, + "task_id": "341d7a70562b121c9702aa8845d51e0843575170", + "website": "https://www.reddit.com" + }, + { + "confirmed_task": "I’m seriously considering a future backpacking trip through China and want a version of the research that I could actually use, not just a generic average. 
Please build me a realistic backpacker budget across exactly 8 major tourist destinations in China, and make it structured enough that I could use it to decide whether the trip is affordable and which route makes the most sense. Start by identifying 8 widely recognized stops that would make a coherent backpacking route for a first-time visitor, then compare each one using public pages for hostels or budget hotels, food options, and major paid activities or sights. For every destination, record a typical nightly budget stay price, a low-cost daily food estimate, and 2 to 3 representative activity costs, using 'not shown' if a field truly is not available. Then figure out the most practical budget-minded way to travel between each stop, with an estimated fare and rough travel time, comparing trains, buses, or flights when that actually matters. Please open and keep useful evidence tabs along the way, including a few lodging pages with photos, a few attraction or ticket pages, and map pages for representative cities so I can visually sanity-check the route. Put the final result into one organized planning document that lists all 8 destinations in route order, the per-day and per-stop cost assumptions, the intercity transport costs, and a realistic total for the full trip. 
At the end, leave the finished document open along with the most useful lodging, transport, and map tabs so I can review the evidence.", + "level": "hard", + "reference_length": 6, + "rubrics": { + "R1": { + "requirement": "The final plan identifies exactly 8 major tourist destinations in China and presents them in one coherent backpacking route for a first-time visitor.", + "verification": "Check the finished document for exactly 8 destination entries in route order, with no extra or missing stops.", + "weight": 0.18 + }, + "R2": { + "requirement": "Each of the 8 destinations includes a typical nightly budget stay price, a low-cost daily food estimate, and 2 to 3 representative paid activity costs, using 'not shown' where necessary.", + "verification": "Review each destination entry in the document and confirm all requested cost fields are present for all 8 stops.", + "weight": 0.22 + }, + "R3": { + "requirement": "The plan includes the most practical budget-minded transport choice between every consecutive stop, with an estimated fare and rough travel time, comparing trains, buses, or flights when that matters.", + "verification": "Count the intercity legs in the document and confirm each leg has a chosen mode, estimated cost, and approximate duration.", + "weight": 0.18 + }, + "R4": { + "requirement": "Useful browser evidence is kept open, including representative lodging pages with photos, attraction or ticket pages, and map pages for representative cities.", + "verification": "Inspect the open tabs at the end and confirm that lodging, attraction/ticket, and map evidence tabs are still available for review.", + "weight": 0.14 + }, + "R5": { + "requirement": "The final planning document lists all 8 destinations in route order and includes the per-day and per-stop cost assumptions, plus the intercity transport costs.", + "verification": "Open the final document and confirm that destination order, daily assumptions, stop-level costs, and transport costs are all 
clearly organized.", + "weight": 0.16 + }, + "R6": { + "requirement": "The finished result provides a realistic total estimated cost for the full trip and leaves the completed document open along with the most useful lodging, transport, and map tabs.", + "verification": "Confirm the document contains a full-trip total and that the final browser state still has the document and key evidence tabs open.", + "weight": 0.12 + } + }, + "task_id": "73f7a6bce89de66f106a669c85c7908331d3d1b7", + "website": "https://www.novo-monde.com" + }, + { + "confirmed_task": "I’m trying to figure out the best realistic 'beer money' setup for someone in the UK, because I already know about Prolific and CloudResearch Connect but I don’t want to keep wasting time on sites that look good until you dig into them. Please do a serious public-web comparison of 12 to 15 UK-accessible platforms across the most relevant categories: survey/research panels, user-testing or interview platforms, cashback or receipt-reward apps, and general microtask sites. Start by checking the official public pages for each platform and confirm whether UK participants are accepted, what the platform mainly pays for, the payout method, the cash-out threshold, and any obvious waitlist, invite-only, or identity-check friction; if a field is not shown publicly, record it as 'not shown' instead of guessing. Then cross-check each platform with at least one independent public reputation source such as Trustpilot, Reddit discussions, app-store listings, or similar public review pages so I can tell the difference between 'legit but slow' and 'avoid this.' I want the final result in one comparison sheet with one row per platform and columns for platform name, category, UK eligibility status, task type, payout method, minimum cash-out, notable restrictions, public reputation signal, likely best use case, and your verdict. 
After that, rank the best 8 platforms for me in signup order: first the ones most worth applying to immediately, then the ones that are only worth using as filler. Keep the most useful evidence tabs open, including at least 6 official platform pages and at least 4 independent reputation or review pages, and end with the final comparison sheet open plus a short recommendation on whether I should mostly stick with Prolific and Connect or build a broader UK stack.", + "level": "hard", + "reference_length": 3, + "rubrics": { + "R1": { + "requirement": "A comparison sheet exists and is left open at the end, with one row per platform for 12 to 15 UK-accessible platforms and columns for platform name, category, UK eligibility status, task type, payout method, minimum cash-out, notable restrictions, public reputation signal, likely best use case, and verdict.", + "verification": "Check that the open sheet contains 12 to 15 platform rows and all explicitly requested columns, using 'not shown' where public information was unavailable.", + "weight": 0.2 + }, + "R2": { + "requirement": "The platform set covers the requested categories: survey/research panels, user-testing or interview platforms, cashback or receipt-reward apps, and general microtask sites.", + "verification": "Inspect the sheet rows and category labels to confirm all four categories are represented by at least one included platform.", + "weight": 0.14 + }, + "R3": { + "requirement": "For each included platform, the official public pages are used to confirm whether UK participants are accepted, what the platform mainly pays for, the payout method, the cash-out threshold, and any obvious waitlist, invite-only, or identity-check friction.", + "verification": "Spot-check platform rows against the kept-open official pages and confirm that unsupported fields are marked 'not shown' rather than guessed.", + "weight": 0.2 + }, + "R4": { + "requirement": "Each platform is cross-checked with at least one independent 
public reputation source such as Trustpilot, Reddit, app-store listings, or similar public review pages so the comparison distinguishes trustworthy-but-slow options from poor choices.", + "verification": "Review the evidence and sheet entries to confirm every platform has at least one recorded independent reputation signal from a public source.", + "weight": 0.16 + }, + "R5": { + "requirement": "The final output includes a ranked list of the best 8 platforms in signup order, separating the ones most worth applying to immediately from the ones only worth using as filler.", + "verification": "Check the final recommendation section for exactly 8 ranked platforms and clear grouping into immediate-priority versus filler options.", + "weight": 0.14 + }, + "R6": { + "requirement": "Useful browser evidence is left open, including at least 6 official platform pages and at least 4 independent reputation or review pages, along with the final comparison sheet.", + "verification": "Inspect open tabs at the end to confirm the minimum counts and that both official and independent evidence pages remain available.", + "weight": 0.1 + }, + "R7": { + "requirement": "The session ends with a short recommendation on whether the user should mostly stick with Prolific and CloudResearch Connect or build a broader UK stack.", + "verification": "Check the final written recommendation for an explicit conclusion addressing Prolific and Connect versus a broader stack.", + "weight": 0.06 + } + }, + "task_id": "6984e84f7f80fd923126e82006bbfaeeeec5b5d2", + "website": "https://connect.cloudresearch.com" + }, + { + "confirmed_task": "I’m trying to figure out what kind of community event I could realistically host in NYC, and I don’t just want generic brainstorming. 
Please use public event pages to study what already seems to work: start with Luma, then expand to a couple of other public event platforms if needed, and build me a comparison set of exactly 15 NYC community-oriented events that feel relevant for inspiration. I want a real mix across at least 4 neighborhoods and at least 5 event formats, like meetups, panels, workshops, socials, walks, volunteer events, coworking sessions, or founder/creator gatherings. For each of the 15 events, note the visible title, neighborhood or venue area, host or organizer if shown, format, price if shown, audience signal, and any community-planning details that are visible on the page such as description style, location clarity, agenda or run-of-show, RSVP/ticket framing, and whether there are clear participation expectations or community norms. Keep the strongest evidence tabs open for at least 6 representative events, including several Luma pages, so I can quickly inspect the examples myself. After comparing them, recommend exactly 8 event concepts I could plausibly host in NYC, with one sentence on why each concept fits the patterns you found. Then pick the best 3 concepts overall and, for each one, give me a draft event-page outline with a suggested title style, description structure, venue/location approach, agenda sections, and any community-related details I should make sure to include. End with a short recommendation on which single concept I should host first and why, and leave the most useful event example pages open. 
Write this up in a Crpytpad Document titled 'NYC Event Proposals'", + "level": "hard", + "reference_length": 3, + "rubrics": { + "R1": { + "requirement": "A comparison set of exactly 15 NYC community-oriented public event pages is assembled, drawn from Luma first and expanded to other public event platforms if needed.", + "verification": "Check that 15 distinct public event pages were reviewed and that the set includes Luma examples plus any additional public-platform examples used to reach the total.", + "weight": 0.2 + }, + "R2": { + "requirement": "The 15-event set covers at least 4 neighborhoods and at least 5 event formats such as meetups, panels, workshops, socials, walks, volunteer events, coworking sessions, or founder/creator gatherings.", + "verification": "Check the recorded comparison notes for neighborhood coverage and format labels, confirming the minimum diversity counts are met.", + "weight": 0.18 + }, + "R3": { + "requirement": "For each of the 15 events, the visible title, neighborhood or venue area, host or organizer if shown, format, price if shown, audience signal, and visible community-planning details are captured, including description style, location clarity, agenda or run-of-show, RSVP or ticket framing, and participation expectations or community norms where shown.", + "verification": "Review the final comparison output and confirm those fields are recorded for each of the 15 events, using 'not shown' where a public page does not display a field.", + "weight": 0.17 + }, + "R4": { + "requirement": "At least 6 representative event pages are kept open as browser evidence, including several Luma pages, so the user can inspect the strongest examples directly.", + "verification": "Confirm that 6 or more relevant event tabs remain open at the end and that multiple open tabs are Luma event pages.", + "weight": 0.15 + }, + "R5": { + "requirement": "Exactly 8 NYC event concepts are recommended, each with a one-sentence explanation of why it fits 
the patterns found in the comparison set.", + "verification": "Check that there are exactly 8 recommended concepts and that each includes a clear one-sentence rationale tied to the observed event patterns.", + "weight": 0.15 + }, + "R6": { + "requirement": "The best 3 concepts are selected and each includes a draft event-page outline with a suggested title style, description structure, venue or location approach, agenda sections, and community-related details to include, followed by a final recommendation of which single concept to host first and why, and the write-up is captured in a CryptoPad Document titled 'NYC Event Proposals'.", + "verification": "Check that exactly 3 shortlisted concepts have complete draft page outlines covering all requested elements, and that the response ends with one recommended first event concept plus a reason; the final deliverable includes an open CryptoPad Document titled 'NYC Event Proposals'.", + "weight": 0.15 + } + }, + "task_id": "74da207c4a2c3ee82f3452d0d3c587af3f1a20ff", + "website": "https://luma.com" + }, + { + "confirmed_task": "I’m thinking about applying for teaching jobs around Athens, Georgia, and I don’t want a one-site skim — I want a real shortlist of the best districts or public-school employers to target for a future hiring cycle. Please start with Clarke County’s SchoolSpring employer page so we anchor on the local district correctly, then build out a comparison of exactly 8 school districts or public-school employers that are in Athens or within about a 45-minute drive of downtown Athens. For each one, use public pages only to find the employer name, the main jobs or careers page, whether certified teacher openings are currently visible, one representative teaching opening if shown, the current salary schedule for a new certified teacher if posted, key benefits or perks if posted, the school calendar or work-year clues if posted, and a rough drive time from downtown Athens. 
Also pull one public school-quality signal for each district, like the state report card, district profile, or another credible public source, and write \"not shown\" for anything you can’t verify publicly. Put the 8 employers into one clean comparison sheet or document so I can scan them side by side, then rank the best 5 districts for me to apply to with short reasons that balance pay, likely openings, commute, and overall district fit. Keep the most useful evidence visible in the browser by leaving open the career page and salary page for each of your top 5 picks, plus at least 2 representative job-posting tabs from the strongest options.", + "level": "hard", + "reference_length": 14, + "rubrics": { + "R1": { + "requirement": "The browsing session starts from Clarke County’s SchoolSpring employer page and correctly identifies the local employer anchor before expanding to exactly 8 school districts or public-school employers in Athens or within about a 45-minute drive of downtown Athens.", + "verification": "Check that Clarke County is explicitly used as the starting anchor and that the final comparison includes exactly 8 qualifying employers, each plausibly within the stated geography.", + "weight": 0.18 + }, + "R2": { + "requirement": "A single comparison sheet or document is produced with one entry for each of the 8 employers and includes the requested public-page fields: employer name, main jobs/careers page, whether certified teacher openings are visible, one representative teaching opening if shown, current salary schedule for a new certified teacher if posted, key benefits or perks if posted, school calendar or work-year clues if posted, rough drive time from downtown Athens, and one public school-quality signal.", + "verification": "Inspect the final sheet or document and confirm that all 8 entries contain the specified fields, with missing items recorded as \"not shown\" rather than invented.", + "weight": 0.2 + }, + "R3": { + "requirement": "The 
comparison relies only on public pages and records \"not shown\" wherever a requested field cannot be verified publicly.", + "verification": "Spot-check multiple entries against visible public sources and confirm there are no login-only claims or uncited filled-in gaps where the prompt required \"not shown.\"", + "weight": 0.14 + }, + "R4": { + "requirement": "The final output ranks the best 5 districts or public-school employers to apply to and gives short reasons for each ranking that balance pay, likely openings, commute, and overall district fit.", + "verification": "Check that there are exactly 5 ranked recommendations and that each one includes concise reasoning touching the decision factors named in the prompt.", + "weight": 0.18 + }, + "R5": { + "requirement": "For each of the top 5 ranked picks, the career page and salary page are left open in the browser as evidence.", + "verification": "Confirm that 10 evidence tabs remain open for the top 5 picks: one career/jobs page and one salary-related page for each ranked employer.", + "weight": 0.15 + }, + "R6": { + "requirement": "At least 2 representative teaching job-posting tabs from the strongest options are also left open in the browser.", + "verification": "Confirm that at least 2 live or recent representative teaching-opening pages remain open and correspond to employers discussed as strong options in the final ranking.", + "weight": 0.15 + } + }, + "task_id": "228dda0bbfce65f778b3f421cb1f575c8c8977f1", + "website": "https://clarke.schoolspring.com" + }, + { + "confirmed_task": "I’m trying to figure out the smartest way for 3 adults to fly from the Washington, DC area to Orlando for a future trip built around the original Jan 30 travel window, and I don’t want just one American Airlines screenshot-price. 
Please start by checking American’s public booking flow for the original one-way search from DCA to MCO for 3 adults around that date so we have the baseline, then broaden into a real comparison that I could actually use: compare flights from all 3 Washington-area airports (DCA, IAD, and BWI) to Orlando-area options if publicly shown, using a tight date window centered on that trip timing and keeping the search bounded to 3 outbound dates total. For each airport/date combination, compare American with at least 2 other major airline options visible on public booking or metasearch pages, and focus on the cheapest practical itinerary for 3 adults rather than just the lowest teaser fare. As you compare, note whether each best option is nonstop or connecting, the departure and arrival times, the fare type, and any obvious restrictions that would materially affect the real price for normal travelers, especially carry-on limits, checked-bag assumptions if clearly shown, and seat-selection limitations if clearly shown; if something is not shown, record it as not shown. I want a final side-by-side comparison covering exactly 9 search combinations total (3 origin airports x 3 outbound dates), with one best option per combination and a clear overall recommendation for the best value, the best convenience, and the best American-only choice. 
Keep the most useful evidence tabs open at the end: the original American baseline search, the two strongest non-American comparison pages, and the final winner page so I can review them.", + "level": "hard", + "reference_length": 11, + "rubrics": { + "R1": { + "requirement": "The session starts with the original American Airlines baseline: a public booking search for 3 adults from DCA to MCO around the original Jan 30 trip timing, and that baseline is used as the reference point for the rest of the comparison.", + "verification": "An open American Airlines results/search page is present for the baseline route and traveler count, and the final comparison explicitly identifies it as the baseline.", + "weight": 0.16 + }, + "R2": { + "requirement": "The final comparison covers exactly 9 search combinations total, formed by 3 Washington-area origin airports (DCA, IAD, and BWI) crossed with 3 outbound dates in the bounded trip window.", + "verification": "The delivered comparison contains one entry for each of the 9 airport/date combinations and does not omit or add combinations.", + "weight": 0.2 + }, + "R3": { + "requirement": "For each of the 9 search combinations, American is compared with at least 2 other major airline options visible on public booking or metasearch pages, and one best option is chosen for that combination.", + "verification": "Each combination entry shows a 3-way-or-more airline comparison including American plus at least 2 other major airline options, with one selected best option recorded.", + "weight": 0.18 + }, + "R4": { + "requirement": "Each chosen best option records the practical trip details explicitly requested: whether it is nonstop or connecting, departure and arrival times, fare type, and any obvious restrictions that materially affect real price, especially carry-on limits, checked-bag assumptions if clearly shown, and seat-selection limitations if clearly shown, using 'not shown' where needed.", + "verification": "All 9 selected 
options include those fields, and any unavailable details are marked 'not shown' rather than invented.", + "weight": 0.18 + }, + "R5": { + "requirement": "The final synthesis provides a clear overall recommendation naming the best value option, the best convenience option, and the best American-only option.", + "verification": "The conclusion explicitly labels all three recommendation categories and ties each to one of the compared options.", + "weight": 0.14 + }, + "R6": { + "requirement": "Useful browser evidence is left open at the end: the original American baseline search, the 2 strongest non-American comparison pages, and the final winner page.", + "verification": "Those 4 pages remain open and correspond to the exact evidence tabs requested in the prompt.", + "weight": 0.14 + } + }, + "task_id": "4d13c31d463a1277ca8b7ef95947ee4117b0f922", + "website": "https://www.aa.com" + }, + { + "confirmed_task": "I’m trying to figure out which electric-vehicle websites are actually worth following for the next few months, not just find one random EV site. Please do a serious browser-based comparison of 10 EV-focused public websites that cover the space in meaningfully different ways—news, car reviews, buying guides, charging and ownership advice, future model coverage, or broader EV analysis. Start from one clearly EV-focused site and expand naturally to other credible EV-focused sites you find on the public web. For each of the 10 sites, check the homepage plus at least one deeper article or section page so you can verify what it really specializes in, how current it seems, and whether it is more useful for shoppers, enthusiasts, or industry-followers. Compare them on concrete things I’d care about: what kind of EV coverage they emphasize, whether they seem actively updated, whether they cover both current models and upcoming EVs, whether they have practical charging or ownership information, and whether the site feels easy to use for ongoing research. 
Then recommend exactly 5 sites to keep in my regular EV-reading rotation, with a short reason for each, and identify the single best site for each of these three use cases: keeping up with EV news, researching a future EV purchase, and learning about charging/ownership. Keep the 5 recommended sites open in tabs at the end, and for at least 3 of those, also leave one representative deeper page open that shows why the site made the cut. If any field is unclear from public pages, say not shown rather than guessing.", + "level": "hard", + "reference_length": 3, + "rubrics": { + "R1": { + "requirement": "Exactly 10 EV-focused public websites are examined, and each is checked using both its homepage and at least one deeper article or section page.", + "verification": "The final comparison names 10 distinct EV-focused sites and includes evidence drawn from both the homepage and one deeper page for each site.", + "weight": 0.2 + }, + "R2": { + "requirement": "Each of the 10 sites is compared on the explicitly requested factors: coverage emphasis, apparent update activity, current-versus-upcoming EV coverage, charging or ownership information, and overall usability for ongoing research.", + "verification": "The final output contains side-by-side notes for every site covering all requested comparison dimensions, using 'not shown' where needed instead of guessing.", + "weight": 0.2 + }, + "R3": { + "requirement": "The work distinguishes what each site is most useful for by assessing whether it is better suited to shoppers, enthusiasts, or industry-followers.", + "verification": "Each site entry includes a clear audience-fit judgment tied to evidence from the public pages visited.", + "weight": 0.12 + }, + "R4": { + "requirement": "Exactly 5 sites are recommended for the user's regular EV-reading rotation, each with a short reason for inclusion.", + "verification": "The final recommendation section contains 5 and only 5 selected sites, each paired with a concise rationale 
grounded in the comparison.", + "weight": 0.16 + }, + "R5": { + "requirement": "The final synthesis identifies the single best site for each of the three requested use cases: EV news, future EV purchase research, and charging/ownership learning.", + "verification": "The final answer explicitly names one best site for each of the 3 use cases and explains why each won that category.", + "weight": 0.14 + }, + "R6": { + "requirement": "Browser evidence is preserved by leaving the 5 recommended sites open, plus one representative deeper page open for at least 3 of those recommended sites.", + "verification": "The ending browser state shows the 5 chosen site tabs still open and at least 3 additional deeper pages open that visibly support the recommendations.", + "weight": 0.18 + } + }, + "task_id": "20cea7be868a6dbaca0c1ff8a04562101f3fbd91", + "website": "https://insideevs.com" + }, + { + "confirmed_task": "I’m trying to decide whether I should keep using a browser-based password saver like Google Password Manager or switch to a dedicated password manager, and I want you to do a serious browser-based comparison that ends with exactly 3 apps I could realistically adopt. Start by identifying 8 to 10 credible password managers that have public pricing and security/privacy information, including Google Password Manager as a baseline if it qualifies. For each one, open and compare the official pricing page, the main security or architecture page, the privacy policy or privacy summary page, and a public help or feature page that confirms practical things like cross-device sync, passkey support, import/export options, and whether it works on the platforms I’m likely to care about. As you compare them, rule out anything that is obviously too expensive, weak on privacy, missing from major platforms, or unclear about how user data is protected. 
Then narrow the list to the best 5 finalists and do a deeper trust check for those 5 using public evidence such as independent security audits, bug bounty programs, encryption design explanations, breach or incident disclosures if any are relevant, and company transparency pages. After that, recommend exactly 3 applications that best balance affordability, privacy, and trustworthiness for a normal individual user, and for each of the 3 explain why it made the cut, what the likely monthly or annual cost is, what privacy or trust tradeoffs I’d be accepting, and whether migration from a browser-based password saver seems straightforward based on public import/export help pages. Keep the most useful comparison tabs open for the final 3, including at least one pricing tab and one security/privacy evidence tab for each finalist, plus one tab that shows Google Password Manager as the baseline I’m comparing against.", + "level": "hard", + "reference_length": 4, + "rubrics": { + "R1": { + "requirement": "The browsing session identifies and compares 8 to 10 credible password managers, with Google Password Manager included as a baseline if it qualifies.", + "verification": "Final output names 8 to 10 products considered and shows that Google Password Manager was included as the baseline or explicitly noted as not qualifying under the stated criteria.", + "weight": 0.16 + }, + "R2": { + "requirement": "For each considered product, the comparison uses public official pages covering pricing, security or architecture, privacy policy or privacy summary, and a help or feature page confirming practical capabilities such as sync, passkeys, import/export, and platform support.", + "verification": "Final comparison notes include those four evidence categories for each considered product, with browser evidence visible through opened official tabs used for the comparison.", + "weight": 0.18 + }, + "R3": { + "requirement": "Products that are too expensive, weak on privacy, missing major 
platform support, or unclear about user-data protection are ruled out before the shortlist is narrowed to the best 5 finalists.", + "verification": "The final synthesis explicitly shows which products were eliminated and why, and it presents a narrowed finalist set of exactly 5.", + "weight": 0.16 + }, + "R4": { + "requirement": "The 5 finalists receive a deeper trust review using public evidence such as independent security audits, bug bounty programs, encryption design explanations, relevant breach or incident disclosures, and company transparency pages.", + "verification": "For each of the 5 finalists, the final notes cite at least some of those deeper trust signals from public pages and distinguish stronger versus weaker trust evidence.", + "weight": 0.18 + }, + "R5": { + "requirement": "The final recommendation includes exactly 3 applications that best balance affordability, privacy, and trustworthiness for a normal individual user.", + "verification": "The end result contains exactly 3 recommended apps and clearly identifies them as the final picks rather than a broader list.", + "weight": 0.14 + }, + "R6": { + "requirement": "For each of the 3 recommended apps, the final explanation states why it made the cut, the likely monthly or annual cost, the privacy or trust tradeoffs involved, and whether migration from a browser-based password saver looks straightforward based on public import/export help pages.", + "verification": "Each of the 3 finalist summaries contains all four requested elements: rationale, cost, tradeoffs, and migration assessment grounded in public help or support pages.", + "weight": 0.1 + }, + "R7": { + "requirement": "Useful browser evidence is left open for review: for each of the final 3, at least one pricing tab and one security/privacy evidence tab remain open, plus one tab showing Google Password Manager as the baseline.", + "verification": "Open tabs at the end include the required pricing and security/privacy evidence pages for 
each of the 3 finalists and one baseline Google Password Manager tab.", + "weight": 0.08 + } + }, + "task_id": "5d3f0704adb650f6dfdea9e8bf1c070e3f4d02c7", + "website": "https://passwords.google.com" + }, + { + "confirmed_task": "I’m trying to figure out which quick-dry camping towel I should actually buy for future trips, and I don’t want a one-listing answer. Start by using Temu as the low-cost baseline, then compare it against mainstream public retail options like Amazon, Walmart, REI, Target, or other public store pages you can access without signing in. I want a real browser-based market scan of 12 total quick-dry camping or travel towels, with at least 3 from Temu and at least 3 from non-discount outdoor or big-box retailers, and the rest from any credible public listings that fit. For each product, record the product name, current listed price, stated size, material if shown, packed or product weight if shown, whether it comes with a pouch or loop if shown, color/options note if relevant, and anything the page says about drying speed or absorbency; if a field is missing, write 'not shown.' As you compare them, open the actual product pages plus enough review/photo sections to sanity-check whether the towels look genuinely compact and camping-usable rather than just cheap gym towels. Then narrow the 12 down into 3 finalists: the best budget pick, the best overall value pick, and the best premium or most durable-looking pick. For those 3 finalists, also compare shipping or delivery info if publicly shown, return policy basics if publicly shown, and at least one review-based concern or tradeoff. 
Finish with a concise recommendation that tells me which one you would buy for a typical car-camping or weekend hiking user and why, and leave the 3 finalist product tabs plus 2 comparison tabs open so I can review the evidence myself.", + "level": "hard", + "reference_length": 4, + "rubrics": { + "R1": { + "requirement": "The browsing session compares 12 total quick-dry camping or travel towels, including at least 3 from Temu and at least 3 from non-discount outdoor or big-box retailers, with the remaining items drawn from credible public listings.", + "verification": "Check the final comparison output for exactly 12 products and confirm retailer/source distribution meets the stated minimums using the recorded product pages.", + "weight": 0.2 + }, + "R2": { + "requirement": "Each of the 12 products includes the requested recorded fields: product name, current listed price, stated size, material if shown, packed or product weight if shown, pouch or loop note if shown, color/options note if relevant, and any drying-speed or absorbency claim, using 'not shown' where information is missing.", + "verification": "Inspect the final comparison notes and verify that every product row or entry contains all requested fields, with missing data explicitly marked as 'not shown' rather than omitted.", + "weight": 0.2 + }, + "R3": { + "requirement": "The session uses actual product pages and review/photo sections to sanity-check whether the towels appear compact and camping-usable rather than generic gym towels.", + "verification": "Confirm that product pages were opened for the compared items and that review or photo evidence was consulted and reflected in the notes for compactness/camping suitability.", + "weight": 0.15 + }, + "R4": { + "requirement": "The 12-product set is narrowed to exactly 3 finalists labeled as the best budget pick, the best overall value pick, and the best premium or most durable-looking pick.", + "verification": "Check the final synthesis for exactly 
three finalists with those three category labels and corresponding chosen products.", + "weight": 0.15 + }, + "R5": { + "requirement": "For each of the 3 finalists, the comparison includes shipping or delivery info if publicly shown, return policy basics if publicly shown, and at least one review-based concern or tradeoff.", + "verification": "Review the finalist summaries and confirm all three include those three elements, with 'not shown' used where public shipping or return details are unavailable.", + "weight": 0.15 + }, + "R6": { + "requirement": "The final recommendation states which towel should be bought for a typical car-camping or weekend hiking user and explains why.", + "verification": "Check for a clear single recommended product and a concise rationale tied to the comparison criteria rather than a generic summary.", + "weight": 0.1 + }, + "R7": { + "requirement": "At the end, the 3 finalist product tabs and 2 comparison tabs are left open for review.", + "verification": "Confirm that five tabs remain open matching the requested evidence set: three finalist product pages and two comparison-oriented tabs used during the evaluation.", + "weight": 0.05 + } + }, + "task_id": "b24db77407cc1befa527256883da54fd4f37c256", + "website": "https://www.temu.com" + }, + { + "confirmed_task": "I’m trying to get a realistic picture of what would actually make me a stronger applicant for a future philosophy PhD cycle, especially whether publications are genuinely expected now or whether that’s internet panic. Please do a serious browser-based comparison across 12 funded U.S. philosophy PhD programs that are publicly visible and reasonably research-active, using official admissions pages first and then student, faculty, and department pages where helpful. 
For each program, figure out what the application seems to reward most strongly from public evidence: things like writing sample emphasis, research fit, letters, prior coursework, GRE if mentioned, publications if mentioned, teaching or language preparation if relevant, and funding structure. If a field is missing, record it as not shown instead of guessing. Then sanity-check the publications question by looking at at least 6 public non-official discussion or advice pages from places like Reddit, faculty advice posts, or department FAQs, and separate what is officially stated from what applicants seem to believe. I want you to keep key evidence tabs open, including at least 4 official admissions pages, 3 faculty or current-student pages that help show research-fit expectations, and 2 discussion or advice pages that shaped the publications conclusion. Finish by writing one organized decision memo that tells me, in plain English, whether publications look necessary, helpful, or mostly optional for these programs; what the strongest recurring signals actually are; where a typical applicant without publications would need to compensate; and a concrete priority list of the top 8 things I should improve before applying.", + "level": "hard", + "reference_length": 5, + "rubrics": { + "R1": { + "requirement": "The browsing session covers exactly 12 funded U.S. 
philosophy PhD programs and uses official program or university admissions pages as the primary evidence base for each one.", + "verification": "The final memo lists 12 distinct philosophy PhD programs, and each program entry includes evidence drawn from an official admissions or department page.", + "weight": 0.2 + }, + "R2": { + "requirement": "For each of the 12 programs, the output identifies what the application appears to reward most strongly from public evidence, including writing sample emphasis, research fit, letters, prior coursework, GRE if mentioned, publications if mentioned, teaching or language preparation if relevant, and funding structure, with 'not shown' used where needed.", + "verification": "Each of the 12 program summaries includes the requested factors or explicitly marks missing items as 'not shown' rather than inferring them.", + "weight": 0.22 + }, + "R3": { + "requirement": "The publications question is sanity-checked using at least 6 public non-official discussion or advice pages, and the output clearly distinguishes official program statements from applicant or adviser commentary.", + "verification": "The final memo references at least 6 non-official discussion or advice pages and explicitly separates official evidence from community perception or informal advice.", + "weight": 0.16 + }, + "R4": { + "requirement": "Key evidence tabs are kept open, including at least 4 official admissions pages, 3 faculty or current-student pages that help show research-fit expectations, and 2 discussion or advice pages that shaped the publications conclusion.", + "verification": "The final browser state includes the requested mix of open tabs, and those tabs correspond to evidence used in the memo.", + "weight": 0.12 + }, + "R5": { + "requirement": "The final decision memo directly answers whether publications look necessary, helpful, or mostly optional for these programs.", + "verification": "The memo gives a clear overall conclusion on publications 
and ties that conclusion back to the compared program evidence and outside discussion pages.", + "weight": 0.12 + }, + "R6": { + "requirement": "The final decision memo explains what the strongest recurring signals actually are and where a typical applicant without publications would need to compensate.", + "verification": "The memo synthesizes cross-program patterns and includes a specific discussion of compensating strengths for applicants who do not have publications.", + "weight": 0.1 + }, + "R7": { + "requirement": "The final decision memo ends with a concrete priority list of the top 8 things the user should improve before applying.", + "verification": "The memo includes exactly 8 actionable improvement priorities tailored to the evidence gathered in the browsing session.", + "weight": 0.08 + } + }, + "task_id": "0dc4be74eb82df8d0ce1b556260bd615acffedd6", + "website": "https://www.reddit.com" + }, + { + "confirmed_task": "I’m trying to choose a babywearing jacket for a future cold-weather season, and I don’t want just one random product link. Please do a serious browser-based comparison of 12 babywearing jacket options across at least 6 brands or retailers, including H&M if they still have a relevant option, and use only public product pages. For each option, capture the exact product name, listed price, whether it is a true babywearing jacket or a regular coat plus insert, whether the insert is included or sold separately, front-carry versus back-carry support if shown, maternity use if shown, weather clues like fleece/water-resistant/down, available size range if shown, and anything important from the product photos or description that affects real-world use. Then check the sizing/help or return-policy pages for the same brands when available so I can see which options are least risky to order online, and note ‘not shown’ anywhere the site doesn’t say. 
Narrow the list to the 5 strongest options for a practical buyer who cares about warmth, ease of use, and not overspending, and explain the tradeoffs between best budget, best for colder weather, best for extended maternity-to-babywearing use, and best overall. Keep the final 5 product pages open along with at least 3 relevant size-guide or return-policy pages and at least 2 pages with strong product-photo evidence, so I can review the finalists in the browser afterward. End with one organized comparison table and a clear recommendation. Make a presentation for me in CrptoPad so I can easily go through your findings.", + "level": "hard", + "reference_length": 7, + "rubrics": { + "R1": { + "requirement": "The work compares exactly 12 babywearing jacket options drawn from at least 6 brands or retailers, using public product pages and including H&M if a relevant option is available.", + "verification": "Count 12 distinct product entries in the final comparison and confirm they span at least 6 brands or retailers; verify that H&M is included if a relevant option was found, or otherwise clearly noted as unavailable/not suitable.", + "weight": 0.18 + }, + "R2": { + "requirement": "Each of the 12 options records the exact product name, listed price, whether it is a true babywearing jacket or a regular coat plus insert, whether the insert is included or separate, front-carry versus back-carry support if shown, maternity use if shown, weather clues, available size range if shown, and uses 'not shown' for missing fields.", + "verification": "Inspect the final comparison table and confirm each listed option includes all requested fields or an explicit 'not shown' entry where the site does not provide the information.", + "weight": 0.22 + }, + "R3": { + "requirement": "The browsing includes sizing/help or return-policy checks for the same brands when available, so the final comparison notes which options appear least risky to order online.", + "verification": "Verify that the 
final synthesis references sizing/help or return-policy information for the relevant brands when available, and that at least 3 such pages are kept open as requested.", + "weight": 0.14 + }, + "R4": { + "requirement": "The list is narrowed to exactly 5 strongest options for a practical buyer focused on warmth, ease of use, and not overspending, with explicit tradeoff notes for best budget, best for colder weather, best for extended maternity-to-babywearing use, and best overall.", + "verification": "Check that the final output identifies 5 finalists and assigns the requested tradeoff categories with brief supporting reasoning tied to the gathered product details.", + "weight": 0.2 + }, + "R5": { + "requirement": "Useful browser evidence is left open: the final 5 product pages, at least 3 relevant size-guide or return-policy pages, and at least 2 pages with strong product-photo evidence.", + "verification": "Confirm that the specified product, policy/help, and photo-evidence pages remain open at the end and correspond to the finalists or brands discussed.", + "weight": 0.14 + }, + "R6": { + "requirement": "The session ends with one organized comparison table plus a clear recommendation that synthesizes the market scan rather than just listing links, and a CryptoPad presentation is also created so the findings can be reviewed easily.", + "verification": "Review the final output to confirm there is a structured comparison table covering all requested fields and a concise recommendation identifying the best choice and why; an open CryptoPad Presentation contains the comparison findings and recommendation.", + "weight": 0.12 + } + }, + "task_id": "9acfcc050ae8de65ba5f5de787a47cef0589dd90", + "website": "https://www2.hm.com" + }, + { + "confirmed_task": "I’m considering making a meaningful donation or sponsorship to one specific nonprofit, but before I do that I want a real browser-based due-diligence review rather than just a quick IRS check. 
Start by finding the organization on the IRS Tax Exempt Organization Search and confirm its exact legal name, EIN if shown, deductibility and tax-exempt status, and whether anything looks inactive, revoked, or unusual. Then verify the same organization on the most relevant public state charity-registration or business-record pages you can find, and note whether it appears properly registered or if any key fields are not shown. After that, pull the most recent publicly available Form 990 or equivalent filing you can access and summarize a few concrete basics I’d care about as a donor, like recent revenue, expenses, leadership compensation if shown, and whether the filing looks current. Next, review the nonprofit’s own public website for mission clarity, leadership or board transparency, annual reports, impact claims, and contact information, and open a few of the strongest evidence pages so I can inspect them later. Then check at least 3 independent public sources such as nonprofit watchdog, charity database, or major public-profile pages to see whether they agree on status, scale, and credibility, and note any material discrepancies. Finally, compare this organization against exactly 4 similar nonprofits working in the same cause area, using public pages only, so I can see whether this is the strongest option or just one plausible option. 
Give me a final recommendation on whether I should feel comfortable supporting this nonprofit now, support one of the alternatives instead, or hold off pending more information, and keep the most useful registry, filing, watchdog, and comparison tabs open at the end.", + "level": "hard", + "reference_length": 3, + "rubrics": { + "R1": { + "requirement": "The target nonprofit is verified on the IRS Tax Exempt Organization Search, with its exact legal name plus the requested IRS status details recorded, including whether anything appears inactive, revoked, or unusual.", + "verification": "Check that the final output includes the IRS lookup result for the target organization and that at least one relevant IRS results/details page is left open as evidence.", + "weight": 0.18 + }, + "R2": { + "requirement": "The organization is also checked on the most relevant public state charity-registration or business-record pages, and the result notes whether it appears properly registered or whether key fields were not shown.", + "verification": "Check that the final output records the state-level verification result and that at least one corresponding public registry page is left open or clearly referenced as evidence.", + "weight": 0.16 + }, + "R3": { + "requirement": "The most recent publicly available Form 990 or equivalent filing is found and summarized with the requested concrete basics: recent revenue, expenses, leadership compensation if shown, and whether the filing appears current.", + "verification": "Check that the final output includes those filing-based details and that a public filing page or filing source page is left open as evidence.", + "weight": 0.18 + }, + "R4": { + "requirement": "The nonprofit’s own public website is reviewed for mission clarity, leadership or board transparency, annual reports, impact claims, and contact information, with a few of the strongest evidence pages opened for later inspection.", + "verification": "Check that the final output 
covers all requested website-review categories and that multiple relevant pages from the nonprofit’s own site remain open.", + "weight": 0.14 + }, + "R5": { + "requirement": "At least 3 independent public sources such as watchdog, charity-database, or major public-profile pages are checked, and any material discrepancies in status, scale, or credibility are noted.", + "verification": "Check that 3 or more independent public sources are summarized in the final output and that at least some of those source pages are left open as evidence.", + "weight": 0.14 + }, + "R6": { + "requirement": "The target nonprofit is compared against exactly 4 similar nonprofits in the same cause area using public pages only, so the user can judge whether it is the strongest option or just one plausible option.", + "verification": "Check that the final output includes exactly 4 peer nonprofits with a usable side-by-side comparison grounded in public pages, and that key comparison tabs remain open.", + "weight": 0.12 + }, + "R7": { + "requirement": "A final recommendation is provided on whether to support the target nonprofit now, support one of the alternatives instead, or hold off pending more information, and the most useful registry, filing, watchdog, and comparison tabs are left open at the end.", + "verification": "Check that the final answer makes one of the three requested recommendation types and that the browser is left with the most useful evidence tabs open across those categories.", + "weight": 0.08 + } + }, + "task_id": "08b3ee12dd124bd3f046bd568260110899dc8695", + "website": "https://apps.irs.gov" + }, + { + "confirmed_task": "I want help building a real family safety starter plan for my household, not just finding one generic tip. Please use public pages only and focus mostly on official or major nonprofit guidance for a U.S.-based family. 
Research exactly 6 safety categories: home fire escape, severe weather sheltering, emergency contacts and family reunification, first aid and poisoning, car-seat or child passenger safety, and basic emergency supplies. For each category, find and compare at least 2 credible guidance pages, note the most important points they clearly agree on, and call out any meaningful differences or gaps. Keep the strongest page for each of the 6 categories open in separate tabs so I can review them later, and also keep open 2 or 3 comparison tabs that were especially useful for resolving differences. After that, create one organized family safety plan with exactly 15 action items total, grouped into 3 buckets: do today, do this week, and do this month. Each action item should be specific enough that I could actually do it, and it should cite which category it came from. If a source leaves something unclear or not shown, say so instead of guessing. End with a short section naming the 6 pages you trust most and why those are the ones worth keeping open.", + "level": "hard", + "reference_length": 29, + "rubrics": { + "R1": { + "requirement": "The browsing work covers exactly 6 safety categories: home fire escape, severe weather sheltering, emergency contacts and family reunification, first aid and poisoning, car-seat or child passenger safety, and basic emergency supplies.", + "verification": "The final output includes all 6 named categories and no substitute categories, with evidence drawn from public guidance pages for each one.", + "weight": 0.16 + }, + "R2": { + "requirement": "For each of the 6 categories, at least 2 credible public guidance pages are found and compared.", + "verification": "The final output shows a comparison for every category with at least 2 sources per category, for a minimum of 12 total guidance pages, and identifies the agreements and any meaningful differences or gaps.", + "weight": 0.2 + }, + "R3": { + "requirement": "The research relies mostly on 
official or major nonprofit guidance pages appropriate for a U.S.-based family.", + "verification": "The cited pages are predominantly official government, medical, safety-organization, or major nonprofit sources, and the final summary reflects that source quality standard.", + "weight": 0.14 + }, + "R4": { + "requirement": "The strongest page for each of the 6 categories is kept open, and 2 or 3 especially useful comparison tabs are also kept open.", + "verification": "There is browser evidence that 6 category-winning tabs remain open, plus 2 or 3 comparison tabs that were used to resolve or inspect differences.", + "weight": 0.14 + }, + "R5": { + "requirement": "One organized family safety plan is produced with exactly 15 action items grouped into do today, do this week, and do this month.", + "verification": "The final plan contains exactly 15 actionable items total, clearly grouped into the 3 requested time buckets rather than a different structure.", + "weight": 0.22 + }, + "R6": { + "requirement": "Each action item is specific enough to do, is linked back to its source category, and unclear or missing details are labeled as not shown instead of guessed. The plan ends with a short section naming the 6 most trusted pages and why they were kept open.", + "verification": "Every action item includes category attribution and practical specificity, any unknowns are marked not shown, and the ending section explicitly lists the 6 trusted pages with a brief rationale for each.", + "weight": 0.14 + } + }, + "task_id": "09fe03cd3da0fac15bb28f24a627c4bbdea0e611", + "website": "https://redcap.usuhs.edu" + }, + { + "confirmed_task": "I want to figure out where I should actually go in the Grand Rapids area if I want to make art in public instead of just staying home with a craft kit, and I don’t want this to stop at one studio. 
Please include Spruced Studio, but build me a real comparison of 10 public-facing art or craft places in or near Grand Rapids that appear to offer beginner-friendly workshops, drop-in sessions, one-off classes, or other ways for regular adults to participate without already being members. Use public pages only, and for each place capture the art medium, whether it looks more like drop-in versus scheduled workshop, any visible beginner-friendly signal, the price or price range if shown, the neighborhood or city area, and the clearest attendance or registration instructions; if something is missing, mark it as not shown. Open and keep the most useful workshop or class pages for the strongest 5 options, and for at least 3 of those also open a map, photo, or location page so I can sanity-check the vibe and area. When you’re done, recommend exactly 4 places for different needs: best low-pressure first try, best date-night or group outing, best hands-on craft skill builder, and best overall option if I wanted to start doing this regularly.", + "level": "hard", + "reference_length": 4, + "rubrics": { + "R1": { + "requirement": "The final comparison covers exactly 10 public-facing art or craft places in or near Grand Rapids, and Spruced Studio is included as one of them.", + "verification": "Count 10 distinct venues in the final output and confirm Spruced Studio appears among them.", + "weight": 0.18 + }, + "R2": { + "requirement": "For each of the 10 places, the comparison records the requested fields: art medium, whether it appears to be drop-in versus scheduled workshop, any visible beginner-friendly signal, the price or price range if shown, the neighborhood or city area, and the clearest attendance or registration instructions, using 'not shown' where needed.", + "verification": "Inspect each venue entry and confirm all requested fields are present with either sourced details or an explicit 'not shown.'", + "weight": 0.22 + }, + "R3": { + "requirement": "The 10 places 
are sourced from public pages that make them look like real beginner-accessible options rather than private membership-only or advanced-only programs.", + "verification": "Check that each venue entry is supported by a public-facing page indicating workshops, classes, drop-ins, or other adult participation access.", + "weight": 0.14 + }, + "R4": { + "requirement": "The most useful workshop or class pages for the strongest 5 options are opened and kept available.", + "verification": "Confirm that 5 relevant class or workshop tabs remain open for the shortlisted options.", + "weight": 0.14 + }, + "R5": { + "requirement": "For at least 3 of those stronger options, a map, photo, or location page is also opened so the user can sanity-check the vibe and area.", + "verification": "Confirm at least 3 additional open tabs show map, photo, or location evidence corresponding to shortlisted venues.", + "weight": 0.1 + }, + "R6": { + "requirement": "The final recommendations include exactly 4 places matched to the requested use cases: best low-pressure first try, best date-night or group outing, best hands-on craft skill builder, and best overall option for doing this regularly.", + "verification": "Check that there are exactly 4 recommendation labels and each is assigned to one venue with the requested category names.", + "weight": 0.12 + }, + "R7": { + "requirement": "The recommendations are grounded in the comparison details rather than generic opinion, using the gathered information about medium, format, pricing, accessibility, or attendance flow.", + "verification": "Review the recommendation rationale and confirm it explicitly cites comparison factors gathered during browsing.", + "weight": 0.1 + } + }, + "task_id": "cc7eea277860163ddebeea9449a50b65d1890a29", + "website": "https://www.google.com" + }, + { + "confirmed_task": "I’m thinking about a future long weekend in Colorado Springs and I don’t just want a quick list of three ideas — I want to know whether the city 
actually has enough variety to build a fun, realistic plan. Please start with Springs magazine or similar local-guide coverage to discover what’s happening and what people recommend, then build me a Colorado Springs weekend playbook using only public pages. I want at least 12 total options, with exactly 3 free or very low-cost activities/resources, exactly 3 food or drink stops, exactly 3 outdoor options, and exactly 3 current or recurring local events, seasonal attractions, or neighborhood-specific things to do. For each option, verify it on an official or primary public page when possible and note the title, what category it fits, the neighborhood or area, price if shown, hours or timing if shown, and whether advance booking seems needed or not shown. Open and keep the most useful evidence tabs available as you go, including at least 2 local-guide pages, at least 4 official venue or organizer pages, and at least 3 map-based pages so I can sanity-check location clusters. After that, compare the 12 options and turn them into 2 different weekend plans: one budget-friendly plan and one more experience-focused plan, with each plan covering Friday evening, Saturday daytime, Saturday evening, and Sunday daytime. Make the plans geographically sensible instead of bouncing all over town, and mention where there are obvious backups if something appears closed or weather-dependent. Finish with a clear recommendation on whether Colorado Springs looks like a strong choice for that kind of weekend, plus the 5 strongest pages left open for me to review. Can you also make sure to plan out the dining itinerary for the weekend under these guidelines? 
Find high quality, can't miss restaurants for every meal of the trip and write them in CryptoPad Spreadsheet.", + "level": "hard", + "reference_length": 28, + "rubrics": { + "R1": { + "requirement": "The final result identifies at least 12 total Colorado Springs options with exactly 3 free or very low-cost activities/resources, exactly 3 food or drink stops, exactly 3 outdoor options, and exactly 3 current or recurring local events, seasonal attractions, or neighborhood-specific things to do.", + "verification": "Count the listed options and confirm the category totals match 3/3/3/3 exactly.", + "weight": 0.18 + }, + "R2": { + "requirement": "Each of the 12 options includes the requested recorded details: title, category, neighborhood or area, price if shown, hours or timing if shown, and whether advance booking seems needed or not shown.", + "verification": "Inspect each option entry and confirm every requested field is present, using 'not shown' where necessary.", + "weight": 0.16 + }, + "R3": { + "requirement": "Discovery starts from Springs magazine or similar local-guide coverage, and each option is verified on an official or primary public page when possible.", + "verification": "Check that local-guide pages were used for discovery and that the listed options rely on corresponding official or primary public pages where available.", + "weight": 0.14 + }, + "R4": { + "requirement": "Useful browser evidence is kept open, including at least 2 local-guide pages, at least 4 official venue or organizer pages, and at least 3 map-based pages.", + "verification": "Review the open tabs and confirm the minimum counts for local-guide, official/organizer, and map-based pages are met.", + "weight": 0.12 + }, + "R5": { + "requirement": "The findings are synthesized into 2 different weekend plans, one budget-friendly and one more experience-focused, and each plan covers Friday evening, Saturday daytime, Saturday evening, and Sunday daytime.", + "verification": "Check that 
both plans exist, are distinct, and each includes all four required time blocks.", + "weight": 0.14 + }, + "R6": { + "requirement": "The weekend plans are geographically sensible, mention obvious backups for closed or weather-dependent options, and end with a clear recommendation on whether Colorado Springs is a strong choice for that kind of weekend, with the 5 strongest pages left open for review.", + "verification": "Confirm the write-up discusses geographic clustering and backups, gives a clear recommendation, and leaves 5 strong pages open.", + "weight": 0.12 + }, + "R7": { + "requirement": "A dining itinerary is planned for every meal of the weekend using high-quality, can't-miss restaurants, and those meal recommendations are written into a CryptoPad Spreadsheet.", + "verification": "The final deliverables include a meal-by-meal restaurant plan for the weekend and an open CryptoPad Spreadsheet recording those dining recommendations.", + "weight": 0.14 + } + }, + "task_id": "2dab64ba576aeaa10453c0c642792f5ac71fb646", + "website": "https://springsmag.com" + }, + { + "confirmed_task": "I’m trying to figure out where I could realistically apply in Australia as a New Zealand-qualified lawyer with about three years of post-qualification experience, and I want a serious browser-based market sweep rather than a skim of one recruiter site. Please start with Legal People Australia, but then widen naturally to other Australian legal recruiters, major job boards, and public law-firm or in-house careers pages so we’re not missing obvious opportunities. Build me a shortlist of exactly 18 to 24 current roles that are genuinely relevant to someone around 2 to 4 PQE, and only include roles where the listing either explicitly welcomes New Zealand qualification or NZ admission, is silent but otherwise looks plausibly transferable, or clearly states Australian admission is required so that distinction is visible. 
For each shortlisted role, record the job title, employer or recruiter, city/state, practice area, stated PQE or experience range, whether NZ-qualified or NZ-admitted candidates are explicitly mentioned, whether Australian admission appears mandatory, whether a practising certificate or relocation detail is mentioned, and the application link; if a field is missing, mark it as not shown. Keep the work grounded in public pages only. Open and keep available the strongest evidence tabs for at least 8 representative roles spread across different cities or practice areas so I can sanity-check them later, including a mix of recruiter listings and direct employer pages if available. As you go, compare patterns across the market and separate the shortlist into three buckets: clearly eligible now, likely eligible but needs admission clarification, and probably not viable without current Australian admission. Finish with one organized tracker or memo that includes all 18 to 24 roles plus a concise recommendation on where I should focus first by city and practice area, and leave the finished tracker and the most useful evidence tabs open. 
I would prefer a CrpytoPad Spreadsheet for this tracker.", + "level": "hard", + "reference_length": 3, + "rubrics": { + "R1": { + "requirement": "A final organized tracker or memo is produced and includes exactly 18 to 24 current Australian legal roles relevant to a New Zealand-qualified lawyer with about 2 to 4 PQE.", + "verification": "Check that the final artifact exists, is left open, and contains between 18 and 24 distinct roles that match the stated candidate profile and Australian market scope.", + "weight": 0.2 + }, + "R2": { + "requirement": "The search starts with Legal People Australia and then widens to other Australian legal recruiters, major job boards, and public law-firm or in-house careers pages rather than staying on one site.", + "verification": "Confirm the browsing history or open tabs show Legal People plus additional public sources from at least two other source types named in the prompt, with roles drawn from that broader sweep.", + "weight": 0.15 + }, + "R3": { + "requirement": "Each shortlisted role records the requested fields: job title, employer or recruiter, city/state, practice area, stated PQE or experience range, whether NZ-qualified or NZ-admitted candidates are explicitly mentioned, whether Australian admission appears mandatory, whether a practising certificate or relocation detail is mentioned, and the application link, using 'not shown' where needed.", + "verification": "Sample the final tracker and verify that every included role has all required fields populated or marked 'not shown' exactly as requested.", + "weight": 0.2 + }, + "R4": { + "requirement": "The shortlist includes only roles that fit one of the three requested relevance conditions: explicitly welcoming NZ qualification/NZ admission, silent but plausibly transferable, or clearly requiring Australian admission so that distinction is visible.", + "verification": "Review the role notes and source pages to confirm each shortlisted listing is categorized on one of 
those stated eligibility bases rather than included without explanation.", + "weight": 0.15 + }, + "R5": { + "requirement": "At least 8 representative evidence tabs are kept available, spread across different cities or practice areas, and include a mix of recruiter listings and direct employer pages if available.", + "verification": "Inspect the remaining open tabs and confirm there are at least 8 relevant evidence pages spanning multiple cities or practice areas, with both recruiter and direct employer sources represented when available.", + "weight": 0.15 + }, + "R6": { + "requirement": "The final output separates all shortlisted roles into the three requested buckets—clearly eligible now, likely eligible but needs admission clarification, and probably not viable without current Australian admission—and ends with a concise recommendation on where to focus first by city and practice area, and the organized tracker is produced as a CryptoPad Spreadsheet.", + "verification": "Check the final tracker or memo for the three explicit buckets and a closing recommendation that prioritizes target cities and practice areas based on the gathered listings; the final deliverable includes an open CryptoPad Spreadsheet with the requested role tracker.", + "weight": 0.15 + } + }, + "task_id": "9825e1505ad6eacf0ae2af5709a74d9eb0280029", + "website": "https://www.legalpeople.com.au" + }, + { + "confirmed_task": "I want to buy one compact electric cooker for a small kitchen, but I don’t want a generic pick—I want the best realistic option for making rice several times a week and steaming vegetables in the same appliance. Please do a serious browser-based comparison of exactly 8 compact electric multicookers in roughly the 2.5- to 4-quart range, starting with Amazon but expanding to manufacturer pages and other major public retailer pages when that helps verify specs, pricing, or bundle contents. 
For every model, capture the current product name, approximate price, stated capacity, overall dimensions or footprint, whether it has a dedicated rice program or rice guidance, whether it includes a steaming rack or basket, inner-pot material if shown, dishwasher-cleaning claims if shown, and any clearly stated safety features. Then go one layer deeper on the strongest candidates by opening the manual or official product page when available so you can verify how rice cooking and vegetable steaming are actually described, not just the retailer bullet points. I also want you to check at least 3 credible third-party review or comparison sources and use them only to help evaluate real-world usability issues like rice texture, steam performance, cleaning annoyance, nonstick concerns, slow preheat, or confusing controls. At the end, narrow the field to the best 3 options for my use case, explicitly naming the best overall pick, the best budget pick, and the best easiest-to-clean pick. Keep the final 3 product pages open, plus at least 2 supporting evidence tabs such as a manual, official spec page, or review page that helped drive the decision, and give me a concise final comparison that makes clear why the winner beats the others for rice-and-vegetable cooking in a small kitchen.", + "level": "hard", + "reference_length": 9, + "rubrics": { + "R1": { + "requirement": "Exactly 8 compact electric multicookers in roughly the 2.5- to 4-quart range are compared.", + "verification": "The final comparison includes 8 distinct models, each clearly identified by product name and falling within the requested capacity band as shown on public product pages.", + "weight": 0.18 + }, + "R2": { + "requirement": "For every one of the 8 models, the comparison captures the requested product-page details: approximate price, stated capacity, dimensions or footprint, rice-program or rice-guidance status, included steaming rack or basket status, inner-pot material if shown, dishwasher-cleaning 
claims if shown, and clearly stated safety features.", + "verification": "Each of the 8 entries records those fields from retailer or official product pages, with 'not shown' used only where the page does not provide the information.", + "weight": 0.2 + }, + "R3": { + "requirement": "The strongest candidates are checked one level deeper using manuals or official product pages to verify how rice cooking and vegetable steaming are actually described.", + "verification": "Manuals and/or official spec pages are opened for the leading candidates, and the final reasoning cites those sources for rice and steaming behavior rather than relying only on retailer bullets.", + "weight": 0.17 + }, + "R4": { + "requirement": "At least 3 credible third-party review or comparison sources are checked to evaluate real-world usability issues such as rice texture, steam performance, cleaning annoyance, nonstick concerns, slow preheat, or confusing controls.", + "verification": "The browsing session includes at least 3 review/comparison sources, and the final synthesis uses them for usability tradeoffs that go beyond raw specs.", + "weight": 0.14 + }, + "R5": { + "requirement": "The field is narrowed to the best 3 options for this use case, explicitly identifying the best overall pick, the best budget pick, and the best easiest-to-clean pick.", + "verification": "The final answer presents exactly 3 shortlisted models and labels them with the three requested recommendation categories.", + "weight": 0.18 + }, + "R6": { + "requirement": "Browser evidence is left visible by keeping open the final 3 product pages plus at least 2 supporting evidence tabs such as a manual, official spec page, or review page that materially informed the decision.", + "verification": "Open tabs at the end include the 3 shortlisted product pages and at least 2 relevant support pages tied to the final recommendation.", + "weight": 0.13 + } + }, + "task_id": "5aaab80dfeee2bad47c26c6a9b706f3dbaf76ab2", + "website": 
"https://www.amazon.com" + }, + { + "confirmed_task": "I’m trying to plan a future trip from Kuala Lumpur to airport code LBJ, and I don’t just want one snapshot fare search — I want a real browser-based comparison that helps me decide when and how to book. First, confirm which airport LBJ refers to on public flight sites, then search for this route across at least 12 plausible future departure date combinations spread over multiple weeks or months so we can see whether true direct service actually exists and when it tends to be cheapest. Prioritize nonstop options whenever they are real, but if nonstop service is missing or clearly poor value on some dates, include the strongest one-stop fallback options for those same windows so I can compare the tradeoff. For each of the 12 date checks, capture the cheapest viable option, total travel time, airline, and whether it is nonstop or one-stop, and cross-check the most promising results on at least two public flight-search or airline pages rather than trusting a single source. Then narrow that down to the 6 strongest overall options and compare the important booking details on public pages: baggage allowance, refund or change flexibility if shown, and the booking source or airline page that appears most trustworthy. Keep the most useful search tabs open, including at least 3 tabs showing the best nonstop candidates and at least 2 tabs showing the best one-stop backups, so I can inspect them later. 
Finish with a concise recommendation that names the best overall option, the best cheapest nonstop option if different, and the best value fallback if I need to compromise on stops or travel time, with all prices, durations, and any missing details marked as not shown.", + "level": "hard", + "reference_length": 5, + "rubrics": { + "R1": { + "requirement": "The task confirms which airport code 'LBJ' refers to on public flight pages before doing the comparison.", + "verification": "Final output explicitly identifies the airport represented by LBJ and cites or references the public flight-search evidence used to confirm it.", + "weight": 0.12 + }, + "R2": { + "requirement": "At least 12 plausible future departure date combinations are searched for the KUL-to-LBJ route across multiple weeks or months.", + "verification": "Final comparison includes 12 distinct future date checks with recorded result details for each search window.", + "weight": 0.18 + }, + "R3": { + "requirement": "Each of the 12 date checks records the cheapest viable option, total travel time, airline, and whether it is nonstop or one-stop.", + "verification": "The final comparison table or memo contains all four required fields for all 12 date checks, using 'not shown' where needed.", + "weight": 0.18 + }, + "R4": { + "requirement": "Nonstop options are prioritized when real, but strong one-stop fallback options are also included for the same windows when nonstop service is missing or poor value.", + "verification": "The results clearly distinguish nonstop versus one-stop options and include one-stop alternatives where relevant instead of only listing nonstops.", + "weight": 0.16 + }, + "R5": { + "requirement": "The most promising results are cross-checked on at least two public flight-search or airline pages, and the 6 strongest overall options are narrowed down for closer comparison.", + "verification": "The final output identifies 6 shortlisted options and notes cross-check evidence from at least 
two public sources for the leading candidates.", + "weight": 0.16 + }, + "R6": { + "requirement": "For the 6 strongest overall options, the comparison includes baggage allowance, refund or change flexibility if shown, and the most trustworthy booking source or airline page.", + "verification": "Each of the 6 shortlisted options contains those booking-detail fields, with 'not shown' used when a public page does not provide them.", + "weight": 0.1 + }, + "R7": { + "requirement": "The most useful tabs are left open, including at least 3 best nonstop candidate tabs and at least 2 best one-stop backup tabs, and the final recommendation names the best overall option, the best cheapest nonstop option if different, and the best value fallback.", + "verification": "Open tabs visibly include the required nonstop and one-stop evidence pages, and the final written recommendation contains all three requested recommendation labels with price and duration details.", + "weight": 0.1 + } + }, + "task_id": "7f9a7d1771191e31cdb77715a4207fcd469346ca", + "website": "https://www.google.com" + }, + { + "confirmed_task": "I’m trying to make a serious Apple-only buying decision for a small three-role setup in the UK, and I want you to do the whole comparison in the browser instead of just glancing at one MacBook Pro page. Please start from Apple UK’s Mac pages and compare at least 8 real configurations drawn across the current MacBook Air, MacBook Pro, iMac, Mac mini, and Mac Studio families, using only public Apple pages. I need this organized around exactly three roles: a travel-heavy general user, a software developer, and a photo/video creator. For each role, figure out which Mac is the best fit and which lower-cost fallback is still reasonable, and note the key reasons from the official specs pages such as chip tier, memory, storage, display, ports, battery, and external-display support whenever Apple shows it. 
As you work, keep the most useful product pages open for the finalist options, including at least one MacBook Pro page and the main Mac lineup page, so I can review the evidence. After the comparison, build one draft Apple UK shopping bag with exactly 3 chosen Macs total, one for each role, and add AppleCare+ for each if publicly available on the path you use. Also check whether each chosen Mac plausibly needs any first-party accessory from Apple, like an adapter, mouse, keyboard, or external display, and add only the accessories that are clearly justified by the role; if something is not shown publicly, say not shown rather than guessing. In the end, give me a concise final buying recommendation that includes the 8 compared configurations, the 3 selected Macs, any added AppleCare+ or accessories, the estimated subtotal from the bag if visible, and the main tradeoffs you used to decide. Leave the final bag open along with the key comparison tabs. Give me the different options and viable options you drew up in a CryptoPad spreadsheet.", + "level": "hard", + "reference_length": 10, + "rubrics": { + "R1": { + "requirement": "The browsing session compares at least 8 real configurations across the current MacBook Air, MacBook Pro, iMac, Mac mini, and Mac Studio families using public Apple UK pages.", + "verification": "Final output lists 8 or more specific compared configurations and shows coverage across the named Mac families from Apple UK product/configuration pages.", + "weight": 0.18 + }, + "R2": { + "requirement": "The comparison is organized around exactly three roles: a travel-heavy general user, a software developer, and a photo/video creator, with one best-fit Mac and one lower-cost fallback identified for each role.", + "verification": "Final recommendation includes all 3 roles and, for each one, names both a best-fit option and a lower-cost fallback tied to that role.", + "weight": 0.18 + }, + "R3": { + "requirement": "For the role-based decisions, the key 
reasons are drawn from official specs pages and cover the requested categories when shown: chip tier, memory, storage, display, ports, battery, and external-display support, and the compared configurations and viable options are also captured in a CryptoPad Spreadsheet.", + "verification": "Each chosen role recommendation includes spec-based reasoning from Apple pages, and missing fields are explicitly marked as not shown rather than inferred; an open CryptoPad Spreadsheet records the compared Mac configurations and viable options.", + "weight": 0.16 + }, + "R4": { + "requirement": "The browser keeps the most useful product pages open for the finalist options, including at least one MacBook Pro page and the main Mac lineup page.", + "verification": "Open tabs at the end include the main Mac lineup page, at least one MacBook Pro page, and additional finalist product/spec pages used in the comparison.", + "weight": 0.12 + }, + "R5": { + "requirement": "A draft Apple UK shopping bag is built with exactly 3 chosen Macs total, one for each role, and AppleCare+ is added for each if publicly available on the path used.", + "verification": "The final bag shows 3 Macs corresponding to the 3 roles, and AppleCare+ appears for each where available; if not available, that is explicitly noted.", + "weight": 0.18 + }, + "R6": { + "requirement": "The session checks whether each chosen Mac plausibly needs any first-party Apple accessory and adds only clearly justified accessories, using not shown for anything unavailable publicly.", + "verification": "Final output and/or bag include only role-justified first-party accessories, with unsupported or unavailable details marked not shown instead of guessed.", + "weight": 0.08 + }, + "R7": { + "requirement": "The end result gives a concise final buying recommendation including the 8 compared configurations, the 3 selected Macs, any added AppleCare+ or accessories, the estimated subtotal from the bag if visible, and the main tradeoffs, 
with the final bag left open.", + "verification": "Final summary contains all requested elements and the bag remains open at the end as the decision endpoint.", + "weight": 0.1 + } + }, + "task_id": "58c130841b267ab28c26fc1df1a2bdfa00b3b5ba", + "website": "https://www.apple.com" + }, + { + "confirmed_task": "I need help choosing an emerald green dress in women’s size 12 as a birthday gift for my sister, and I want this done like a real shopping decision instead of just grabbing the first result. Start with PrettyLittleThing since that’s where I first looked, but please widen out to other mainstream public retailers if they have stronger options. Find 12 total candidate dresses across at least 6 different retailers, and only include items that are explicitly shown in or clearly selectable to size 12; if some useful detail like fabric or exact length is not shown, just note it as not shown instead of guessing. For each dress, compare the product photos, silhouette, occasion level, price, any visible reviews or ratings, color wording if it’s close to emerald, shipping timing if publicly shown, and the retailer’s return policy from the public site. I want you to separate the 12 candidates into three groups of exactly 4 each: best value, best for a dressier birthday dinner or night out, and safest gift choice if I’m unsure about her exact style. Keep the product pages open for the best 6 finalists, and also keep open the key return-policy or delivery-info pages for at least 3 of the retailers so I can sanity-check the buying risk. 
At the end, recommend exactly 3 dresses ranked in order, explain why each made the cut, call out any tradeoffs like final sale or slow shipping, and leave open the single best buy plus the two strongest backup product pages.", + "level": "hard", + "reference_length": 3, + "rubrics": { + "R1": { + "requirement": "Exactly 12 candidate emerald-green-or-close dresses are identified across at least 6 different retailers, and each candidate is explicitly shown in or clearly selectable to women’s size 12.", + "verification": "Count the candidate dresses and retailers used, and confirm each included product page shows size 12 availability or selection rather than an inferred fit.", + "weight": 0.2 + }, + "R2": { + "requirement": "Each of the 12 candidates is compared on the requested fields: product photos, silhouette, occasion level, price, any visible reviews or ratings, color wording, shipping timing if publicly shown, and the retailer’s public return policy, with missing fields marked not shown where needed.", + "verification": "Review the final comparison output and confirm every candidate includes all requested comparison dimensions or an explicit not shown note for unavailable details.", + "weight": 0.2 + }, + "R3": { + "requirement": "The 12 candidates are organized into exactly three groups of 4 each: best value, best for a dressier birthday dinner or night out, and safest gift choice if the sister’s exact style is uncertain.", + "verification": "Check that there are exactly 3 labeled groups and exactly 4 dresses in each group, with no missing or extra entries.", + "weight": 0.15 + }, + "R4": { + "requirement": "The browser evidence is preserved by keeping open the product pages for the best 6 finalists and the key return-policy or delivery-info pages for at least 3 retailers.", + "verification": "Inspect open tabs to confirm 6 finalist product pages remain open and that at least 3 retailer policy or delivery-information pages are also open.", + "weight": 0.15 + 
}, + "R5": { + "requirement": "A final recommendation of exactly 3 dresses is provided in ranked order, with a short explanation for why each made the cut and explicit tradeoffs such as final sale, weak reviews, limited size range, or slower shipping where applicable.", + "verification": "Check that the final recommendation contains exactly 3 ranked dresses and that each entry includes both a positive rationale and any relevant tradeoff notes.", + "weight": 0.18 + }, + "R6": { + "requirement": "The session ends with the single best buy and the two strongest backup product pages left open for direct purchase review.", + "verification": "Inspect the final open tabs and confirm that the top-ranked dress page plus the 2 backup product pages are still open.", + "weight": 0.12 + } + }, + "task_id": "304ff89d7a6a6df597aafaf720ba11f6c49214c3", + "website": "https://www.prettylittlething.com" + }, + { + "confirmed_task": "I’m trying to figure out whether I can realistically handle a failing instrument cluster on my 2012 Mercedes GLK350 myself or whether I should pay a shop, so please do a serious browser-based repair-prep session instead of just summarizing one video. Start by finding and comparing at least 7 public sources on the actual cluster-removal process for this exact model or the closest clearly compatible X204/GLK fit, including at least 2 videos, at least 2 forum or owner-discussion threads, at least 1 parts diagram or exploded-view page, and at least 1 written guide or listing that helps confirm how the cluster is mounted. From those sources, build one consolidated step-by-step removal and reinstall explanation that calls out the tool sizes or trim tools when shown, battery or airbag-related precautions if mentioned, hidden clips or fasteners, and any disagreements between sources. 
Then research replacement paths on public pages by comparing at least 5 cluster options total across used, rebuilt, and OEM-or-OE-style listings, and note the part number when shown, fitment notes, price, mileage disclosure if applicable, warranty or return details if shown, and whether the listing or surrounding sources suggest coding, VIN matching, immobilizer issues, or odometer problems. After that, look for at least 3 public repair-shop or cluster-rebuild pages that could serve as a professional fallback, and capture what each one publicly shows about Mercedes cluster service, turnaround, pricing guidance if shown, or contact limitations if pricing is not shown. Keep the most useful evidence tabs open for the best removal video, the clearest forum thread, one parts-diagram page, the 3 strongest replacement listings, and the 2 most credible professional-service pages. Finish with a practical decision memo that tells me whether the evidence supports DIY removal only, full DIY replacement, or using a pro, and include a concise risk checklist, a shopping shortlist of the best 3 replacement options, and a final recommendation based only on what the public pages actually support.", + "level": "hard", + "reference_length": 5, + "rubrics": { + "R1": { + "requirement": "At least 7 public sources are used for the removal-process research, including at least 2 videos, at least 2 forum or owner-discussion threads, at least 1 parts diagram or exploded-view page, and at least 1 written guide or listing that helps confirm mounting/removal details.", + "verification": "The browsing session and final memo show these source types and counts, with source-specific evidence visible from the pages used.", + "weight": 0.18 + }, + "R2": { + "requirement": "A single consolidated step-by-step explanation for removing and reinstalling the 2012 Mercedes GLK350 instrument cluster is produced, and it includes tools when shown, battery or airbag-related precautions if mentioned, hidden 
clips or fasteners, and any disagreements between sources.", + "verification": "The final memo contains the full synthesized procedure and explicitly notes cautions, hidden hardware, and source disagreements rather than only linking pages.", + "weight": 0.2 + }, + "R3": { + "requirement": "At least 5 replacement cluster options are compared across used, rebuilt, and OEM-or-OE-style listings, with part number when shown, fitment notes, price, mileage disclosure if applicable, warranty or return details if shown, and any coding/VIN/immobilizer/odometer notes supported by the public pages.", + "verification": "The final comparison includes 5 or more listings spanning the requested replacement categories and records the requested fields as shown or 'not shown.'", + "weight": 0.22 + }, + "R4": { + "requirement": "At least 3 public repair-shop or cluster-rebuild pages are reviewed as professional fallback options, capturing what each page shows about Mercedes cluster service, turnaround, pricing guidance if shown, or contact limitations when pricing is not shown.", + "verification": "The final memo lists 3 or more professional-service options and records the requested public-page details for each.", + "weight": 0.14 + }, + "R5": { + "requirement": "The most useful evidence tabs are kept open for the best removal video, the clearest forum thread, one parts-diagram page, the 3 strongest replacement listings, and the 2 most credible professional-service pages.", + "verification": "Open tabs at the end correspond to these exact evidence categories and quantities.", + "weight": 0.12 + }, + "R6": { + "requirement": "The session ends with a practical decision memo that recommends DIY removal only, full DIY replacement, or using a pro, and it includes a concise risk checklist plus a shopping shortlist of the best 3 replacement options.", + "verification": "The final memo contains one of the three requested recommendation outcomes, a risk checklist, and a clearly labeled top-3 
replacement shortlist tied to the browsing evidence.", + "weight": 0.14 + } + }, + "task_id": "24705fdf2d165f4f844984e62b4511396a88206d", + "website": "https://www.youtube.com" + }, + { + "confirmed_task": "I’m trying to find a pair of women’s bootcut jeans that I’d actually buy, and my requirements are annoyingly specific: long length, dark denim wash, and frayed hems. Don’t just check one store. Use the public web to build me a serious comparison across 10 to 12 viable pairs from brands that are realistically buyable online in the U.S., including Abercrombie if they have a match. For each candidate, verify from the product page whether it clearly offers a long or tall length, a dark wash, and a frayed/raw hem; if one of those details is unclear, mark it as not shown instead of guessing. While comparing them, pay close attention to the details that will actually affect whether I’d keep them: inseam or length info, rise, stretch or fabric composition, price, available sizes, and whether customer photos/reviews make the wash and hem look the way the product page claims. Also check the return policy on each brand’s public site so I can avoid something that would be a pain to send back. Narrow everything down to the best 4 finalists, and for those 4, keep the product tabs open along with the most useful size-guide or return-policy tabs. In the end, give me one clear best overall pick, one best value pick, and one safest-to-order pick, with short reasons tied to the evidence you found.", + "level": "hard", + "reference_length": 7, + "rubrics": { + "R1": { + "requirement": "The browsing session identifies 10 to 12 viable jean candidates from publicly accessible U.S. 
retail pages, including Abercrombie if it has a match.", + "verification": "Count the final candidate set and confirm each item is a distinct public product page from a buyable online retailer; confirm Abercrombie is included if a qualifying or near-qualifying candidate was found there.", + "weight": 0.18 + }, + "R2": { + "requirement": "For every candidate, the comparison explicitly checks long or tall length, dark wash, and frayed or raw hem status from the product page, using 'not shown' where the page does not clearly state one of those details.", + "verification": "Inspect the final comparison and confirm all candidates have entries for the three requested criteria with no guessed values where evidence is missing.", + "weight": 0.2 + }, + "R3": { + "requirement": "The comparison records the practical purchase details the prompt asked for for each candidate: inseam or length info, rise, stretch or fabric composition, price, available sizes, and whether customer photos or reviews support the claimed wash and hem appearance.", + "verification": "Check that each candidate includes all listed comparison fields or 'not shown' where applicable, and that review/photo observations are tied to the actual product listing context.", + "weight": 0.2 + }, + "R4": { + "requirement": "A public-site return-policy check is completed for each brand represented in the candidate set so the user can judge return friction before ordering.", + "verification": "Confirm that each brand in the comparison has a corresponding return-policy finding from a public page and that the result is incorporated into the evaluation.", + "weight": 0.12 + }, + "R5": { + "requirement": "The candidate pool is narrowed to exactly 4 finalists, and the most useful evidence tabs are left open for those finalists, including the product pages and any key size-guide or return-policy pages used to judge them.", + "verification": "Confirm there are exactly 4 finalists and that relevant finalist product tabs 
remain open, along with any supporting size-guide or return-policy tabs referenced as important evidence.", + "weight": 0.15 + }, + "R6": { + "requirement": "The final recommendation gives exactly one best overall pick, one best value pick, and one safest-to-order pick, each with short reasons grounded in the compared evidence.", + "verification": "Check that all three recommendation categories are present exactly once and that each rationale refers back to comparison evidence such as fit details, price, review support, or return policy.", + "weight": 0.15 + } + }, + "task_id": "c4b9782b4f76679169f1438e1e70addd96d7b518", + "website": "https://www.abercrombie.com" + }, + { + "confirmed_task": "I’m trying to stop wasting money on blushes that look cute online but disappear by lunch, so please help me do a real browser-based comparison focused on rosy pink shades that are meant to last through a full day. Start by checking Hey Hottie and identify the best rosy-pink blush they currently sell, then capture the exact product name and the exact text on the public page that supports a long-lasting or all-day-wear claim. After that, broaden the search one practical level up and compare Hey Hottie against 11 other rosy-pink blush options from reputable beauty brands or major beauty retailers, so I end up with 12 total products including Hey Hottie. For each of the 12, record the product name, brand, formula type, the closest rosy-pink shade name, the listed price, size if shown, the exact wear-claim text if the page gives one, and mark \"not shown\" when a field is missing. Please also use public product photos, swatches, or retailer/brand images to sanity-check whether each shade really reads as rosy pink instead of peach, mauve, or berry, and note any obvious mismatch. I want this to feel like a serious buying decision, so use a mix of official brand pages and major retailer pages where helpful, and compare review signals too when they’re publicly visible. 
Then narrow it down to the best 3 options for three different needs: best overall rosy-pink all-day pick, best budget pick, and best cream-or-dewy pick. Be explicit about whether the Hey Hottie option actually makes the final top 3 and why or why not. Keep the Hey Hottie product page open, plus the final 3 winning product pages and at least 2 useful comparison/review tabs, so I can inspect the evidence afterward.", + "level": "hard", + "reference_length": 5, + "rubrics": { + "R1": { + "requirement": "The browsing session identifies the best current Hey Hottie rosy-pink blush candidate and includes both the exact product name and the exact public-page text supporting a long-lasting or all-day-wear claim.", + "verification": "Check the final result for one Hey Hottie product name plus a verbatim supporting claim text taken from a public Hey Hottie page, and confirm the Hey Hottie product tab is left open.", + "weight": 0.18 + }, + "R2": { + "requirement": "A total of 12 rosy-pink blush products are compared, including Hey Hottie plus 11 other options from reputable beauty brands or major beauty retailers.", + "verification": "Count the compared products in the final comparison and confirm there are exactly 12 total entries with Hey Hottie included.", + "weight": 0.18 + }, + "R3": { + "requirement": "For each of the 12 products, the comparison records product name, brand, formula type, closest rosy-pink shade name, listed price, size if shown, exact wear-claim text if shown, and uses \"not shown\" where fields are missing.", + "verification": "Inspect all 12 entries and confirm each required field is present, with missing values explicitly labeled \"not shown\" rather than omitted.", + "weight": 0.2 + }, + "R4": { + "requirement": "The session uses public product photos, swatches, or retailer/brand images to assess whether each option truly reads as rosy pink, and notes any obvious mismatch with peach, mauve, or berry tones.", + "verification": "Review the comparison 
notes for all 12 products and confirm there is a shade-fit judgment for each, including mismatch notes where applicable, supported by visible product-image or swatch browsing.", + "weight": 0.14 + }, + "R5": { + "requirement": "The comparison uses a mix of official brand pages and major retailer pages where helpful, and includes publicly visible review signals when available.", + "verification": "Check that the evidence comes from more than one source type, with at least some official brand pages and some major retailer pages or retailer review evidence, and that review signals are noted when visible.", + "weight": 0.1 + }, + "R6": { + "requirement": "The final synthesis names exactly 3 winners: best overall rosy-pink all-day pick, best budget pick, and best cream-or-dewy pick, and explicitly states whether Hey Hottie makes the top 3 and why or why not.", + "verification": "Confirm the final recommendation section contains exactly 3 category winners and a direct statement about Hey Hottie’s top-3 status with reasoning.", + "weight": 0.12 + }, + "R7": { + "requirement": "The browser is left with the Hey Hottie product page open, the 3 winning product pages open, and at least 2 useful comparison or review tabs open for later inspection.", + "verification": "Inspect the open tabs at the end and confirm the Hey Hottie tab, 3 winner tabs, and at least 2 comparison/review evidence tabs remain open.", + "weight": 0.08 + } + }, + "task_id": "63658b88215cc7b85e97e36d5b8b39b47904c6b6", + "website": "https://heyhottie.co" + }, + { + "confirmed_task": "I’m trying to figure out what kind of genuinely fun, alternative adult outing I should plan in London for a future weekend, and I don’t want just three random ideas. 
Please do a real browser-based comparison of 12 to 15 London experiences that skew unusual or playful for adults, with at least 4 immersive game/story-led options, at least 4 challenge or escape-style activities, and at least 4 social or competitive experiences that still feel more interesting than a normal bar night. Use official venue pages plus maps/review pages where helpful, and for each option note the name, borough or area, the core format, typical duration, indicative price per person if public, ideal group size if shown, and anything important about booking or age restrictions; if something is missing, write \"not shown.\" As you work, keep the strongest candidate tabs open so I can inspect them later, and for at least 6 of the best options open pages with photos, maps, or reviews so there’s visible evidence of what the experience is really like and where it sits in the city. Then narrow the list to the best 6 overall and compare them more carefully for vibe, value, logistical ease, and how distinctive they feel versus touristy or generic options. Finally, recommend exactly 3 winners: one best immersive game, one best escape/challenge activity, and one best overall alternative night-out pick. Also build 2 realistic sample plans from your shortlist: one for a date night and one for a group of 4 to 6 friends, each centered on one main experience plus a nearby food or drink spot that makes geographic sense. 
Leave the most useful venue and map/review tabs open at the end so I can review the finalists.", + "level": "hard", + "reference_length": 14, + "rubrics": { + "R1": { + "requirement": "The browsing session compares 12 to 15 London experiences, including at least 4 immersive game/story-led options, at least 4 challenge or escape-style activities, and at least 4 social or competitive experiences that feel more interesting than a normal bar night.", + "verification": "Count the total experiences in the final comparison and confirm the category minimums are satisfied using the listed venues and their described formats.", + "weight": 0.2 + }, + "R2": { + "requirement": "For each compared experience, the results include the name, borough or area, core format, typical duration, indicative price per person if public, ideal group size if shown, and any booking or age-restriction notes, using \"not shown\" where needed.", + "verification": "Check every listed experience entry for all requested fields and confirm missing fields are explicitly marked \"not shown\" rather than omitted.", + "weight": 0.18 + }, + "R3": { + "requirement": "Official venue pages are used for the experiences, and for at least 6 of the best options there is visible browser evidence from pages with photos, maps, or reviews showing what the experience is like and where it is located.", + "verification": "Inspect the open tabs or navigation history to confirm official venue pages were used and that at least 6 shortlisted options have supporting photo, map, or review pages opened.", + "weight": 0.16 + }, + "R4": { + "requirement": "The work narrows the broader set to the best 6 overall and compares those finalists specifically on vibe, value, logistical ease, and distinctiveness versus touristy or generic options.", + "verification": "Review the finalist comparison section and confirm there are exactly 6 finalists with explicit commentary on all four requested decision factors.", + "weight": 0.16 + }, + 
"R5": { + "requirement": "The final recommendations name exactly 3 winners: one best immersive game, one best escape/challenge activity, and one best overall alternative night-out pick.", + "verification": "Check the conclusion for exactly three labeled winners matching the requested categories, with no missing or extra winner categories.", + "weight": 0.12 + }, + "R6": { + "requirement": "Two realistic sample plans are produced from the shortlist: one for a date night and one for a group of 4 to 6 friends, each built around one main experience plus a nearby food or drink spot that makes geographic sense.", + "verification": "Confirm there are exactly two plans, each with the correct audience type, one chosen main experience, and one nearby food or drink stop that is plausibly close based on the opened map/review evidence.", + "weight": 0.1 + }, + "R7": { + "requirement": "The most useful venue and map/review tabs for the finalists are left open so the user can inspect the strongest options afterward.", + "verification": "At the end of the session, confirm that key finalist venue tabs and supporting map/review tabs remain open rather than being fully closed out.", + "weight": 0.08 + } + }, + "task_id": "54fdb6126b926e2a48b4e45ffc1fe303e873eb7f", + "website": "https://www.inthehiddencity.com" + }, + { + "confirmed_task": "I just got serious about buying a Nintendo Switch OLED for someone who is basically new to console gaming, and I do not want to waste money on random accessories that look useful but are not actually the right starter setup. Please build me a real browser-based buying guide that starts with the official Nintendo Switch OLED pages so you can verify what is actually compatible, then compare public product pages across Amazon plus at least two other major retailers like Best Buy, Target, Walmart, or GameStop. 
I want you to cover at least 5 categories that matter for a first setup: a screen protector, carrying case, extra controller option, microSD storage, and one charging or dock-friendly power accessory; if another category looks truly important, you can include it as optional. For each of those 5 required categories, compare exactly 3 viable products, so I end up with 15 compared items total, and note the product name, price, retailer, any obvious compatibility notes, and the main reason someone would pick it over the others. After that, turn the comparison into 3 complete starter bundles for different buyers: a lean budget bundle, a balanced everyday bundle, and a travel-or-multiplayer bundle. Each bundle should include exactly 5 to 7 items total, use only products you already checked, and clearly mark which items are essential versus optional. Also include one beginner-friendly first game recommendation and whether Nintendo Switch Online seems worth adding for that bundle. Keep the most useful evidence tabs open, including the official compatibility pages and at least one live product page for each required category, and finish with one clear recommendation for which bundle I should actually buy if I want the best overall value for a new Switch OLED owner.", + "level": "hard", + "reference_length": 7, + "rubrics": { + "R1": { + "requirement": "The browsing session starts from official Nintendo Switch OLED pages and uses them to verify compatibility or setup constraints relevant to the recommended accessories.", + "verification": "Official Nintendo Switch OLED or Nintendo support pages are opened and the final comparison references compatibility or setup notes drawn from those pages.", + "weight": 0.16 + }, + "R2": { + "requirement": "Exactly 5 required categories are covered: screen protector, carrying case, extra controller option, microSD storage, and one charging or dock-friendly power accessory.", + "verification": "The final comparison is organized by those 5 
categories and includes no missing required category.", + "weight": 0.16 + }, + "R3": { + "requirement": "For each of the 5 required categories, exactly 3 viable products are compared using public product pages from Amazon plus at least two other major retailers overall.", + "verification": "The final output contains 15 compared items total, with product name, price, retailer, compatibility note if relevant, and a main reason to choose each item; retailer coverage includes Amazon and at least two of Best Buy, Target, Walmart, or GameStop.", + "weight": 0.22 + }, + "R4": { + "requirement": "The comparison is synthesized into 3 complete starter bundles: one lean budget bundle, one balanced everyday bundle, and one travel-or-multiplayer bundle.", + "verification": "All 3 named bundles appear, each contains exactly 5 to 7 items drawn only from previously compared products, and each item is marked essential or optional.", + "weight": 0.18 + }, + "R5": { + "requirement": "Each of the 3 bundles includes one beginner-friendly first game recommendation and a judgment on whether Nintendo Switch Online seems worth adding for that bundle.", + "verification": "Every bundle explicitly lists one game recommendation and states yes, no, or conditional value for Nintendo Switch Online.", + "weight": 0.12 + }, + "R6": { + "requirement": "Useful browser evidence is preserved by keeping open the official compatibility pages and at least one live product page for each required category.", + "verification": "Open tabs at the end include official Nintendo pages plus at least 5 live product pages representing the 5 required categories.", + "weight": 0.16 + } + }, + "task_id": "d7696e89cb472203c4fe131625ff8a025b86e6e7", + "website": "https://www.amazon.com" + }, + { + "confirmed_task": "I’m trying to book a perfume bar or custom scent experience for a future wedding, and I don’t want just one random lead—I want a real shortlist I could confidently contact for a destination event anywhere in 
the world. Please use public web pages to find 8 to 10 perfume-bar, fragrance-bar, or scent-activation vendors that appear to do weddings, private events, or luxury event activations, and for each one verify whether they explicitly mention international travel, destination events, global service, or leave that as not shown if you can’t confirm it. Compare them on the details that would actually matter to me: whether they look wedding-appropriate, what kind of guest experience they offer, whether they seem able to handle on-site personalization, what regions they mention serving, how polished the setup looks from photos, and exactly how I would inquire or book them. Keep the strongest 4 vendor pages open, including at least 2 that clearly suggest international or destination capability and at least 1 that looks especially strong for weddings. Then give me a final ranked shortlist of the best 5 options, explain which single vendor you’d contact first for a destination wedding and why, and include the booking or contact method for each finalist.", + "level": "hard", + "reference_length": 9, + "rubrics": { + "R1": { + "requirement": "A total of 8 to 10 perfume-bar, fragrance-bar, or scent-activation vendors are identified from public pages, and each appears to serve weddings, private events, or luxury event activations.", + "verification": "Check that the final comparison includes 8 to 10 distinct vendors and that each entry cites or reflects a public page showing relevant event-service fit.", + "weight": 0.18 + }, + "R2": { + "requirement": "For every vendor, the result states whether the vendor explicitly mentions international travel, destination events, global service, or 'not shown' when that cannot be confirmed.", + "verification": "Review each vendor entry for a destination/international-service field populated with either confirmed evidence or 'not shown' rather than omission.", + "weight": 0.18 + }, + "R3": { + "requirement": "The comparison covers the 
decision factors explicitly requested for each vendor: wedding appropriateness, guest experience offered, apparent ability to handle on-site personalization, regions served, setup polish from photos, and inquiry or booking method.", + "verification": "Inspect the vendor comparison and confirm that each requested factor is addressed for all reviewed vendors, using 'not shown' where needed.", + "weight": 0.2 + }, + "R4": { + "requirement": "The strongest 4 vendor pages are left open, including at least 2 that clearly suggest international or destination capability and at least 1 that looks especially strong for weddings.", + "verification": "Check the open browser tabs at the end and confirm there are 4 relevant vendor pages matching the requested mix.", + "weight": 0.14 + }, + "R5": { + "requirement": "A final ranked shortlist of the best 5 options is provided.", + "verification": "Confirm that exactly 5 finalists are ranked in order in the final output.", + "weight": 0.12 + }, + "R6": { + "requirement": "The final answer identifies which single vendor should be contacted first for a destination wedding and explains why.", + "verification": "Check that one vendor is explicitly named as the top outreach choice and that a reasoned explanation is given.", + "weight": 0.1 + }, + "R7": { + "requirement": "Each of the 5 finalists includes a booking or contact method.", + "verification": "Review the finalist entries and confirm that each contains a concrete inquiry path such as contact form, email, phone, DM, or booking page.", + "weight": 0.08 + } + }, + "task_id": "a5d88b699a2524de24e157d9269d0c42a070edab", + "website": "https://www.google.com" + }, + { + "confirmed_task": "I want to stop randomly clicking tech videos and instead figure out which YouTube reviewers are actually worth following for my future gadget research. 
Please use public YouTube pages to build me a serious shortlist of 10 tech-related videos drawn from at least 6 different channels that seem most relevant to a shopper who cares about product reviews and first-impression videos for mainstream consumer tech like phones, laptops, audio gear, wearables, or similar gadgets. For each video, include the exact title, channel name, product category, how recent it is, and a short 2–3 sentence summary of what the reviewer actually says. While doing that, compare the channels on things like whether they show real hands-on testing, whether the video feels more like a quick impression or a deeper review, whether pros and cons are clearly stated, and whether the presentation style seems useful for someone trying to decide what to buy. Keep at least 4 of the strongest video tabs open and at least 3 channel pages open so I can review the evidence myself. At the end, recommend the best 3 channels for me to follow going forward, explain what each is best for, and identify the best 5 videos from your set if I only have time to watch a few. 
Prepare a presentation in CryptoPad presentations of a rundown on your recommendations and analysis.", + "level": "hard", + "reference_length": 12, + "rubrics": { + "R1": { + "requirement": "The final result includes exactly 10 tech-related YouTube videos drawn from at least 6 different channels.", + "verification": "Count the videos and confirm the channel names show coverage from 6 or more distinct YouTube channels.", + "weight": 0.2 + }, + "R2": { + "requirement": "Each of the 10 videos includes the exact title, channel name, product category, how recent it is, and a 2–3 sentence summary.", + "verification": "Check every listed video entry for all requested fields and confirm the summary length is 2–3 sentences.", + "weight": 0.18 + }, + "R3": { + "requirement": "The selected videos stay relevant to product reviews and first-impression coverage for mainstream consumer tech such as phones, laptops, audio gear, wearables, or similar gadgets.", + "verification": "Review the 10 chosen videos and confirm they fit the requested review/impression focus and product-category scope.", + "weight": 0.14 + }, + "R4": { + "requirement": "The channels are compared on whether they show real hands-on testing, whether each video feels like a quick impression or a deeper review, whether pros and cons are clearly stated, and whether the presentation style seems useful for buying decisions.", + "verification": "Inspect the comparison writeup and confirm all four requested comparison dimensions are explicitly addressed for the shortlisted channels or videos.", + "weight": 0.18 + }, + "R5": { + "requirement": "At least 4 of the strongest video tabs are kept open and at least 3 channel pages are kept open as visible evidence.", + "verification": "Check the open browser tabs at the end and confirm there are 4 or more video pages plus 3 or more channel pages left open.", + "weight": 0.12 + }, + "R6": { + "requirement": "The final recommendation names the best 3 channels to follow going 
forward and explains what each is best for.", + "verification": "Confirm there are exactly 3 recommended channels and that each has a clear explanation of its best use case.", + "weight": 0.1 + }, + "R7": { + "requirement": "The final recommendation also identifies the best 5 videos from the set for someone who only has time to watch a few, and a CryptoPad Presentation is also prepared as a rundown of the recommendations and analysis.", + "verification": "Check that exactly 5 videos are singled out from the 10-video set as the top viewing priorities; an open CryptoPad Presentation contains the channel and video recommendation rundown.", + "weight": 0.08 + } + }, + "task_id": "265abf0668773a8295847d5f5abf3bd003cbe3a3", + "website": "https://www.youtube.com" + }, + { + "confirmed_task": "I want to set up my browser for smarter online shopping, but I do not want to install a bunch of sketchy coupon extensions blindly. Please use the Chrome Web Store Shopping category as the starting point and build me a serious comparison of 10 Chrome extensions that help with price comparison, coupon finding, cashback, or price-history tracking. For each one, open the Chrome Web Store listing and then verify it on the extension’s official public site or help page if available, so we can compare what it actually claims to do. Record for all 10: extension name, Chrome Web Store listing page, star rating, review count, whether it focuses on coupons, price comparison, cashback, or price history, whether an account is required if that is clearly stated, any notable permissions or data-access warnings shown publicly, and the official site or support page if shown; if something is missing, write not shown. Then narrow that set to the best 4 options for a privacy-conscious shopper in the U.S. who wants real savings without installing redundant tools, and explain the role each of the 4 would play so I know whether they overlap or complement each other. 
Keep the most useful tabs open at the end: the Chrome Web Store listings for the final 4, at least 2 official privacy or help pages that were important to the decision, and 2 runner-up tabs that were strong but not selected, so I can review the evidence myself. Can you create a CryptoPad Document with your rankings - and links for each of the extensions to relevant reviews or online sentiment for each?", + "level": "hard", + "reference_length": 5, + "rubrics": { + "R1": { + "requirement": "A comparison set of exactly 10 Chrome extensions in the Shopping category is assembled, starting from the Chrome Web Store, and each candidate is relevant to price comparison, coupon finding, cashback, or price-history tracking.", + "verification": "The final output lists 10 distinct extensions with Chrome Web Store listing pages and shopping-related functions that match the requested categories.", + "weight": 0.14 + }, + "R2": { + "requirement": "For each of the 10 extensions, the required fields are recorded: extension name, Chrome Web Store listing page, star rating, review count, primary shopping function, whether an account is required if clearly stated, any notable permissions or data-access warnings shown publicly, and the official site or support page if shown, using 'not shown' where needed.", + "verification": "Each of the 10 entries contains all requested fields or explicitly says 'not shown' for missing information.", + "weight": 0.18 + }, + "R3": { + "requirement": "Each Chrome Web Store listing is cross-checked against the extension’s official public site or help page when available, rather than relying only on the store listing.", + "verification": "Official public pages or support/help pages are cited when available, and the comparison reflects claims verified beyond the Web Store listing.", + "weight": 0.12 + }, + "R4": { + "requirement": "The final material includes a relevant public review, reputation, or online-sentiment link for each extension so the ranking 
is not anchored only to official pages and store metadata.", + "verification": "Each extension entry includes at least one public review, reputation, or sentiment link that can be opened and inspected separately from the official site and Web Store page.", + "weight": 0.14 + }, + "R5": { + "requirement": "The analysis narrows the 10 extensions to exactly 4 best options for a privacy-conscious shopper in the U.S. who wants real savings without installing redundant tools, and explains the role each of the 4 would play, including overlap versus complementarity.", + "verification": "Exactly 4 finalists are clearly identified, and each finalist includes a role explanation covering both usefulness and overlap/complementarity.", + "weight": 0.18 + }, + "R6": { + "requirement": "Useful browser evidence is left open at the end: the Chrome Web Store listings for the final 4, at least 2 official privacy or help pages that mattered to the decision, and 2 runner-up tabs that were strong but not selected.", + "verification": "The kept-open tabs match the requested counts and types: 4 finalist store listings, 2 official privacy/help pages, and 2 runner-up tabs.", + "weight": 0.12 + }, + "R7": { + "requirement": "A CryptoPad Document is created with the rankings and links for each extension, including the relevant review or online-sentiment links.", + "verification": "The open CryptoPad Document contains the extension rankings and includes the requested review/sentiment links for each listed extension.", + "weight": 0.12 + } + }, + "task_id": "d03a08650b62f90b2de5e4bcbb98a50653100e1e", + "website": "https://chromewebstore.google.com" + }, + { + "confirmed_task": "I’m trying to get a genuinely useful picture of how jazz connects to community service, not just a vague summary, because I may want to volunteer, donate, or suggest a partner organization later. Please use public web pages to research 10 to 12 jazz-focused nonprofits or community programs in the U.S. 
that clearly use jazz as part of education, outreach, health, social support, youth development, or other community-serving work. For each organization, confirm at least 3 concrete ways it serves the community from its own site or other strong public pages, and note whether there is a visible way for an outsider to get involved through volunteering, donating, attending, or partnering; if something is not shown, say not shown. As you work, keep the strongest evidence tabs open for at least 6 organizations, including a mix of program, impact, and volunteer or support pages, so I can review them later. Then synthesize what you found into a comparison that groups the organizations by service model, points out the most common and the most distinctive community-service approaches, and recommends the 5 strongest organizations for someone who specifically wants hands-on community impact through jazz. In the final write-up, include a ranked top 5 with a short reason for each choice and make clear which organization looks best for volunteering, which looks best for youth education impact, and which looks best for broader community outreach.", + "level": "hard", + "reference_length": 5, + "rubrics": { + "R1": { + "requirement": "The browsing session identifies 10 to 12 jazz-focused U.S. 
nonprofits or community programs that clearly use jazz in community-serving work.", + "verification": "Count the organizations in the final comparison and confirm each is jazz-focused and community-serving based on public pages reviewed during the session.", + "weight": 0.18 + }, + "R2": { + "requirement": "For each organization, the final write-up records at least 3 concrete ways it serves the community, drawn from its own site or other strong public pages.", + "verification": "Check that every listed organization has 3 or more specific service activities or program types described in the final output, with enough detail to distinguish them from generic mission language.", + "weight": 0.2 + }, + "R3": { + "requirement": "For each organization, the final output states whether there is a visible way for an outsider to get involved through volunteering, donating, attending, or partnering, and uses 'not shown' when that is missing.", + "verification": "Review each organization entry and confirm that an engagement pathway or an explicit 'not shown' note is included.", + "weight": 0.14 + }, + "R4": { + "requirement": "At least 6 strong evidence tabs are kept open, covering a mix of program, impact, and volunteer or support pages for different organizations.", + "verification": "Inspect the open tabs at the end and confirm there are at least 6 relevant public pages left open, with the required mix of page types across multiple organizations.", + "weight": 0.14 + }, + "R5": { + "requirement": "The final synthesis groups the organizations by service model and identifies both the most common and the most distinctive ways jazz is being used for community service.", + "verification": "Check that the final write-up includes explicit grouping by service model and clearly labeled observations about common patterns and distinctive approaches.", + "weight": 0.14 + }, + "R6": { + "requirement": "The final output includes a ranked top 5 recommendation for someone seeking hands-on 
community impact through jazz, with a short reason for each choice.", + "verification": "Confirm there are exactly 5 ranked recommendations and that each has a brief rationale tied to the comparison findings.", + "weight": 0.12 + }, + "R7": { + "requirement": "The final recommendation explicitly identifies which organization looks best for volunteering, which looks best for youth education impact, and which looks best for broader community outreach.", + "verification": "Check that these three labeled best-fit conclusions are present and correspond to organizations already researched in the session.", + "weight": 0.08 + } + }, + "task_id": "a931f7baacfd7f1bcea8409bb8b3d84383734680", + "website": "https://www.google.com" + }, + { + "confirmed_task": "I’m seriously considering a Honda Civic Hybrid for my next car, but I don’t want just one review pulled out of context. Please do a thorough browser-based comparison that helps me decide whether the Civic Hybrid is actually the best fit in the compact-hybrid segment. Start by finding at least 6 professional reviews of the Honda Civic Hybrid from reputable automotive publications, and for each one capture the model year reviewed plus the main pros, cons, and overall takeaway. Then verify the current official Honda Civic Hybrid trims, key specs, fuel-economy figures, and notable feature differences on Honda’s public pages. After that, compare the Civic Hybrid against 3 realistic rivals in the same general class—such as the Toyota Prius, Toyota Corolla Hybrid, and Hyundai Elantra Hybrid, or close substitutes if one of those is not comparable enough—using official manufacturer pages plus professional reviews. I want the comparison to cover starting price, mpg or equivalent efficiency, horsepower, cargo space, standout tech or comfort features, and whether reviewers describe it as more fun-to-drive, more practical, or more value-oriented. 
Also check public safety sources like IIHS and NHTSA if available, and include what is shown there rather than guessing. Keep the most useful evidence tabs open as you go, including at least 3 Honda-related tabs, at least 3 rival-model tabs, and at least 2 review or safety tabs that best support the final recommendation. Finish with a concise buyer-oriented decision memo that tells me whether the Civic Hybrid is the best overall pick, the best fun-to-drive pick, or not the best choice for value, and clearly note any important fields as not shown if the public pages don’t provide them. Prepare a presentation in CryptoPad that makes it fun to read through and read your recommendations.", + "level": "hard", + "reference_length": 3, + "rubrics": { + "R1": { + "requirement": "At least 6 professional reviews of the Honda Civic Hybrid are gathered from reputable automotive publications, and each includes the reviewed model year plus main pros, cons, and overall takeaway.", + "verification": "The final output lists 6 or more distinct professional review sources and records the requested review details for each one.", + "weight": 0.2 + }, + "R2": { + "requirement": "The current official Honda Civic Hybrid trims, key specs, fuel-economy figures, and notable feature differences are verified on Honda public pages.", + "verification": "The final output includes trim-level Honda information sourced from official public Honda pages, with specs, efficiency, and feature differences clearly summarized.", + "weight": 0.16 + }, + "R3": { + "requirement": "The Honda Civic Hybrid is compared against 3 realistic rivals in the same general class using official manufacturer pages plus professional reviews.", + "verification": "Exactly 3 rival models are included, and each has comparison details drawn from both official model pages and review-based evidence.", + "weight": 0.18 + }, + "R4": { + "requirement": "The comparison covers starting price, mpg or equivalent efficiency, horsepower, 
cargo space, standout tech or comfort features, and whether reviewers describe each model as more fun-to-drive, more practical, or more value-oriented.", + "verification": "For the Civic Hybrid and all 3 rivals, the final comparison explicitly includes each requested category or marks missing items as not shown.", + "weight": 0.18 + }, + "R5": { + "requirement": "Public safety sources like IIHS and NHTSA are checked if available, and the results are included without guessing.", + "verification": "The final output reports what IIHS and/or NHTSA show for the relevant vehicles when available, and uses 'not shown' or equivalent where information is unavailable.", + "weight": 0.1 + }, + "R6": { + "requirement": "The most useful evidence tabs are kept open, including at least 3 Honda-related tabs, at least 3 rival-model tabs, and at least 2 review or safety tabs that best support the final recommendation.", + "verification": "Open tabs at the end include the required minimum counts across Honda, rival, and review/safety evidence pages.", + "weight": 0.08 + }, + "R7": { + "requirement": "A concise buyer-oriented decision memo is produced stating whether the Civic Hybrid is the best overall pick, the best fun-to-drive pick, or not the best choice for value, with important missing fields marked as not shown, and a CryptoPad Presentation is also created to make the buyer recommendation easy to review.", + "verification": "The final memo makes an explicit recommendation using the requested framing and clearly flags any unavailable public-page fields as not shown; an open CryptoPad Presentation presents the compact-hybrid comparison and recommendation.", + "weight": 0.1 + } + }, + "task_id": "e8748499c5ee313e307ba6819d4978cd061d8445", + "website": "https://www.idehonda.com" + }, + { + "confirmed_task": "I’m trying to figure out whether a Honda Civic Hatchback Hybrid would actually work for me as a winter daily driver in a place that gets regular snow, slush, and cold mornings, 
not just whether one reviewer liked it. Please do a serious browser-based investigation using public pages only. Start by finding at least 8 credible pieces of evidence about this car’s snowy-weather behavior, including at least 3 video reviews or tests, at least 2 owner-reported winter experiences from public forums or communities, and at least 2 official or manufacturer-style sources such as specs, manuals, or feature pages that help explain traction, drive modes, tires, clearance, or cold-weather limitations. Summarize what each source actually says about snow starts, traction, braking, hill driving, stability, cabin warm-up, visibility, and any cautions or missing information, and keep the strongest evidence tabs open. Then compare the Civic Hatchback Hybrid against exactly 3 realistic alternatives for the same kind of buyer, preferably other compact hatchback or sedan hybrids or similarly efficient daily drivers that someone might cross-shop for winter use. For each of the 4 total vehicles, capture drivetrain, approximate ground-clearance context if publicly shown, winter-relevant features, and whether the evidence suggests it is a confident snow choice, an acceptable-with-good-winter-tires choice, or a poor fit. Use tire retailer or fitment pages only as supporting evidence to check whether common winter-tire sizing appears straightforward for the Civic, and note any obvious constraints. 
Finish with a concise decision memo that tells me whether the Honda Civic Hatchback Hybrid looks viable for a future snowy-climate commute, what the biggest winter caveats are, which of the 3 alternatives looks strongest if I want more confidence in snow, and leave the most useful comparison and evidence tabs open so I can review them myself.", + "level": "hard", + "reference_length": 4, + "rubrics": { + "R1": { + "requirement": "At least 8 credible pieces of evidence are gathered about the Honda Civic Hatchback Hybrid's snowy-weather behavior, including at least 3 video reviews or tests, at least 2 owner-reported winter experiences, and at least 2 official or manufacturer-style sources.", + "verification": "The final output lists 8 or more distinct public sources and their categories, and the open browser tabs include representative examples from each required source type.", + "weight": 0.2 + }, + "R2": { + "requirement": "Each source is summarized for what it says about snow starts, traction, braking, hill driving, stability, cabin warm-up, visibility, and any cautions or missing information.", + "verification": "The final memo contains per-source summaries covering these winter-use dimensions when available, and explicitly marks 'not shown' or similar where a source does not address a category.", + "weight": 0.18 + }, + "R3": { + "requirement": "The Honda Civic Hatchback Hybrid is compared against exactly 3 realistic alternatives for the same buyer, for 4 total vehicles.", + "verification": "The final comparison includes exactly 4 vehicles total and clearly identifies the 3 alternatives as cross-shopped options relevant to efficient winter daily driving.", + "weight": 0.16 + }, + "R4": { + "requirement": "For each of the 4 total vehicles, the comparison captures drivetrain, approximate ground-clearance context if publicly shown, winter-relevant features, and whether the evidence suggests it is a confident snow choice, an acceptable-with-good-winter-tires choice, 
or a poor fit.", + "verification": "The final comparison table or memo includes all requested fields for each vehicle, using 'not shown' where public pages do not provide a field.", + "weight": 0.17 + }, + "R5": { + "requirement": "Tire retailer or fitment pages are used only as supporting evidence to check whether common winter-tire sizing appears straightforward for the Civic, with any obvious constraints noted.", + "verification": "The final output includes a Civic-specific winter-tire fitment note based on public fitment pages and does not expand the task into unrelated tire shopping beyond the requested support role.", + "weight": 0.11 + }, + "R6": { + "requirement": "The session ends with a concise decision memo stating whether the Honda Civic Hatchback Hybrid looks viable for a future snowy-climate commute, the biggest winter caveats, and which of the 3 alternatives looks strongest for more confidence in snow.", + "verification": "The final memo contains an explicit viability judgment for the Civic, names the key caveats, and identifies one strongest alternative with a brief reason tied to the gathered evidence.", + "weight": 0.1 + }, + "R7": { + "requirement": "The most useful comparison and evidence tabs are left open for review.", + "verification": "Open tabs at the end include the strongest Civic snow evidence sources and the key comparison pages used for the alternatives.", + "weight": 0.08 + } + }, + "task_id": "e1184e98c9a0c78c6170d6c740bdf30b8dd11442", + "website": "https://www.youtube.com" + }, + { + "confirmed_task": "I’m trying to figure out whether the Kia K5 GT should actually make my next test-drive shortlist, not just watch one random review. Start on YouTube and find 10 recent, substantive video reviews or comparison videos that cover the Kia K5 GT, and record for each one the video title, channel name, approximate viewpoint of the reviewer, and whether the review is mainly positive, mixed, or negative. 
Keep the 4 most useful review tabs open, including at least one straight review and at least one head-to-head comparison. Then open Kia’s official K5 GT pages and verify the current core facts I’d care about as a shopper: engine/performance details, transmission, notable standard features, warranty, and starting price or price range if shown. After that, compare the K5 GT against exactly 4 realistic sporty sedan alternatives on public pages from official manufacturer sites, using models that are actually plausible cross-shops for someone considering a quick midsize or near-midsize sport sedan. For those 5 cars total, compare price, horsepower, drivetrain, key comfort or tech features, and anything clearly positioned as performance-oriented. Next, check public safety and ownership-risk signals for the Kia K5 GT using sources like IIHS, NHTSA, recall pages, and owner-review or reliability pages if available, and note anything that looks like a meaningful concern or a reassuring sign; if a field is not shown, say not shown. Finally, give me a ranked recommendation with exactly 3 outcomes: best value pick, best performance-leaning pick, and best all-around daily-driver pick. Explain where the Kia K5 GT lands, what kind of buyer it fits best, and whether I should prioritize it for a test drive now or focus on one of the alternatives instead. 
Leave the key review, official spec, and safety/evidence pages open so I can inspect them afterward.", + "level": "hard", + "reference_length": 3, + "rubrics": { + "R1": { + "requirement": "The browsing session identifies exactly 10 recent, substantive YouTube video reviews or comparison videos covering the Kia K5 GT, and for each one records the video title, channel name, approximate reviewer viewpoint, and whether the review is positive, mixed, or negative.", + "verification": "Check that the final output lists 10 videos with all four requested fields and that they are clearly about the Kia K5 GT.", + "weight": 0.18 + }, + "R2": { + "requirement": "Exactly 4 of the most useful review tabs are kept open, including at least 1 straight Kia K5 GT review and at least 1 head-to-head comparison video.", + "verification": "Inspect open tabs or end-state browser evidence to confirm 4 relevant YouTube tabs remain open with the required mix.", + "weight": 0.14 + }, + "R3": { + "requirement": "Official Kia K5 GT pages are opened and used to verify the requested shopper-facing facts: engine or performance details, transmission, notable standard features, warranty, and starting price or price range if shown.", + "verification": "Check that official Kia pages are open or cited in the final synthesis and that each requested fact is reported, with 'not shown' used where needed.", + "weight": 0.16 + }, + "R4": { + "requirement": "The Kia K5 GT is compared against exactly 4 realistic sporty sedan alternatives using public official manufacturer pages, and the comparison covers all 5 cars total on price, horsepower, drivetrain, key comfort or tech features, and performance-oriented positioning.", + "verification": "Review the final comparison to confirm there are 5 total cars, 4 alternatives, and all requested fields are present from public official pages.", + "weight": 0.2 + }, + "R5": { + "requirement": "Public safety and ownership-risk signals for the Kia K5 GT are checked using 
sources such as IIHS, NHTSA, recall pages, and owner-review or reliability pages if available, with meaningful concerns or reassuring signs noted and 'not shown' used for missing fields.", + "verification": "Confirm the final output includes findings from the requested categories of public sources and notes concrete concerns, reassuring signs, or 'not shown' where appropriate.", + "weight": 0.14 + }, + "R6": { + "requirement": "The final recommendation provides exactly 3 outcomes—best value pick, best performance-leaning pick, and best all-around daily-driver pick—and explicitly explains where the Kia K5 GT lands, what buyer it fits best, and whether it should be prioritized for a test drive now.", + "verification": "Check that the conclusion includes exactly the 3 requested outcome labels and directly addresses Kia K5 GT fit and test-drive priority.", + "weight": 0.1 + }, + "R7": { + "requirement": "Key evidence pages are left open at the end, including useful review tabs plus official spec and safety/evidence pages that support the recommendation.", + "verification": "Inspect the final browser state to confirm that relevant YouTube, official Kia, and safety or ownership-risk pages remain open for later review.", + "weight": 0.08 + } + }, + "task_id": "03328a94fec2dce938ded3959bdb6ea292c22186", + "website": "https://www.youtube.com" + }, + { + "confirmed_task": "I want to make one genuinely good winter purchase instead of impulse-buying the first heated blanket I see. Please do a serious browser comparison of 12 current electric heated blanket options across major public retailer pages and manufacturer pages where available, focusing on products that are realistically usable for bed warmth in winter in the U.S. Include a mix of at least 4 full-size bed blankets, at least 4 throws, and at least 4 larger or premium options for shared use or colder sleepers. 
For each option, record the product name, retailer, current listed price, size, material, number of heat settings, auto shutoff details, machine-washability, controller style, safety certification or compliance language if shown, warranty length if shown, and shipping or delivery info if it is prominently visible; use 'not shown' when a field is missing. Open the actual listing pages for all candidates, and for at least 6 of them also open either the manufacturer product page, care/manual page, or another public page that helps verify the safety or care details, so the comparison is not based on retailer copy alone. While comparing, look through photos and review sections closely enough to capture recurring positives and negatives for each product, especially anything about uneven heating, reliability, shedding, thinness, controller annoyance, or washing problems. Then narrow the 12 down to a final shortlist of 5 with a clear recommendation for best overall, best value, best throw, best for couples or larger beds, and best premium pick. 
End with one concise buying memo that explains which single blanket I should buy and why, and keep the 5 finalist product tabs plus the 2 most useful verification tabs open so I can review them.", + "level": "hard", + "reference_length": 4, + "rubrics": { + "R1": { + "requirement": "Exactly 12 current heated blanket candidates are compared from public pages, including at least 4 full-size bed blankets, at least 4 throws, and at least 4 larger or premium options for shared use or colder sleepers.", + "verification": "The final comparison includes 12 distinct products and clearly identifies their type/category so the required 4/4/4 coverage can be checked from the recorded entries and open product tabs.", + "weight": 0.18 + }, + "R2": { + "requirement": "For each of the 12 options, the comparison records the product name, retailer, current listed price, size, material, number of heat settings, auto shutoff details, machine-washability, controller style, safety certification or compliance language if shown, warranty length if shown, and shipping or delivery info if prominently visible, using 'not shown' when needed.", + "verification": "Each of the 12 products has all requested fields filled in or explicitly marked 'not shown' in the final comparison.", + "weight": 0.22 + }, + "R3": { + "requirement": "All 12 candidates have their actual retailer listing pages opened, and at least 6 of them also have a manufacturer product page, care/manual page, or other public verification page opened to confirm safety or care details.", + "verification": "Open tabs show the 12 retailer listings and at least 6 additional verification pages tied to specific candidates.", + "weight": 0.17 + }, + "R4": { + "requirement": "The comparison captures recurring review and photo-based positives and negatives for each product, with attention to uneven heating, reliability, shedding, thinness, controller annoyance, and washing problems when those themes appear.", + "verification": "Each 
candidate includes brief synthesized pros/cons or review-theme notes drawn from its public listing and supporting pages.", + "weight": 0.14 + }, + "R5": { + "requirement": "The 12 products are narrowed to a final shortlist of exactly 5, and the shortlist includes a named recommendation for best overall, best value, best throw, best for couples or larger beds, and best premium pick.", + "verification": "The final output presents exactly 5 finalists and assigns each of the five requested recommendation labels.", + "weight": 0.14 + }, + "R6": { + "requirement": "A concise final buying memo recommends one single blanket to buy and explains why it wins over the other finalists.", + "verification": "The final memo explicitly names one product as the buy recommendation and gives a comparative rationale grounded in the recorded features, verification details, and review themes.", + "weight": 0.09 + }, + "R7": { + "requirement": "The 5 finalist product tabs and the 2 most useful verification tabs are left open at the end for review.", + "verification": "The final browser state shows 7 kept-open tabs: 5 finalist product pages and 2 verification pages.", + "weight": 0.06 + } + }, + "task_id": "b6bbdc8e0388a5a1e256f561a8f0b7d92c6e772c", + "website": "https://www.amazon.com" + }, + { + "confirmed_task": "I want help making a real purchase decision on a warm, military-style winter jacket, not just grabbing the first tactical-looking one I see. Start with Viktos, but widen out to other credible brands that sell the same general kind of jacket on public product pages. Compare exactly 12 jackets total from at least 6 brands, with Viktos definitely included, and focus on options that visibly read as military or tactical rather than fashion-only parkas. 
For each jacket, capture the product name, listed price, insulation or warmth information if shown, shell or weather-protection details, available colors, whether common tactical colors like black, ranger green, coyote, olive, or gray are offered, and whether size Large is shown or not shown. Also check the sizing chart or fit notes where available, plus the return policy or warranty page for each brand, because I do not want to get stuck with an expensive bad fit. Keep the browsing practical and visual: open the strongest product pages in separate tabs, compare photos closely so the styling stays in the para-military lane, and leave the best 4 jacket pages open at the end along with 2 useful sizing or return-policy pages. Then give me a final ranked shortlist of 5 jackets with a clear winner, a best value pick, and a best severe-cold pick, explaining the tradeoffs in warmth, style, price, and purchase risk. Create a tracker of all the options and their prices, along with their links in CryptoPad Spreadsheets.", + "level": "hard", + "reference_length": 3, + "rubrics": { + "R1": { + "requirement": "Exactly 12 jackets are compared from at least 6 brands, and Viktos is included among those brands.", + "verification": "The final comparison explicitly lists 12 distinct jacket product pages and shows brand names demonstrating coverage of at least 6 brands including Viktos.", + "weight": 0.18 + }, + "R2": { + "requirement": "Each of the 12 jackets includes the requested recorded details: product name, listed price, insulation or warmth information if shown, shell or weather-protection details, available colors, whether black/ranger green/coyote/olive/gray are offered, and whether size Large is shown or not shown.", + "verification": "The final output contains one entry per jacket with all requested fields populated or marked not shown where unavailable.", + "weight": 0.24 + }, + "R3": { + "requirement": "Sizing and purchase-risk checks are completed by reviewing sizing 
chart or fit notes where available and the return policy or warranty page for each brand.", + "verification": "The final output includes fit or sizing-chart notes for the compared jackets where available and a return-policy or warranty note for each represented brand.", + "weight": 0.16 + }, + "R4": { + "requirement": "The comparison stays focused on military or tactical-looking winter jackets rather than drifting into generic fashion outerwear.", + "verification": "The selected jackets' product pages and the final reasoning clearly indicate tactical or military-style positioning and visual styling fit the requested para-military lane.", + "weight": 0.12 + }, + "R5": { + "requirement": "The browsing keeps the work visual and evidence-based by opening the strongest product pages in separate tabs and comparing product photos closely.", + "verification": "At least the strongest candidate product pages remain open at the end, and the final notes reference visual/photo-based style comparisons drawn from those public pages.", + "weight": 0.1 + }, + "R6": { + "requirement": "The best 4 jacket product pages are left open at the end, along with 2 useful sizing or return-policy pages.", + "verification": "The final browser state includes 4 open product tabs for the top jackets and 2 open sizing or return-policy tabs relevant to the purchase decision.", + "weight": 0.1 + }, + "R7": { + "requirement": "A final ranked shortlist of 5 jackets is produced, including a clear overall winner, a best value pick, and a best severe-cold pick, with tradeoffs explained in warmth, style, price, and purchase risk, and a CryptoPad Spreadsheet tracker of all options, prices, and links is also created.", + "verification": "The final response contains a ranked top-5 shortlist and explicitly labels the overall winner, best value, and best severe-cold option with rationale covering the requested tradeoffs; an open CryptoPad Spreadsheet records the jacket options, prices, and links.", + "weight": 
0.1 + } + }, + "task_id": "7c0efe6388e3b7745a4db7e755b4e9b0a7b23508", + "website": "https://viktos.com" + }, + { + "confirmed_task": "I already have a show on YouTube and want to turn it into a real podcast setup, with Spotify as a priority but not necessarily by blindly using Spotify’s own hosting if another path is better. Please use public pages only to compare exactly 8 realistic podcast hosting or distribution options for a YouTube-first creator, including Spotify for Creators and 7 other major platforms you judge to be good fits. For each option, check the official site and record whether I can create a new show or import an existing one, whether distribution to Spotify and other podcast apps is supported, whether video podcasts are supported, the publicly shown free tier or lowest paid price, what public analytics or monetization features are advertised, and any obvious limitation for repurposing an existing YouTube show into podcast episodes; if a field is not shown, say 'not shown.' While doing this, keep the official pricing or features pages open for the 4 strongest options so I can compare them later. Then go through Spotify for Creators on its public pages far enough to confirm the current onboarding path for adding a podcast, including the live entry point that corresponds to 'Add your podcast' or its current equivalent, and also find Spotify’s public guidance about creating a new show versus importing one by RSS. 
Finally, give me a clear recommendation for the best low-cost path if my goal is to repurpose an existing YouTube show into audio now without boxing myself in later, explain why the top 2 options beat the rest, and leave the key Spotify and top-alternative tabs open.", + "level": "hard", + "reference_length": 18, + "rubrics": { + "R1": { + "requirement": "Exactly 8 realistic podcast hosting or distribution options are compared, including Spotify for Creators and 7 other major platforms judged to fit a YouTube-first creator.", + "verification": "The final comparison includes 8 distinct options total, one of which is Spotify for Creators, with no missing platform entries.", + "weight": 0.18 + }, + "R2": { + "requirement": "For each of the 8 options, the comparison records whether a new show can be created or an existing one imported, whether distribution to Spotify and other podcast apps is supported, whether video podcasts are supported, the public free tier or lowest paid price, public analytics or monetization features, and any obvious limitation for repurposing an existing YouTube show, using 'not shown' where needed.", + "verification": "Each platform entry contains all requested fields, and missing public information is explicitly marked as 'not shown' rather than omitted.", + "weight": 0.27 + }, + "R3": { + "requirement": "The official pricing or features pages for the 4 strongest options are kept open so the user can compare them later.", + "verification": "There is browser evidence that 4 official pricing/features tabs corresponding to the selected strongest options remain open at the end.", + "weight": 0.12 + }, + "R4": { + "requirement": "Spotify for Creators is checked on public pages far enough to confirm the current onboarding path for adding a podcast, including the live entry point matching 'Add your podcast' or its current equivalent.", + "verification": "The final result describes the current Spotify onboarding path and shows browser evidence from 
the relevant public Spotify for Creators page(s) that the add-podcast entry point was reached or verified.", + "weight": 0.16 + }, + "R5": { + "requirement": "Spotify’s public guidance about creating a new show versus importing one by RSS is found and incorporated.", + "verification": "The final answer includes both Spotify paths—new show creation and RSS import—based on public Spotify guidance, with the supporting Spotify guidance page(s) open or clearly referenced in the browsing evidence.", + "weight": 0.12 + }, + "R6": { + "requirement": "A clear final recommendation is given for the best low-cost path to repurpose an existing YouTube show into audio without boxing the creator in later, and it explains why the top 2 options beat the rest.", + "verification": "The conclusion names one best option and one runner-up, ties both directly to the stated low-cost and future-flexibility goal, and compares them against the other options rather than giving a generic summary.", + "weight": 0.15 + } + }, + "task_id": "4f1f73202b21d2f30cec9737538f1184f119d5ec", + "website": "https://creators.spotify.com" + }, + { + "confirmed_task": "I’m putting together a serious background brief on ongoing U.S. health system problems, and I don’t want a quick two-paragraph summary. Please use public sources to identify 5 ongoing national health system problems that are still being actively discussed or measured—things like affordability, insurance access, maternal health, mental health access, chronic disease burden, provider shortages, or similar issues if the evidence is stronger. For each of the 5 problems, compare how it affects at least 4 demographic groups differently, choosing from groups such as race/ethnicity, income level, age, sex, rural versus urban residents, disability status, or insurance status depending on what the sources actually show. 
Use high-quality public sources like CDC, CMS, NIH, AHRQ, KFF, Commonwealth Fund, and major peer-reviewed or academic-health sources when needed, and prefer pages with charts, maps, tables, or survey results that make the differences visible. Open and keep the strongest evidence tabs available for each problem, including at least one chart- or data-heavy page for every problem if public evidence exists. Then create one organized briefing document in CryptoPad that has exactly 5 sections, one per problem, and for each section include: a plain-language description of the problem, why it is ongoing, at least 2 supporting sources, at least 1 concrete way the burden differs across at least 4 demographic groups, and a short note on any important caveat like outdated data, differing definitions, or missing subgroup detail. End the briefing with a cross-cutting comparison section that names which 2 problems seem most unequal across demographic lines based on the evidence you found and explains why. Leave the finished briefing open, and also leave open the most useful evidence tabs so I can review the charts and source pages myself. Generate a CryptoPad Presentation for this information as well, so I can present on it.", + "level": "hard", + "reference_length": 10, + "rubrics": { + "R1": { + "requirement": "A finished briefing document is created and left open, with exactly 5 sections covering 5 ongoing national U.S. 
health system problems plus a final cross-cutting comparison section, and the finished briefing is created in CryptoPad and left open.", + "verification": "Confirm the open final document contains 5 distinct problem sections and one ending comparison section, with no extra or missing problem sections; the open final deliverable is a CryptoPad briefing document with the requested sections.", + "weight": 0.18 + }, + "R2": { + "requirement": "Each of the 5 problem sections includes a plain-language description of the problem and an explanation of why it is ongoing.", + "verification": "Review each section of the briefing document for both elements: a description and a why-it-is-ongoing explanation.", + "weight": 0.14 + }, + "R3": { + "requirement": "For each of the 5 problems, the briefing uses at least 2 supporting public sources from high-quality health or policy sources such as CDC, CMS, NIH, AHRQ, KFF, Commonwealth Fund, or comparable academic-health sources.", + "verification": "Check that each section cites at least 2 public sources and that the cited sources are from the requested source types.", + "weight": 0.18 + }, + "R4": { + "requirement": "For each of the 5 problems, the briefing describes at least 1 concrete way the burden differs across at least 4 demographic groups, using groups actually supported by the source evidence.", + "verification": "Inspect each section for a disparity comparison involving at least 4 demographic groups and confirm the comparison is tied to cited evidence.", + "weight": 0.2 + }, + "R5": { + "requirement": "The browsing session keeps the strongest evidence tabs available for each problem, including at least 1 chart-, map-, table-, or data-heavy public page for every problem when such evidence exists.", + "verification": "Check that useful evidence tabs remain open across the 5 problems and that each problem has an open visual or data-heavy source page when available.", + "weight": 0.14 + }, + "R6": { + "requirement": "Each of 
the 5 problem sections includes a short caveat note covering an important limitation such as outdated data, differing definitions, or missing subgroup detail.", + "verification": "Review the final document and confirm that every problem section has a caveat note of the requested type.", + "weight": 0.08 + }, + "R7": { + "requirement": "The final cross-cutting comparison section identifies which 2 of the 5 problems appear most unequal across demographic lines based on the gathered evidence and explains why, and a CryptoPad Presentation is also created for presenting the findings.", + "verification": "Check the ending section of the briefing document for a clear selection of 2 problems and an evidence-based explanation for that judgment; an open CryptoPad Presentation summarizing the five problems and cross-cutting conclusions is available.", + "weight": 0.08 + } + }, + "task_id": "6e19b9c15e44d2971216e6cd3eee212e33df6586", + "website": "https://www.google.com" + }, + { + "confirmed_task": "I’m helping shape a future women’s cancer-equity outreach project, and I don’t want a generic summary — I need a browser-based evidence sweep that helps me decide which inequities are most important to focus on first. Please use public pages from authoritative sources such as NCI, CDC, ACS, NIH, major cancer centers, and peer-reviewed or professional society sources, and build me one organized briefing document that I could actually use. I want you to identify at least 12 distinct drivers of cancer inequities affecting women, but also group them by where they show up in the care pathway: prevention or risk exposure, screening and early detection, diagnosis, treatment access or quality, and survivorship or follow-up. 
To keep this grounded, compare at least 4 cancer areas that are especially relevant for women — for example breast, cervical, ovarian, and endometrial or another clearly justified substitution if the evidence is stronger — and note which drivers appear across multiple cancers versus which seem more specific to one cancer type. I also want at least 6 concrete intervention or program examples from reputable public sources that aim to reduce these inequities, such as navigation programs, screening access efforts, trial-inclusion initiatives, rural access models, or culturally tailored outreach, and for each one note what inequity it is trying to address and whether any outcome or evaluation is publicly described. As you work, keep the most important evidence tabs open, including at least 6 key source pages that directly support the final priorities, and leave open at least 2 public data or dashboard pages that help show the burden or disparity patterns. In the final briefing, end with a ranked top-5 priority list for where a women’s cancer-equity project should focus first, and make each priority include the main driver, the affected care stage, the cancer areas it shows up in, the strongest supporting sources, and a short explanation of why it belongs near the top.", + "level": "hard", + "reference_length": 4, + "rubrics": { + "R1": { + "requirement": "An organized briefing document is produced using public authoritative sources and is structured around the requested care-pathway stages: prevention or risk exposure, screening and early detection, diagnosis, treatment access or quality, and survivorship or follow-up.", + "verification": "Check that the final document exists, is organized by the five requested stages, and cites public sources from authoritative organizations or peer-reviewed/professional sources.", + "weight": 0.16 + }, + "R2": { + "requirement": "The briefing identifies at least 12 distinct drivers of cancer inequities affecting women and groups them 
into the requested care-pathway categories.", + "verification": "Count the listed drivers in the document and confirm they total 12 or more, with each assigned to one of the specified care-pathway stages.", + "weight": 0.18 + }, + "R3": { + "requirement": "At least 4 cancer areas especially relevant for women are compared, including breast, cervical, ovarian, and endometrial unless a clearly justified substitution is explicitly explained, and the document distinguishes cross-cutting drivers from cancer-specific ones.", + "verification": "Check that four cancer areas are covered, that any substitution is justified in the document, and that the write-up explicitly notes which drivers recur across multiple cancers versus which are more specific.", + "weight": 0.18 + }, + "R4": { + "requirement": "The briefing includes at least 6 concrete intervention or program examples from reputable public sources, and each example notes the inequity addressed plus whether any public outcome or evaluation is described.", + "verification": "Count the intervention examples in the document and confirm there are at least six, each with the targeted inequity and an outcome/evaluation note or an explicit indication that it is not shown.", + "weight": 0.16 + }, + "R5": { + "requirement": "The browser session keeps open at least 6 key source pages that directly support the final priorities and at least 2 public data or dashboard pages showing burden or disparity patterns.", + "verification": "Inspect the open tabs at the end and confirm there are at least six substantive evidence tabs plus two data/dashboard tabs relevant to the final briefing.", + "weight": 0.12 + }, + "R6": { + "requirement": "The final briefing ends with a ranked top-5 priority list for where a women’s cancer-equity project should focus first, and each priority includes the main driver, affected care stage, cancer areas involved, strongest supporting sources, and a short why-it-ranks-high explanation.", + "verification": 
"Check that exactly five ranked priorities are present and that each contains all five requested elements.", + "weight": 0.2 + } + }, + "task_id": "d17c0e2905a166df99fa492edc8f7cc5a641de4e", + "website": "https://www.google.com" + }, + { + "confirmed_task": "I’m seriously considering becoming a personal trainer, but I don’t want a shallow summary of one NASM page. Please build me a real browser-based decision guide for choosing the best certification path to start this career. Start with NASM, ACE, ISSA, ACSM, and NSCA, and use each organization’s public official pages to capture the core CPT option, minimum eligibility requirements, study format, exam or test details, whether CPR/AED is required, renewal cycle, continuing-education expectations, and the current public price or ‘not shown’ if it isn’t listed. For NASM specifically, go deeper and compare at least 3 distinct CPT package options side by side so I can tell what extra support or materials I’d actually be paying for. Then pressure-test the market by checking at least 24 current public personal-trainer job listings across 6 major U.S. metro areas, and record which certifications employers explicitly accept or prefer, plus whether they mention CPR/AED, experience, or specialties. Keep the key official certification pages open for all 5 organizations, and also keep open at least 6 representative job-listing tabs that show the employer demand evidence. Finally, put everything into one organized comparison sheet or document with one section for the 5 certification bodies, one section for the NASM package comparison, one section summarizing the 24 job listings, and a final recommendation for the best option in each of these 3 scenarios: cheapest credible path, best-supported beginner path, and strongest choice for broad employer recognition. 
Leave the finished comparison open at the end along with the most useful evidence tabs.", + "level": "hard", + "reference_length": 23, + "rubrics": { + "R1": { + "requirement": "A finished comparison sheet or document is created and left open, with separate sections for the 5 certification bodies, the NASM package comparison, the 24 job listings, and the final 3-scenario recommendation.", + "verification": "Check that the final artifact is open and visibly organized into the four requested sections, and that it contains entries for all required comparisons and recommendations.", + "weight": 0.18 + }, + "R2": { + "requirement": "The task compares NASM, ACE, ISSA, ACSM, and NSCA using official public pages, capturing each organization’s CPT option, minimum eligibility requirements, study format, exam or test details, CPR/AED requirement status, renewal cycle, continuing-education expectations, and current public price or 'not shown.'", + "verification": "Review the final artifact and the open provider tabs to confirm all 5 organizations are covered and each requested field is present from official public pages.", + "weight": 0.22 + }, + "R3": { + "requirement": "NASM is examined in extra depth with a side-by-side comparison of at least 3 distinct CPT package options, showing what added support or materials each package includes.", + "verification": "Confirm the artifact contains at least 3 NASM package entries with package-specific inclusions, and that relevant NASM tabs are open as evidence.", + "weight": 0.15 + }, + "R4": { + "requirement": "At least 24 current public personal-trainer job listings across 6 major U.S. 
metro areas are checked, and the results record which certifications employers explicitly accept or prefer, plus whether CPR/AED, experience, or specialties are mentioned.", + "verification": "Count the recorded job listings and metro areas in the final artifact, and confirm the requested employer-demand fields are captured for the listings reviewed.", + "weight": 0.2 + }, + "R5": { + "requirement": "Key browser evidence is preserved by leaving open the official certification pages for all 5 organizations and at least 6 representative job-listing tabs showing employer demand.", + "verification": "Inspect the open tabs at the end to confirm that all 5 official provider pages and at least 6 job-listing pages remain open.", + "weight": 0.1 + }, + "R6": { + "requirement": "The final recommendation explicitly names the best option for each of these 3 scenarios: cheapest credible path, best-supported beginner path, and strongest choice for broad employer recognition.", + "verification": "Check the recommendation section of the final artifact for 3 distinct scenario-based picks, each tied back to the comparison and job-listing evidence.", + "weight": 0.15 + } + }, + "task_id": "1f90184fb61690b3d2f0350196418d726a850a7d", + "website": "https://pages-delivery.nasm.org" + }, + { + "confirmed_task": "I want to choose an AI note-taking service I could actually subscribe to without regretting it, and I do not want ChatGPT or Grok. Please do a real browser-based comparison of 10 to 12 AI note-taking tools that are publicly available and plausibly suitable for a beginner, including meeting-note tools, voice-note tools, and general AI note apps if they clearly support note capture and summarization. 
For each one, use official public pages first to record the lowest paid plan price, whether there is a free tier or trial, what the cancellation flow seems to be from public billing/help pages, supported platforms, and the clearest beginner-onboarding evidence you can find from the product site or official app listing. Also check public founder or leadership pages and note whether the company appears to be woman-founded or woman-led; if that is not clearly shown, say not shown rather than guessing. Keep at least 6 of the strongest evidence tabs open at the end, including a mix of pricing, cancellation/help, and product/demo pages from the finalists. Put the results into one organized comparison sheet or document with one entry per tool, then rank your top 5 overall for a beginner on a budget, call out the best cheapest option, the best easiest-to-cancel option, and the best woman-founded or woman-led option if any qualifying candidate is clearly supported by public evidence. End with a short recommendation for which one I should try first and why, and a spreadhseet of the different options and their pricing.", + "level": "hard", + "reference_length": 4, + "rubrics": { + "R1": { + "requirement": "An organized comparison sheet or document exists with one entry per tool for 10 to 12 publicly available AI note-taking tools, excluding ChatGPT and Grok.", + "verification": "Check that the final artifact is open and contains 10 to 12 distinct tools, each relevant to AI note-taking, with ChatGPT and Grok excluded.", + "weight": 0.18 + }, + "R2": { + "requirement": "For each tool, the artifact records the lowest paid plan price, whether there is a free tier or trial, supported platforms, and the clearest beginner-onboarding evidence from an official product page or official app listing.", + "verification": "Inspect entries in the artifact for all required fields and confirm they are populated from official public sources or marked clearly when unavailable.", + "weight": 
0.18 + }, + "R3": { + "requirement": "For each tool, the artifact records what the cancellation flow seems to be from public billing, subscription, or help pages, using official sources where possible.", + "verification": "Check that each tool has a cancellation or subscription-management note tied to public policy/help content, with unclear cases marked clearly instead of inferred.", + "weight": 0.16 + }, + "R4": { + "requirement": "For each tool, the artifact notes whether the company appears to be woman-founded or woman-led from public founder or leadership pages, and uses 'not shown' when that is not clearly supported.", + "verification": "Review the ownership or leadership field for every tool and confirm it is either supported by public evidence or explicitly marked 'not shown' without speculation.", + "weight": 0.14 + }, + "R5": { + "requirement": "At least 6 strong evidence tabs are left open at the end, including a mix of pricing pages, cancellation/help pages, and product/demo or official app-listing pages from the finalists.", + "verification": "Confirm that at least 6 relevant tabs remain open and that they visibly cover the requested evidence types across finalist products.", + "weight": 0.14 + }, + "R6": { + "requirement": "The final output ranks the top 5 overall tools for a beginner on a budget and explicitly identifies the best cheapest option, the best easiest-to-cancel option, and the best woman-founded or woman-led option if any qualifying candidate is clearly supported, and a spreadsheet of the compared options and their pricing is also produced.", + "verification": "Check the final ranked section for exactly these recommendation categories and ensure each is directly traceable to the comparison findings; the deliverables include a spreadsheet listing the tools and their pricing details.", + "weight": 0.12 + }, + "R7": { + "requirement": "The task ends with a short recommendation of which tool to try first and why.", + "verification": "Confirm 
there is a concise final recommendation naming one tool to try first with a reason grounded in the comparison.", + "weight": 0.08 + } + }, + "task_id": "d6007c19e6419c9eefdd57996fc151a2263b22fa", + "website": "https://34.170.30.232" + }, + { + "confirmed_task": "I’m trying to decide whether the adidas Everyset Training Shoes for Men on Sierra are actually the best buy for me in men’s size 12, or just one decent option in a crowded field. Please start on Sierra, confirm the live product page for the adidas Everyset, and note the current price, any strike-through/original price if shown, available colors, and whether size 12 is in stock. Then build me a serious comparison set of exactly 10 men’s training or cross-training shoes in size 12 total, including the Sierra adidas pair plus 9 comparable options from public product pages on Sierra and other major retailers or brand sites. For each shoe, capture the product name, retailer, current price, original price if shown, whether size 12 is available, intended use cues from the page, and one or two practical notes from product details or review summaries such as stability, cushioning, versatility, or durability; if any field is missing, write not shown. I also want you to sanity-check the buying experience, so for each retailer represented in your comparison, open the public return-policy or shipping-information page and note the key basics that would matter to a normal buyer, like return window, obvious return fees if stated, and any free-shipping threshold if publicly shown. After that, compare the 10 shoes and narrow them to a final shortlist of 4 that cover different priorities: best overall gym shoe, best value, best for heavier lifting/stability, and best for mixed cardio-and-weights use. Keep the most useful product tabs open for those 4 finalists, plus the Sierra adidas Everyset page and the key policy pages you used, so I can review the evidence myself. 
End with a concise recommendation telling me whether I should buy the Sierra adidas Everyset now or pick one of the alternatives instead, and why.", + "level": "hard", + "reference_length": 7, + "rubrics": { + "R1": { + "requirement": "The browsing session confirms the live Sierra product page for the adidas Everyset Training Shoes for Men and records its current price, original/strike-through price if shown, available colors, and whether men's size 12 is in stock.", + "verification": "Check that the final output includes these Sierra-specific details and that the Sierra adidas product page is left open as requested.", + "weight": 0.16 + }, + "R2": { + "requirement": "The comparison set contains exactly 10 men's training or cross-training shoes in size 12 total, including the Sierra adidas pair plus 9 comparable options from public product pages on Sierra and/or other major retailers or brand sites.", + "verification": "Count the shoes in the final comparison and confirm there are exactly 10 total with the Sierra adidas included and all sourced from public product pages.", + "weight": 0.22 + }, + "R3": { + "requirement": "For each of the 10 shoes, the final comparison records product name, retailer, current price, original price if shown, whether size 12 is available, intended-use cues from the page, and one or two practical notes from product details or review summaries, using 'not shown' where needed.", + "verification": "Review all 10 entries and confirm each required field is present or explicitly marked 'not shown' when unavailable.", + "weight": 0.22 + }, + "R4": { + "requirement": "For each retailer represented in the comparison, a public return-policy or shipping-information page is opened and the final output notes the buyer-relevant basics: return window, obvious return fees if stated, and any free-shipping threshold if publicly shown.", + "verification": "Match the retailers in the shoe comparison to corresponding policy pages and confirm the requested 
policy basics are summarized for each represented retailer.", + "weight": 0.14 + }, + "R5": { + "requirement": "The final output narrows the 10 shoes to a shortlist of exactly 4 finalists covering the four requested priorities: best overall gym shoe, best value, best for heavier lifting/stability, and best for mixed cardio-and-weights use.", + "verification": "Check that there are exactly 4 finalists and that each of the four named priority categories is filled by one finalist.", + "weight": 0.14 + }, + "R6": { + "requirement": "The most useful product tabs are left open for the 4 finalists, along with the Sierra adidas Everyset page and the key policy pages used, and the session ends with a concise buy-now recommendation on whether to choose the Sierra adidas Everyset or an alternative.", + "verification": "Confirm the requested evidence tabs remain open and that the final recommendation explicitly answers whether to buy the Sierra adidas Everyset now or choose another option instead, with a brief why.", + "weight": 0.12 + } + }, + "task_id": "8e358ff165d3ade4cabe8a64cc95b6058a9aa107", + "website": "https://www.sierra.com" + }, + { + "confirmed_task": "I’m planning a future climate-controlled shed that I might use as a backyard office or workshop, and I don’t want a shallow 'watch two videos and guess' answer. Please do a real browser-based comparison of insulation approaches and help me choose one assembly I could actually build. Start by reviewing at least 8 strong public sources total: at least 3 YouTube videos that show or explain shed or small outbuilding insulation, at least 3 non-video sources from building-science, manufacturer, or energy-efficiency guidance pages, and at least 2 retail product pages with current specs or pricing. 
Use those sources to compare exactly 3 full insulation strategies for a climate-controlled shed—for example some mix of fiberglass or mineral wool batts, rigid foam board, and spray foam—covering walls, roof/ceiling, and floor, not just one cavity type. For each strategy, note the stated or implied R-value approach, how moisture or vapor control is handled, how much interior space it consumes, the likely DIY difficulty, and the main failure risks or watch-outs; if a source does not show something, write 'not shown.' Then build one practical recommendation for a common small shed size like 10x12, including a suggested wall assembly, roof/ceiling assembly, and floor assembly that work together for a climate-controlled setup. After that, use public retail pages to price out a rough materials basket for each of the 3 strategies as realistically as you can from visible package sizes or unit pricing, and compare the total estimated material cost with 'not shown' where needed. Keep the most convincing evidence tabs open for the final winner, including at least one video tab, one technical/spec page, and one or two retail product pages, so I can sanity-check the recommendation visually afterward. 
Finish with a concise decision memo in Cryptopad Documents that names the best overall option, the best budget option, and the option you would avoid unless a special condition makes it worthwhile.", + "level": "hard", + "reference_length": 17, + "rubrics": { + "R1": { + "requirement": "The browsing session uses at least 8 public sources total, including at least 3 YouTube videos, at least 3 non-video building-science/manufacturer/energy-efficiency pages, and at least 2 retail product pages.", + "verification": "Count the sources actually used in the final synthesis and confirm they meet the requested source-type minimums.", + "weight": 0.16 + }, + "R2": { + "requirement": "Exactly 3 full insulation strategies are compared, and each strategy covers walls, roof/ceiling, and floor rather than discussing only one part of the shed.", + "verification": "Check that the final comparison has three and only three strategies, with wall, roof/ceiling, and floor treatment described for each one.", + "weight": 0.18 + }, + "R3": { + "requirement": "For each of the 3 strategies, the comparison records the R-value approach, moisture or vapor-control approach, interior space impact, DIY difficulty, and main failure risks or watch-outs, using 'not shown' where a source does not provide a field.", + "verification": "Review each strategy entry and confirm all requested fields are present, with 'not shown' used when information is missing.", + "weight": 0.18 + }, + "R4": { + "requirement": "A single recommended climate-controlled shed assembly for a 10x12 shed is produced, including a wall assembly, roof/ceiling assembly, and floor assembly that are presented as one coherent system.", + "verification": "Confirm the final recommendation names one overall winner and specifies all three assembly parts for the 10x12 shed.", + "weight": 0.18 + }, + "R5": { + "requirement": "A rough materials basket and estimated material cost is produced for each of the 3 strategies using visible public 
retail specs or pricing, with 'not shown' for anything unavailable.", + "verification": "Check that each strategy has a materials/pricing section grounded in retail product pages and that missing values are marked 'not shown' rather than invented.", + "weight": 0.14 + }, + "R6": { + "requirement": "The final output includes a concise decision memo naming the best overall option, the best budget option, and the option to avoid unless a special condition makes it worthwhile, and the concise decision memo is written in CryptoPad Documents.", + "verification": "Confirm the memo includes all three requested judgments and that each is directly tied to the comparison findings; the final decision memo is present as an open CryptoPad Document.", + "weight": 0.08 + }, + "R7": { + "requirement": "Useful browser evidence is left open for the winning approach, including at least one video tab, one technical/spec page, and one or two retail product pages.", + "verification": "Inspect the remaining open tabs and confirm they include the requested mix of evidence pages relevant to the recommended winner.", + "weight": 0.08 + } + }, + "task_id": "ddfec1f140fd9fb3611e241c87c220ab77985b01", + "website": "https://www.youtube.com" + }, + { + "confirmed_task": "I’m trying to choose a bar-catering company for a future private event in the Minneapolis area, and Steady Pour is one of the vendors I want seriously evaluated rather than just looked up in isolation. Please start with Steady Pour’s public site, pull every bar-catering or home-bar service option you can find, and record the listed pricing if it’s published; if a price is missing, mark it clearly as not shown instead of guessing. Then broaden this into a real comparison by finding 7 other Minneapolis-area mobile bar or bar-catering companies with public pages, so we end up with exactly 8 total vendors including Steady Pour. 
For each vendor, use only public pages to capture the core offering types, whether alcohol is included or BYO, staffing or bartender details if shown, minimum guest count or event minimum if shown, service area, pricing or starting price if shown, and any obvious extras like mixers, glassware, mocktails, coffee cart, trailer bar, or dry hire options; use not shown wherever the site does not publish something. As you work, keep the strongest evidence visible by opening the main service or pricing page for all 8 vendors, and also open photo, gallery, or social-proof pages for at least 4 of them so I can visually compare presentation quality. Compare the vendors for two realistic use cases: a budget-conscious casual party and a more polished wedding-style event, and tell me which vendors look strongest for each case based only on what their public pages support. At the end, give me one organized comparison with all 8 vendors, call out where Steady Pour stands on pricing transparency and service fit versus the others, recommend a top 3 shortlist, and leave the most useful service/pricing tabs open for Steady Pour plus the two best alternatives.", + "level": "hard", + "reference_length": 7, + "rubrics": { + "R1": { + "requirement": "The final comparison covers exactly 8 total Minneapolis-area bar-catering vendors, including Steady Pour plus 7 other vendors with public pages.", + "verification": "Check the final output for exactly 8 distinct vendors and confirm Steady Pour is included alongside 7 additional Minneapolis-area mobile bar or bar-catering companies.", + "weight": 0.16 + }, + "R2": { + "requirement": "Steady Pour’s public site is used to capture every bar-catering or home-bar service option found, with listed pricing recorded when published and 'not shown' used when pricing is missing.", + "verification": "Review the Steady Pour entry and supporting open tab(s) to confirm all relevant service options found on the site are summarized and each has either listed 
pricing or an explicit 'not shown' note.", + "weight": 0.17 + }, + "R3": { + "requirement": "For each of the 8 vendors, the comparison records the requested public-page fields: core offering types, whether alcohol is included or BYO, staffing or bartender details if shown, minimum guest count or event minimum if shown, service area, pricing or starting price if shown, and notable extras, with 'not shown' used for missing fields.", + "verification": "Inspect the final comparison row-by-row or vendor-by-vendor to confirm every requested field is present for all 8 vendors and that missing information is marked 'not shown' rather than inferred.", + "weight": 0.24 + }, + "R4": { + "requirement": "The browser session keeps open the main service or pricing page for all 8 vendors, plus photo, gallery, or social-proof pages for at least 4 of them so presentation quality can be visually compared.", + "verification": "Confirm that open tabs include one primary service/pricing page for each of the 8 vendors and at least 4 additional tabs showing gallery, photos, reviews, or similar visual/social-proof pages.", + "weight": 0.14 + }, + "R5": { + "requirement": "The final synthesis explicitly compares the 8 vendors for two use cases: a budget-conscious casual party and a polished wedding-style event.", + "verification": "Check that both use cases are addressed separately and that the comparison explains which vendors fit each scenario based on evidence from public pages.", + "weight": 0.11 + }, + "R6": { + "requirement": "The final recommendation identifies a top 3 shortlist and explicitly states where Steady Pour stands on pricing transparency and service fit versus the other vendors.", + "verification": "Review the conclusion to confirm a ranked or clearly named top 3 exists and that Steady Pour is directly positioned against competitors on transparency and fit.", + "weight": 0.1 + }, + "R7": { + "requirement": "The most useful tabs are left open for Steady Pour and the two 
best alternative vendors, centered on their service or pricing pages.", + "verification": "Confirm that the ending browser state retains the key service/pricing tabs for Steady Pour and the two recommended alternative vendors.", + "weight": 0.08 + } + }, + "task_id": "fb2f8bea3fa9528a581ce9e46bcc552c93e186a6", + "website": "https://www.steadypour.com" + }, + { + "confirmed_task": "I’m trying to figure out whether Netflix alone can carry a really good mystery-TV run for me over the next few months, not just hand me a random list of 5 shows. Please start on Netflix and identify at least 20 mystery series that are actually available there, using Netflix’s browse/search/title pages as the backbone of the sweep. Then compare them on the public web so I can make a real decision: for each title, capture the basic hook, whether it’s more detective / crime mystery / thriller / supernatural mystery / puzzle-box, whether it looks finished or still ongoing if that is publicly shown, and the rough viewing commitment in episodes or seasons when available. I also want quality and fit, so cross-check each title with public review or metadata pages and note an IMDb score, a Rotten Tomatoes score, or ‘not shown’ if one of those isn’t available. From that larger pool, narrow it to exactly 10 strongest options for me, making sure the final 10 are not all the same kind of mystery and include at least 2 non-English series if Netflix has enough good candidates. For those final 10, open and keep the key evidence tabs available so I can inspect them later: keep the Netflix title pages open for the final 10, plus at least 5 outside pages that were especially useful for comparing reception or episode counts. 
End with a clear ranked watchlist from 1 to 10 that tells me which series to start with first, which ones are best if I want something short and high-quality, which ones are best for a long binge, and whether Netflix’s mystery lineup actually looks deep enough that I would not need another streaming service right now.", + "level": "hard", + "reference_length": 6, + "rubrics": { + "R1": { + "requirement": "At least 20 mystery TV series available on Netflix are identified from Netflix browse/search/title pages as the comparison pool.", + "verification": "The final output lists 20 or more distinct mystery series and shows they were sourced from Netflix pages rather than only outside summaries.", + "weight": 0.18 + }, + "R2": { + "requirement": "Each title in the comparison pool includes the requested decision details: basic hook, subgenre classification, finished/ongoing status when publicly shown, and rough viewing commitment in episodes or seasons when available.", + "verification": "For each of the 20+ titles, the recorded entry contains those fields or explicitly says 'not shown' where the public pages do not provide them.", + "weight": 0.18 + }, + "R3": { + "requirement": "Each title is cross-checked on the public web for quality signals, with an IMDb score, a Rotten Tomatoes score, or 'not shown' when unavailable.", + "verification": "The comparison pool includes public-source quality metadata for every title, and missing values are marked 'not shown' rather than omitted.", + "weight": 0.16 + }, + "R4": { + "requirement": "The larger pool is narrowed to exactly 10 strongest options, and the final 10 are meaningfully varied rather than all being the same kind of mystery, including at least 2 non-English series if enough good candidates were found.", + "verification": "The final shortlist contains exactly 10 titles, reflects multiple mystery subtypes, and includes 2 or more non-English entries unless the output explicitly explains that Netflix did not provide 
enough strong candidates.", + "weight": 0.17 + }, + "R5": { + "requirement": "Key browser evidence is kept open: the Netflix title pages for the final 10, plus at least 5 outside pages that were especially useful for comparing reception or episode counts.", + "verification": "The session ends with those evidence tabs still open and aligned with the titles used in the final shortlist.", + "weight": 0.13 + }, + "R6": { + "requirement": "A clear ranked watchlist from 1 to 10 is produced, including which series to start first, which are best for a short high-quality watch, and which are best for a long binge.", + "verification": "The final synthesis contains a numbered ranking of all 10 shortlisted series and explicitly identifies the best starting pick, short-watch picks, and long-binge picks.", + "weight": 0.1 + }, + "R7": { + "requirement": "The final recommendation explicitly answers whether Netflix’s mystery lineup looks deep enough that the user would not need another streaming service right now.", + "verification": "The conclusion makes a direct yes/no or nuanced recommendation about Netflix-only viability and ties it to the evidence gathered from the 20-title sweep and 10-title shortlist.", + "weight": 0.08 + } + }, + "task_id": "7bfd58a150de3345ae03d53d828424d95a91d301", + "website": "https://www.netflix.com" + }, + { + "confirmed_task": "I’m planning a special-occasion future trip to New York City and I don’t want just three fancy hotel names — I want a real luxury-hotel decision I could actually book from. Start with Google Hotels or Google Travel and build me a comparison set of exactly 12 NYC luxury hotels that are clearly positioned at the top end of the market, ideally true 5-star properties when that’s shown. Use the same future 3-night stay window for all 12 so the prices are comparable, and record the nightly rate or average nightly rate shown there, plus any resort, destination, or mandatory fees if they’re publicly shown. 
Make sure the 12 hotels cover at least 4 distinct NYC luxury-stay areas such as Central Park South, Midtown/Fifth Avenue, Downtown/Tribeca/SoHo, and one other area if relevant. Then open the official property pages for the 8 strongest contenders and verify the core details that matter for an actual decision: room style and size cues if shown, standout amenities like spa/pool, dining on-site, view/location advantages, and cancellation flexibility if publicly listed. For those 8 contenders, also use maps and photos to sanity-check what the surrounding area feels like and how convenient each one is for a luxury leisure trip. After that, narrow the list to a final 6-hotel shortlist and give me a structured recommendation: best overall luxury stay, best classic NYC splurge, best quieter or more private option, and best value-for-luxury among the finalists. Please keep the most useful tabs open at the end: the Google hotel results page for the search, the official pages for the final 6 hotels, and at least 2 map or photo views that help show the neighborhood tradeoffs.", + "level": "hard", + "reference_length": 3, + "rubrics": { + "R1": { + "requirement": "A comparison set of exactly 12 NYC luxury hotels is created from Google Hotels or Google Travel using the same future 3-night stay window for all 12, and each entry includes the nightly rate or average nightly rate shown there.", + "verification": "Check that the final output lists exactly 12 hotels, all tied to one consistent 3-night future stay window, with a displayed nightly or average nightly rate recorded for each from the Google hotel search workflow.", + "weight": 0.2 + }, + "R2": { + "requirement": "The 12 hotels span at least 4 distinct NYC luxury-stay areas, including Central Park South, Midtown/Fifth Avenue, Downtown/Tribeca/SoHo, and one additional relevant area if used.", + "verification": "Check the final comparison for neighborhood labeling or grouping that covers at least 4 distinct areas and includes 
the specifically requested core areas.", + "weight": 0.14 + }, + "R3": { + "requirement": "For all 12 hotels, any resort, destination, or mandatory fees are recorded when publicly shown.", + "verification": "Inspect the comparison and confirm fee information is included wherever publicly shown, with missing cases left absent or clearly noted rather than invented.", + "weight": 0.12 + }, + "R4": { + "requirement": "The official property pages for the 8 strongest contenders are opened and used to verify decision-critical details: room style and size cues if shown, standout amenities like spa/pool, on-site dining, location or view advantages, and cancellation flexibility if publicly listed.", + "verification": "Confirm that 8 contenders were advanced to official-page review and that the requested categories of details were captured from those official pages for each contender where publicly available.", + "weight": 0.2 + }, + "R5": { + "requirement": "For those 8 contenders, maps and photos are used to sanity-check the surrounding area and leisure-trip convenience.", + "verification": "Look for evidence that map/photo views were consulted for the 8 contenders and that neighborhood feel or convenience observations were incorporated into the comparison.", + "weight": 0.12 + }, + "R6": { + "requirement": "A final 6-hotel shortlist is produced with four explicit recommendations: best overall luxury stay, best classic NYC splurge, best quieter or more private option, and best value-for-luxury among the finalists.", + "verification": "Check that the output narrows to exactly 6 finalists and includes all four named recommendation categories with a clearly chosen hotel for each.", + "weight": 0.14 + }, + "R7": { + "requirement": "The most useful evidence tabs are left open at the end: the Google hotel results page, the official pages for the final 6 hotels, and at least 2 map or photo views showing neighborhood tradeoffs.", + "verification": "Confirm the retained browser 
state includes the Google hotel search/results page, 6 official hotel pages corresponding to the finalists, and at least 2 map/photo tabs relevant to comparing neighborhoods.", + "weight": 0.08 + } + }, + "task_id": "ad918b380d5fb668438fada2d0ca9cad172f759c", + "website": "https://www.google.com" + }, + { + "confirmed_task": "I’ve been wanting to rebuild my social life and meet new people around Glendale, California, but I don’t want just a couple of random Meetup links. Please do a real browser-based sweep of recurring social options that someone could actually use over the next month or two, centered on Glendale and extending to nearby areas that are still practical like Burbank, Pasadena, Eagle Rock, Atwater Village, Silver Lake, or Los Feliz if needed. Find at least 12 viable ways to meet people, and make sure they come from at least 4 different public sources or categories such as Meetup groups, volunteer opportunities, community classes, hobby clubs, running groups, language exchanges, book clubs, board-game nights, or public event series. For each option, capture the name, what kind of people it seems aimed at, the neighborhood, whether it looks recurring or one-off, the next visible date or schedule if shown, the price or 'not shown,' and one short note on why it seems good for meeting new people rather than just attending passively. Open and compare the actual public pages, and also use maps/photos when helpful so you can sanity-check whether the locations feel practical and active. Then narrow it to the 6 strongest options for someone new in the area who wants genuine repeat social contact, variety, and a reasonable budget, and build a first-month plan using exactly 4 picks that together give a good mix of social styles and neighborhoods without requiring a ridiculous amount of driving. 
Keep the most useful evidence tabs open, including at least 6 final option pages and at least 2 map or venue pages, and finish with a concise recommendation on which single option looks best for easiest first-time socializing versus which one looks best for building longer-term community.", + "level": "hard", + "reference_length": 18, + "rubrics": { + "R1": { + "requirement": "At least 12 viable ways to meet people are identified, centered on Glendale and only extending to nearby practical areas named in the prompt when needed.", + "verification": "Count the final options listed and confirm they are social options in Glendale or the specified nearby neighborhoods.", + "weight": 0.18 + }, + "R2": { + "requirement": "The 12 options come from at least 4 different public sources or categories named in the prompt, such as Meetup groups, volunteer opportunities, community classes, hobby clubs, running groups, language exchanges, book clubs, board-game nights, or public event series.", + "verification": "Check that the final set spans at least 4 distinct source/category types explicitly represented in the results.", + "weight": 0.16 + }, + "R3": { + "requirement": "For each option, the output includes the name, audience or social fit, neighborhood, whether it appears recurring or one-off, the next visible date or schedule if shown, the price or 'not shown,' and a note on why it seems good for meeting new people.", + "verification": "Inspect each listed option for all required fields and allow 'not shown' only where the public page does not display the information.", + "weight": 0.2 + }, + "R4": { + "requirement": "The browsing session uses actual public pages and map/photo sanity checks where helpful to judge practicality and activity.", + "verification": "Confirm that evidence tabs/pages were opened for the options and that at least some location judgments are supported by map or venue/photo pages rather than unsupported claims.", + "weight": 0.14 + }, + "R5": { + 
"requirement": "The final synthesis narrows the research to the 6 strongest options for someone new in the area who wants repeat social contact, variety, and a reasonable budget.", + "verification": "Check that exactly 6 finalists are selected and that the reasoning clearly uses the stated criteria of repeat contact, variety, and budget.", + "weight": 0.12 + }, + "R6": { + "requirement": "A first-month plan is built using exactly 4 picks that together provide a mix of social styles and neighborhoods without excessive driving.", + "verification": "Confirm that exactly 4 options are chosen for the plan and that the writeup explains the mix of styles/neighborhoods and driving practicality.", + "weight": 0.1 + }, + "R7": { + "requirement": "The most useful evidence tabs are left open, including at least 6 final option pages and at least 2 map or venue pages, and the final recommendation distinguishes the best option for easiest first-time socializing from the best option for longer-term community.", + "verification": "Check the open tabs against the minimum counts and confirm the final recommendation explicitly names both the easiest first-time option and the best longer-term community option.", + "weight": 0.1 + } + }, + "task_id": "17f506970491ac59150ad919a8cc9fbefce52ff4", + "website": "https://www.meetup.com" + }, + { + "confirmed_task": "I want to find a real board game community in Los Angeles that I could actually start showing up to regularly, not just grab two random Meetup links. Please use public pages to build me a serious shortlist of at least 12 recurring board game groups, events, or game-night venues across the broader Los Angeles area, and pull them from at least 3 different source types where possible, such as Meetup, board game cafes or store calendars, Eventbrite, library or community-center listings, or other public event pages. 
For each option, record the name, the group or event page, the neighborhood or city area, what kind of play it seems to focus on, whether it looks beginner-friendly or more hardcore, whether there is any obvious cost or purchase requirement, and whether there is visible evidence that it is still active or recently updated; if a field is not shown, say not shown. I also want you to compare the options by region so I can tell whether the best fits are on the Westside, Eastside, central LA/Hollywood, South Bay, San Gabriel Valley, or the Valley, and open map or venue pages for the strongest candidates so I can visually sanity-check the locations. Then narrow the list to the 6 best choices for someone trying to meet people and actually play regularly, making sure the final 6 are not all the same type of scene. For those 6, keep the most useful group or event pages open, plus maps or venue pages where helpful, and tell me why each one made the cut. Finally, give me a practical starter plan: recommend the best 3 options for a newcomer, the best 2 for more serious hobby gaming, and the best 1 low-pressure social option, with a suggested order to try them over a future month so I can test different scenes without committing too quickly.", + "level": "hard", + "reference_length": 14, + "rubrics": { + "R1": { + "requirement": "At least 12 recurring Los Angeles-area board game groups, events, or game-night venues are identified from public pages, using at least 3 different source types where possible.", + "verification": "Count the final shortlist entries and confirm source diversity from the recorded pages.", + "weight": 0.18 + }, + "R2": { + "requirement": "Each of the at least 12 options includes the requested recorded details: name, group or event page, neighborhood or city area, apparent play style, beginner-friendly versus more hardcore impression, any obvious cost or purchase requirement, and visible evidence of recent activity; missing fields are marked not shown.", + 
"verification": "Check each option entry for all required fields and that missing information is explicitly labeled not shown.", + "weight": 0.22 + }, + "R3": { + "requirement": "The options are compared by region so the user can distinguish candidates across the Westside, Eastside, central LA/Hollywood, South Bay, San Gabriel Valley, and the Valley.", + "verification": "Review the synthesis and confirm that regional grouping or comparison is present and tied to the identified options.", + "weight": 0.13 + }, + "R4": { + "requirement": "Map or venue pages are opened for the strongest candidates so the user can visually sanity-check locations, and the most useful candidate pages remain open.", + "verification": "Inspect the open tabs to confirm candidate group/event pages plus supporting map or venue pages are left open for the strongest options.", + "weight": 0.14 + }, + "R5": { + "requirement": "The list is narrowed to exactly 6 best choices for someone trying to meet people and play regularly, and those 6 are not all the same type of scene.", + "verification": "Confirm there are exactly 6 finalists and that the final explanations reflect scene diversity rather than one repetitive category.", + "weight": 0.14 + }, + "R6": { + "requirement": "For the final 6, the response explains why each one made the cut and keeps the most useful group or event pages open, plus maps or venue pages where helpful.", + "verification": "Check that each finalist has an explicit rationale and corresponding useful evidence tabs remain open.", + "weight": 0.09 + }, + "R7": { + "requirement": "A practical starter plan is provided with the best 3 newcomer options, the best 2 more serious hobby-gaming options, the best 1 low-pressure social option, and a suggested order to try them over a future month.", + "verification": "Review the final recommendation section for the exact 3-2-1 breakdown and the month-long suggested trial order.", + "weight": 0.1 + } + }, + "task_id": 
"437482860f2bd9d99a668952d4516a9d2c2c5878", + "website": "https://www.meetup.com" + }, + { + "confirmed_task": "I’m trying to buy a pair of black calf-high dress boots that actually look sleek on the leg instead of loose or slouchy, and I wear women’s size 5.5, so I want a real browser-based shopping sweep rather than one lucky guess. Please start with DSW since that’s where I began looking, but expand across other major public retail sites if needed and build me a shortlist of exactly 12 viable options. Every candidate should be black, calf-high or very close to calf-high, have a thick or block-style heel, and show women’s size 5.5 or a clear equivalent; if shaft height, calf circumference, or width details are missing, just write “not shown” instead of guessing. As you review each product page, use the photos, description, measurements, and customer-review evidence to judge whether the boot looks form-fitting through the calf rather than wide, slouchy, or obviously gappy. For all 12 options, capture the product name, retailer, current price, heel type and height if shown, shaft height if shown, calf circumference if shown, material, size-5.5 availability status, and a short fit-confidence note. Then compare the retailer’s return-policy basics and any obvious shipping constraints because fit is the biggest risk here. After that, narrow the 12 down to the best 4 finalists: one best overall, one best value, one most form-fitting-looking, and one safest-to-try because of returns. Keep the product tabs open for those final 4, and also keep open the most relevant return-policy pages for the retailers behind them so I can sanity-check the fit risk later. 
Finish with a concise decision memo in the chat that lists all 12 candidates and clearly tells me which of the 4 I should try first and why.", + "level": "hard", + "reference_length": 4, + "rubrics": { + "R1": { + "requirement": "A shortlist of exactly 12 viable boots is produced, and each candidate matches the requested style constraints: black, calf-high or very close to calf-high, thick or block-style heel, and women’s size 5.5 or a clear equivalent.", + "verification": "Check the final comparison for 12 total entries and confirm each entry explicitly records the requested style/size fit rather than drifting into unrelated boot types.", + "weight": 0.2 + }, + "R2": { + "requirement": "Each of the 12 candidates includes the requested product details: product name, retailer, current price, heel type and height if shown, shaft height if shown, calf circumference if shown, material, size-5.5 availability status, and a short fit-confidence note, using 'not shown' where needed.", + "verification": "Review the final memo and confirm all listed fields appear for every one of the 12 options, with missing measurements marked as 'not shown' instead of omitted or guessed.", + "weight": 0.2 + }, + "R3": { + "requirement": "The comparison uses product-page evidence to judge whether each boot looks form-fitting through the calf rather than wide, slouchy, or obviously gappy.", + "verification": "Check that each candidate has a fit-confidence note grounded in photos, descriptions, measurements, or customer-review evidence from the product pages.", + "weight": 0.16 + }, + "R4": { + "requirement": "Return-policy basics and any obvious shipping constraints are compared for the retailers involved, since fit risk is part of the decision.", + "verification": "Confirm the final memo includes return-policy basics and shipping notes for the relevant retailers, rather than only product-level attributes.", + "weight": 0.14 + }, + "R5": { + "requirement": "The 12 options are narrowed to 
exactly 4 finalists labeled as best overall, best value, most form-fitting-looking, and safest-to-try because of returns.", + "verification": "Check that the final recommendation section names exactly 4 finalists and assigns each one to the explicit category requested in the prompt.", + "weight": 0.16 + }, + "R6": { + "requirement": "Browser evidence is preserved by keeping open the product tabs for the final 4 boots and the most relevant retailer return-policy pages tied to those finalists.", + "verification": "Inspect the open tabs at the end to confirm the 4 finalist product pages remain open along with the corresponding return-policy pages the prompt asked to keep visible.", + "weight": 0.14 + } + }, + "task_id": "545a1b36d5ada637dea9e0213f0a8c54c31c29dd", + "website": "https://www.dsw.com" + }, + { + "confirmed_task": "I’m trying to figure out whether I should focus my post-grad job search on New York City for film and adjacent media work, and I want a real browser-based sweep instead of just a few random listings. Start with LinkedIn, but don’t stay stuck there if better public pages exist. I want you to find exactly 18 current NYC-area opportunities that are genuinely entry-level, internship, assistant, coordinator, production, post-production, development, distribution, exhibition, or other clearly film-related early-career roles. Use a mix of public sources such as LinkedIn job pages, company careers pages, entertainment job boards, and studio or production-company hiring pages, and make sure at least 6 of the 18 come from sources other than LinkedIn so this is not just one-site sampling. For each opportunity, record the title, company, borough or city area if shown, whether it is internship or full-time, whether it looks remote, hybrid, or on-site, salary or pay if shown, and the key qualifications or software/tools mentioned; if any field is missing, mark it as not shown. 
As you work, open and keep visible the strongest evidence tabs for at least 8 of the listings, including a mix of source types and at least 3 actual employer career pages, so I can sanity-check that the openings are real. After collecting the 18 roles, compare them and identify the 6 best fits for a recent graduate with limited professional experience but serious interest in film, and briefly explain why those 6 rise to the top based on accessibility, relevance, and transparency of requirements. Then give me a short synthesis of what this search suggests about the NYC entry-level film market right now: which role types appear most often, whether paid opportunities seem common enough, and whether the market looks broad enough that I should concentrate my search there for the next few weeks. Leave the final shortlist and the most useful evidence tabs open. Make a spreadsheet with all the listings and the links to apply.", + "level": "hard", + "reference_length": 4, + "rubrics": { + "R1": { + "requirement": "Exactly 18 current NYC-area opportunities are collected, and they are genuinely entry-level, internship, assistant, coordinator, production, post-production, development, distribution, exhibition, or otherwise clearly film-related early-career roles.", + "verification": "Check the final shortlist count is exactly 18 and confirm each entry fits the role-scope and NYC-area constraints stated in the prompt.", + "weight": 0.2 + }, + "R2": { + "requirement": "The 18 opportunities are sourced from a mix of public pages, with at least 6 coming from sources other than LinkedIn.", + "verification": "Review the source labels or open tabs for all 18 entries and confirm that at least 6 are from non-LinkedIn public sources such as employer career pages or other entertainment/job boards.", + "weight": 0.15 + }, + "R3": { + "requirement": "Each of the 18 opportunities includes the required recorded fields: title, company, borough or city area if shown, internship or full-time 
status, remote/hybrid/on-site status if shown, salary or pay if shown, and key qualifications or software/tools mentioned, using 'not shown' where needed.", + "verification": "Inspect the final compiled results and confirm every listing contains all requested fields or explicitly says 'not shown' for missing information.", + "weight": 0.2 + }, + "R4": { + "requirement": "Strong browser evidence is preserved by keeping visible tabs for at least 8 listings, with a mix of source types and at least 3 actual employer career pages.", + "verification": "Count the kept-open evidence tabs and confirm there are at least 8 relevant listing pages, including at least 3 employer career pages and more than one source type.", + "weight": 0.15 + }, + "R5": { + "requirement": "A prioritized set of the 6 best fits for a recent graduate with limited professional experience but strong film interest is identified and each of the 6 includes a brief explanation of why it rises to the top.", + "verification": "Check that exactly 6 opportunities are highlighted as best fits and that each has a justification tied to accessibility, relevance, and/or transparency of requirements.", + "weight": 0.15 + }, + "R6": { + "requirement": "A short synthesis is provided about what the search suggests regarding the NYC entry-level film market, including which role types appear most often, whether paid opportunities seem common enough, and whether the market looks broad enough to justify concentrating the search there for the next few weeks, and a spreadsheet is also created with all listings and the links to apply.", + "verification": "Review the final summary and confirm it addresses all three requested synthesis points based on the collected openings; the final deliverables include a spreadsheet containing all listings and application links.", + "weight": 0.15 + } + }, + "task_id": "59c9d2da1b99ec2e643aadf7a74d31998cda2376", + "website": "https://www.linkedin.com" + }, + { + "confirmed_task": "I’m 
trying to put together a genuinely usable snack plan for a 30-person film set, and I don’t want just one random cart from one store. Please use public retail pages to build and compare 3 complete snack bundles that could each cover 30 people for one shoot day while keeping the total under $100 before tax. Start with Sam’s Club since that was my first idea, but expand naturally to 3 to 5 public retailers if that gives a better answer. Each bundle should include at least 6 total products and should cover at minimum: 2 sweet options, 2 savory options, 1 fruit-based option, and 1 more filling option like granola bars, trail mix, jerky, or another shelf-stable snack; if a field is unclear on the page, write not shown. Please favor snacks that are individually wrapped or otherwise easy to portion, reasonably low-mess for a set, and shelf-stable enough that they don’t depend on refrigeration. As you compare options, open the actual product pages for the strongest candidates and keep the most useful evidence tabs open, especially the final products used in the winning bundle plus at least one alternative bundle from a different retailer. 
In the end, give me one recommended final bundle with item names, retailer, package size, quantity to buy, estimated servings or pieces, per-item price, total price, and a short note on why it works for a film set better than the other two bundles.", + "level": "hard", + "reference_length": 29, + "rubrics": { + "R1": { + "requirement": "Three complete snack bundles are produced, and each bundle is designed to cover 30 people for one shoot day while staying under $100 before tax.", + "verification": "Check that exactly 3 bundles are presented, each explicitly states coverage for 30 people and includes a calculated total under $100 before tax.", + "weight": 0.2 + }, + "R2": { + "requirement": "The browsing starts from Sam’s Club and expands to a total of 3 to 5 public retailers when useful for comparison.", + "verification": "Check that Sam’s Club is included and that products are drawn from at least 3 and no more than 5 public retail sites overall.", + "weight": 0.14 + }, + "R3": { + "requirement": "Each bundle includes at least 6 total products and covers all requested snack categories: 2 sweet options, 2 savory options, 1 fruit-based option, and 1 more filling option.", + "verification": "For each bundle, count the listed products and verify that all four category requirements are explicitly satisfied.", + "weight": 0.2 + }, + "R4": { + "requirement": "For every selected product, the result includes item name, retailer, package size, quantity to buy, estimated servings or pieces, per-item price, total price, and uses 'not shown' when the page does not provide a requested field.", + "verification": "Check each product line in the bundles for all requested fields and confirm that missing information is marked 'not shown' rather than omitted.", + "weight": 0.16 + }, + "R5": { + "requirement": "The recommendations explicitly favor snacks that are individually wrapped or easy to portion, low-mess for a film set, and shelf-stable enough not to depend on refrigeration.", 
+ "verification": "Check that each bundle or the final comparison includes practical notes addressing portioning, mess level, and shelf stability for set use.", + "weight": 0.12 + }, + "R6": { + "requirement": "Actual product pages are opened for the strongest candidates, and the most useful evidence tabs are kept open, including the final products in the winning bundle and at least one alternative bundle from a different retailer.", + "verification": "Confirm that browser evidence remains available for the chosen bundle’s products and for at least one competing bundle from another retailer.", + "weight": 0.1 + }, + "R7": { + "requirement": "A single recommended final bundle is identified and briefly justified against the other two bundles as the best fit for a film set.", + "verification": "Check that one bundle is clearly named as the final recommendation and includes a short comparative explanation of why it works better than the other two.", + "weight": 0.08 + } + }, + "task_id": "41a741b7b748dece0a069a7f1b1f9279902c4000", + "website": "https://www.samsclub.com" + }, + { + "confirmed_task": "I’m trying to choose a genuinely healthy meal delivery service for a future routine, and I don’t want a one-brand summary that leaves me guessing about the alternatives. Start with Hungryroot as one contender, but compare it against 7 other nationally available services that belong in the same decision set, for 8 total services. Use each service’s official public plan or pricing pages to capture what it actually offers: whether it’s meal kits, prepared meals, groceries, or a hybrid; the main dietary styles or nutrition angle; the starting price or visible price range; any shipping, membership, or delivery fees if shown; the minimum order structure if shown; and how ordering, skipping, or canceling appears to work. If a public page does not show one of those fields, record it as “not shown” instead of guessing. 
Then, for each of the 8 services, use at least one reputable public review source to sanity-check customer sentiment, but keep the official pages as the main evidence. I want a side-by-side comparison and a final recommendation for exactly 3 cases: best overall healthy option, best budget-conscious option, and best option for maximum flexibility/customization. Keep the official pricing or plan tabs open for Hungryroot plus the 3 finalists, and also leave open the review pages you relied on most so I can verify the reasoning myself.", + "level": "hard", + "reference_length": 5, + "rubrics": { + "R1": { + "requirement": "The browsing session compares exactly 8 total services: Hungryroot plus 7 other nationally available services in the same healthy meal delivery decision set.", + "verification": "Count the services included in the final comparison and confirm Hungryroot is one of the 8.", + "weight": 0.18 + }, + "R2": { + "requirement": "For each of the 8 services, the comparison records from official public plan or pricing pages whether it is meal kits, prepared meals, groceries, or a hybrid; its main dietary styles or nutrition angle; its starting price or visible price range; any shipping, membership, or delivery fees if shown; the minimum order structure if shown; and how ordering, skipping, or canceling appears to work, using “not shown” where needed.", + "verification": "Check each service entry for all requested fields and confirm the information is grounded in official public pages rather than guesses.", + "weight": 0.24 + }, + "R3": { + "requirement": "Each of the 8 services is sanity-checked with at least one reputable public review source, while the official pages remain the primary evidence.", + "verification": "Confirm that every service has at least one review source referenced and that the comparison still relies mainly on official plan or pricing pages for service details.", + "weight": 0.14 + }, + "R4": { + "requirement": "The final synthesis 
provides a side-by-side comparison and gives exactly 3 recommendations: best overall healthy option, best budget-conscious option, and best option for maximum flexibility/customization.", + "verification": "Check that all 3 requested recommendation categories appear and that each winner is supported by comparative reasoning from the gathered evidence.", + "weight": 0.2 + }, + "R5": { + "requirement": "The browser is left with the official pricing or plan tabs open for Hungryroot and the 3 finalists.", + "verification": "Inspect the open tabs at the end and confirm that official pricing or plan pages remain open for Hungryroot plus the 3 finalist services.", + "weight": 0.12 + }, + "R6": { + "requirement": "The browser is also left with open the review pages relied on most heavily for the recommendation reasoning.", + "verification": "Inspect the final open tabs and confirm that the key review pages used in the analysis are still available for user verification.", + "weight": 0.12 + } + }, + "task_id": "3fd9dfbf35247a1d4db1f88025028e95fb5aef88", + "website": "https://www.hungryroot.com" + }, + { + "confirmed_task": "I’m trying to choose the best basketball camp for my 12-year-old son for a future school break or summer, and I don’t want a one-page answer that only checks one brand. Please start with PGC Basketball because that’s already on my radar, but then broaden naturally into a real parent decision: use my location if it’s available, or the nearest major metro you can infer from the browser, and build a shortlist of exactly 10 age-appropriate basketball camp options that are realistically reachable for us, prioritizing day camps and drivable options first but including strong overnight camps if they look meaningfully better. Make sure at least 3 of the 10 options are PGC camps if age-eligible and reachable, and fill the rest with credible alternatives from other organizations. 
For each camp, verify from the official public page the location, dates or session window, age or grade fit for a 12-year-old, camp type, price if shown, and whether it looks more skills-focused, shooting-focused, or general development; if something is missing, mark it as not shown. Then compare public ratings or parent-review signals wherever they exist, using Google, Yelp, Facebook, or other public review pages, and keep the most useful evidence tabs open rather than closing everything after you check it. Also open maps for the strongest contenders so you can compare actual distance or drive time, and note any obvious parent-logistics issues like overnight-only format, limited dates, or registration pages that look sold out or unclear. At the end, give me one ranked top-5 recommendation list pulled from the 10 camps, explain which single camp is the best overall choice versus which one is the best close-to-home choice, and leave the final shortlist and the key camp, review, and map pages open so I can inspect them myself. 
Put all the options in a CryptoPad Document with detailed analysis and comparisons.", + "level": "hard", + "reference_length": 4, + "rubrics": { + "R1": { + "requirement": "A shortlist of exactly 10 age-appropriate basketball camp options is produced, using the user's location if available or the nearest inferred major metro, with at least 3 of the 10 being reachable PGC camps when such PGC options are age-eligible.", + "verification": "Check that the final output lists exactly 10 camps, that each is presented as reachable under the prompt's standard, and that at least 3 entries are PGC camps unless the browsing evidence explicitly shows fewer qualifying PGC options.", + "weight": 0.2 + }, + "R2": { + "requirement": "Each of the 10 camps has the required official-page details recorded: location, dates or session window, age or grade fit for a 12-year-old, camp type, price if shown, and whether the camp appears skills-focused, shooting-focused, or general development, with 'not shown' used where needed.", + "verification": "Inspect the final shortlist and confirm that all required fields are present for every camp and that missing fields are marked 'not shown' rather than omitted.", + "weight": 0.2 + }, + "R3": { + "requirement": "Public ratings or parent-review signals are compared for the camps wherever available, using public review pages such as Google, Yelp, Facebook, or similar sources.", + "verification": "Confirm that review evidence or explicit 'not found/not shown' notes are included for the camps and that the comparison draws on public review pages rather than unsupported assertions.", + "weight": 0.15 + }, + "R4": { + "requirement": "Maps are used for the strongest contenders so distance or drive time can be compared, and obvious parent-logistics issues such as overnight-only format, limited dates, or sold-out/unclear registration status are noted.", + "verification": "Check that map evidence is reflected for the leading camps and that the final 
comparison includes concrete logistics notes where the browsing surfaced them.", + "weight": 0.15 + }, + "R5": { + "requirement": "The task ends with one ranked top-5 recommendation list drawn from the 10 camps, including a clear call on the single best overall camp and the single best close-to-home camp.", + "verification": "Verify that the final answer contains a ranked top 5, that all 5 come from the 10-camp shortlist, and that it explicitly names both the best overall choice and the best close-to-home choice.", + "weight": 0.2 + }, + "R6": { + "requirement": "Useful browser evidence is kept visible by leaving open the final shortlist plus the key camp, review, and map pages for the strongest options, and all 10 options are also captured in a CryptoPad Document with detailed analysis and comparisons.", + "verification": "Inspect the open tabs or browser state and confirm that the important camp pages, review evidence pages, and map pages for the leading contenders remain open at the end; an open CryptoPad Document contains the full camp list with detailed analysis and comparisons.", + "weight": 0.1 + } + }, + "task_id": "4fe5b02a8d0d55d5ad7111173e184d4b1c5d3697", + "website": "https://pgcbasketball.com" + }, + { + "confirmed_task": "I’m seriously considering getting a pet bird, but I don’t want generic blog advice or a random species quiz — I want a credible owner-education packet I could actually read before deciding what kind of bird is realistic for me. Please start with the Association of Avian Veterinarians bird-owner materials, then expand to at least 3 other reputable public sources such as veterinary, university, or established exotic-animal education sites. 
Build me one organized decision memo that covers exactly 12 commonly kept pet-bird groups, using species groups like budgies, cockatiels, conures, lovebirds, African greys, Amazons, macaws, cockatoos, canaries, finches, pigeons/doves, and one poultry or waterfowl companion-bird category if credible owner handouts exist; if a good species-specific resource is missing, write 'not shown' instead of guessing. For each of the 12 groups, find the best available owner handout or client-education page and record the handout title, source organization, whether it is downloadable, and the main care topics it actually covers. Also collect at least 8 general bird-owner resources across cross-cutting topics like diet, housing, behavior, signs of illness, veterinary care, household dangers, zoonotic or public-health issues, and emergency or disaster planning. As you work, keep the main AAV bird-owner page open plus at least 6 of the strongest individual handout or PDF tabs from multiple organizations so I can inspect the evidence myself. Then finish the memo with a practical synthesis: identify the 3 bird groups that seem best supported by credible beginner-friendly owner education, the 3 that appear most complex or least well covered, and the biggest coverage gaps or red flags a first-time owner should know before choosing a species. 
Leave the finished memo and the most useful evidence tabs open at the end.", + "level": "hard", + "reference_length": 4, + "rubrics": { + "R1": { + "requirement": "The final memo covers exactly 12 commonly kept pet-bird groups, and each group has either one best available owner handout/client-education page recorded or an explicit 'not shown' when no credible species-specific resource was found.", + "verification": "Check the memo for exactly 12 species-group entries and confirm each entry includes either a specific resource or the text 'not shown' rather than an invented substitute.", + "weight": 0.2 + }, + "R2": { + "requirement": "For each of the 12 bird groups, the memo records the handout title, source organization, whether it is downloadable, and the main care topics the resource actually covers.", + "verification": "Inspect each species-group entry for all four requested fields: title, organization, downloadable status, and coverage summary.", + "weight": 0.18 + }, + "R3": { + "requirement": "The resource sweep starts from the Association of Avian Veterinarians and expands to at least 3 other reputable public sources, for at least 4 total source organizations represented in the memo.", + "verification": "Review the listed source organizations in the memo and confirm AAV is included and at least 3 additional reputable public organizations are also represented.", + "weight": 0.16 + }, + "R4": { + "requirement": "The memo includes at least 8 general bird-owner resources covering cross-cutting topics such as diet, housing, behavior, signs of illness, veterinary care, household dangers, zoonotic/public-health issues, and emergency/disaster planning.", + "verification": "Count the general-topic resources and confirm there are at least 8, with topic coverage spanning the requested cross-cutting care areas.", + "weight": 0.16 + }, + "R5": { + "requirement": "Browser evidence is preserved by leaving open the main AAV bird-owner page plus at least 6 strong individual 
handout or PDF tabs from multiple organizations.", + "verification": "Inspect open tabs at the end and confirm the AAV overview page remains open along with at least 6 individual handout/PDF pages from more than one organization.", + "weight": 0.12 + }, + "R6": { + "requirement": "The memo ends with a practical synthesis naming the 3 bird groups best supported by credible beginner-friendly owner education and the 3 that appear most complex or least well covered, based only on the materials found.", + "verification": "Check the final synthesis section for exactly 3 beginner-supported groups and exactly 3 complex/least-covered groups, with reasoning tied to the gathered materials.", + "weight": 0.1 + }, + "R7": { + "requirement": "The memo identifies the biggest coverage gaps or first-time-owner red flags that emerged from the resource search rather than just listing handouts.", + "verification": "Review the memo for an explicit gaps/red-flags section or equivalent notes showing what key information was missing, thin, inconsistent, or especially cautionary for new owners.", + "weight": 0.08 + } + }, + "task_id": "08e2ad6afc624b6f759afa9950907f470b92f11b", + "website": "https://www.aav.org" + }, + { + "confirmed_task": "I’m trying to get past the oversimplified version of Trump’s Venezuela policy and understand what the strategy actually was, how it evolved during his presidency, and whether it worked. Please do this as a serious browser-based research session using public pages, with priority on primary sources first: archived White House statements, State Department material, Treasury/OFAC sanctions pages, and any congressional or CRS background you can find, then add a few reputable policy-analysis sources like CFR, Brookings, CSIS, or similar to help interpret the record. I want you to reconstruct the policy in a way I could actually use for studying or debate, not just give me a paragraph. 
Build a dated timeline with at least 12 concrete events or policy moves spanning the full presidency, and make sure it covers sanctions, recognition of Juan Guaidó, diplomatic pressure, any military-threat rhetoric, humanitarian or oil-policy adjustments if shown, and notable responses from Maduro’s government or international actors. For each timeline item, note the date, what happened, which policy tool it represents, and whether the evidence is primary-source or analysis-based. Then compare at least 4 different expert assessments of whether the strategy succeeded, partially succeeded, or failed, and pull out where they agree versus disagree. I also want a short section separating stated goals from likely implied goals, as an inference only where the sources support it. Keep key evidence tabs open for the most important primary documents and at least 3 of the strongest analysis pages so I can inspect them afterward. Finish with one organized briefing memo that explains the strategy in plain English, identifies the major phases, and gives a bottom-line judgment on what Trump’s Venezuela strategy was and how effective it appears to have been. 
Write this memo in a digestible manner in CryptoPad Documents.", + "level": "hard", + "reference_length": 3, + "rubrics": { + "R1": { + "requirement": "The final briefing memo reconstructs Trump’s Venezuela policy as a strategy rather than a loose summary, and explains in plain English what the administration was trying to do, how the strategy evolved, and the bottom-line judgment on effectiveness.", + "verification": "Check that the final memo includes a strategy explanation, major phases over time, and a clear concluding judgment on what the strategy was and how effective it appears to have been.", + "weight": 0.2 + }, + "R2": { + "requirement": "A dated timeline with at least 12 concrete events or policy moves is produced, spanning the presidency and covering sanctions, recognition of Juan Guaidó, diplomatic pressure, military-threat rhetoric, humanitarian or oil-policy adjustments if shown, and notable responses from Maduro’s government or international actors.", + "verification": "Count at least 12 dated entries and confirm the required topic areas are represented in the timeline where the public evidence shows them.", + "weight": 0.2 + }, + "R3": { + "requirement": "For each timeline item, the memo records the date, what happened, which policy tool it represents, and whether the supporting evidence is primary-source or analysis-based.", + "verification": "Inspect the timeline format and confirm each entry includes all four requested fields: date, event description, policy-tool classification, and evidence type.", + "weight": 0.15 + }, + "R4": { + "requirement": "The research prioritizes primary sources, specifically using public pages such as archived White House statements, State Department material, Treasury or OFAC sanctions pages, and congressional or CRS background before adding outside analysis.", + "verification": "Review the sources cited or referenced in the memo and confirm the presence of the requested primary-source categories with primary 
materials clearly used as the main evidentiary base.", + "weight": 0.15 + }, + "R5": { + "requirement": "The memo compares at least 4 different expert assessments of whether the strategy succeeded, partially succeeded, or failed, and explicitly identifies where those assessments agree and disagree.", + "verification": "Count at least 4 expert or institutional assessments and confirm the memo summarizes both consensus points and disagreements among them.", + "weight": 0.12 + }, + "R6": { + "requirement": "The memo includes a separate section distinguishing stated goals from likely implied goals, with any implied-goals claims presented as inference only where supported by the sources.", + "verification": "Check for a dedicated stated-vs-implied-goals section and confirm that inferred goals are labeled as inference rather than asserted as direct fact.", + "weight": 0.08 + }, + "R7": { + "requirement": "Key evidence tabs are left open for the most important primary documents and at least 3 of the strongest analysis pages so the user can inspect the evidence afterward, and the briefing memo is written in CryptoPad Documents.", + "verification": "Confirm that major primary-source pages remain open and that at least 3 analysis tabs from strong policy sources are also left open at the end; the final deliverable includes an open CryptoPad Document containing the briefing memo.", + "weight": 0.1 + } + }, + "task_id": "23081b41564a070a7e5a286b20be23d78e3e561c", + "website": "https://chatgpt.com" + }, + { + "confirmed_task": "I’m trying to choose the best-value hotel for a 2-night future stay near Walt Disney World, and I don’t want a single listing lookup or a vague top-10 blog list. Please start from a major booking site like Hotels.com or similar and build a serious comparison of exactly 12 hotel options that are plausibly within about 15 minutes’ drive of Walt Disney World for the same 2-night stay window. 
For each hotel, capture the nightly price or total stay price shown, guest rating, review count if shown, parking fee, resort fee, cancellation policy if visible, and mark anything that is not shown as “not shown.” Then cross-check each candidate in Google Maps so the distance/time-to-Disney claim is realistic, and open the map or directions view for the strongest contenders. I also want you to open the actual listing pages for at least 6 of the 12 hotels and compare the room photos and amenity details so we can weed out places that are cheap but clearly worse in quality or location. After that, narrow the list to the best 5 options balancing low price and strong ratings, and give me a final recommendation for three categories: cheapest acceptable pick, best overall value, and nicest option that still seems reasonably priced. Keep the most useful hotel listing tabs and map tabs open at the end so I can review the finalists myself. Can you also separately provide the transit options for getting from each stay to the park, and open the directions for each in Google Maps?", + "level": "hard", + "reference_length": 3, + "rubrics": { + "R1": { + "requirement": "The browsing session identifies exactly 12 hotel options for the same 2-night future stay window, each plausibly within about 15 minutes’ drive of Walt Disney World.", + "verification": "The final comparison includes 12 distinct hotels and each entry includes a Disney drive-time or map-based plausibility check.", + "weight": 0.16 + }, + "R2": { + "requirement": "For each of the 12 hotels, the comparison records the visible booking details requested: nightly price or total stay price, guest rating, review count if shown, parking fee, resort fee, cancellation policy if visible, and 'not shown' where the page does not provide a field.", + "verification": "Each hotel entry contains all requested fields with no silent omissions; missing fields are explicitly labeled 'not shown'.", + "weight": 0.2 + }, + "R3": { + 
"requirement": "Each candidate is cross-checked in Google Maps so the time-to-Disney claim is realistic, and map or directions views are opened for the strongest contenders.", + "verification": "There is browser evidence of Google Maps or directions views for the contenders, and the final write-up reflects those checks.", + "weight": 0.12 + }, + "R4": { + "requirement": "The actual listing pages for at least 6 of the 12 hotels are opened and compared using room photos and amenity details to filter out low-quality or poorly located options.", + "verification": "At least 6 hotel listing pages are visibly used as evidence, and the shortlist discussion cites photo or amenity-based quality judgments from those pages.", + "weight": 0.14 + }, + "R5": { + "requirement": "The research narrows the 12 hotels down to the best 5 options that balance low price and strong ratings.", + "verification": "A final ranked or clearly identified shortlist of 5 hotels is produced, with reasoning that uses both price and rating considerations.", + "weight": 0.14 + }, + "R6": { + "requirement": "A final recommendation is given for exactly three categories: cheapest acceptable pick, best overall value, and nicest option that still seems reasonably priced, and the most useful hotel listing tabs and map tabs are left open.", + "verification": "The final output names one hotel for each of the three requested categories and leaves relevant finalist listing pages plus map tabs open for user review.", + "weight": 0.12 + }, + "R7": { + "requirement": "Transit options from each finalist stay to the park are separately provided, and Google Maps directions are opened for those finalist stays.", + "verification": "The final deliverable includes transit notes for each finalist hotel and the browser keeps open Google Maps directions for those finalists.", + "weight": 0.12 + } + }, + "task_id": "adc644f33f82d4454d84a983211494eb887074cb", + "website": "https://www.hotels.com" + }, + { + "confirmed_task": 
"I’m trying to decide whether a future Phoenix trip from Jacksonville is actually worth booking, and I don’t want just one fare snapshot. Please do a serious flight-shopping pass focused on Wednesday departures from JAX to PHX. Use a major flight search tool plus the operating airlines’ public booking pages to compare exactly 6 future Wednesday outbound dates spread across the next few months. For each of those 6 Wednesdays, check matching return options for exactly 3 trip lengths: 2 nights, 3 nights, and 4 nights. Prioritize nonstop flights when they’re publicly shown; if a nonstop is not shown for a given outbound or return, note that and use the cheapest reasonable one-stop fallback instead. For every one of the 18 round-trip combinations, record the airline, whether it is nonstop or one-stop, departure and arrival times, the lowest publicly shown bookable price, any basic-economy style restrictions that are clearly shown, carry-on and checked-bag costs if publicly shown, and whether the fare is refundable or changeable if that is shown. Then synthesize the results so I can see which Wednesday trip window is the cheapest true total once obvious bag fees are considered, and which option is the best overall if I care more about comfort and schedule than the absolute lowest fare. Keep at least 6 useful evidence tabs open at the end, including the strongest outbound fare results and at least 2 airline fare-detail pages that show the restrictions or fare breakdowns. 
If any field is not publicly shown, say not shown rather than guessing.", + "level": "hard", + "reference_length": 6, + "rubrics": { + "R1": { + "requirement": "The browsing session compares exactly 6 future Wednesday outbound dates from JAX to PHX and, for each outbound date, checks exactly 3 return trip lengths: 2 nights, 3 nights, and 4 nights.", + "verification": "Final synthesis or notes explicitly cover 18 round-trip combinations formed from 6 Wednesday outbounds × 3 return lengths, with no missing or extra combinations.", + "weight": 0.18 + }, + "R2": { + "requirement": "Each of the 18 round-trip combinations includes the airline, whether the itinerary is nonstop or one-stop, and the departure and arrival times.", + "verification": "For every combination, the recorded result shows airline, stop pattern, and outbound/return timing details drawn from public fare pages.", + "weight": 0.17 + }, + "R3": { + "requirement": "Each of the 18 round-trip combinations includes the lowest publicly shown bookable price, prioritizes nonstop when publicly shown, and otherwise notes that nonstop was not shown and uses the cheapest reasonable one-stop fallback.", + "verification": "Results show a price for every combination and clearly indicate where nonstop was available versus where a one-stop fallback was used because nonstop was not shown.", + "weight": 0.2 + }, + "R4": { + "requirement": "For each combination, the task records any basic-economy-style restrictions clearly shown, carry-on and checked-bag costs if publicly shown, and whether the fare is refundable or changeable if shown, using 'not shown' where needed.", + "verification": "Each combination includes restriction/fee/flexibility fields populated with actual public-page details or the explicit text 'not shown' rather than guesses.", + "weight": 0.17 + }, + "R5": { + "requirement": "The final synthesis identifies the single cheapest true-total trip window after considering obvious bag fees and also identifies 
the single best overall option for comfort/schedule, with the tradeoff explained.", + "verification": "There are two explicit recommendations—best cheapest-true-total option and best comfort/schedule option—with a short rationale comparing fare, fees, and timing tradeoffs.", + "weight": 0.16 + }, + "R6": { + "requirement": "At least 6 useful evidence tabs are left open, including the strongest outbound fare results and at least 2 airline fare-detail pages showing restrictions or fare breakdowns.", + "verification": "The final browser state retains 6 or more relevant public pages, and at least 2 of those are airline fare-detail pages rather than only metasearch results.", + "weight": 0.12 + } + }, + "task_id": "3d5fe15ca128986861d570a1e43ee687f2264b57", + "website": "https://www.hotels.com" + }, + { + "confirmed_task": "I’m trying to choose the best NCES public-use datasets for a serious secondary analysis project in education, and I don’t just want a quick list of names. Please do a thorough browser-based sweep of NCES public-use data options across the main program areas and build me a usable comparison I could actually start from. Start on NCES and identify at least 10 distinct NCES datasets or data collections that are publicly usable, drawing from a mix of areas like longitudinal studies, postsecondary data, assessment data, adult skills, early childhood, and school or district data where applicable. For each one, open the official NCES dataset or program page and, when available, also open the documentation, codebook, survey, or download page that helps confirm what is actually in the public-use files. 
I want you to compare each dataset on the questions I’d really care about before choosing one: what population it covers, approximate years available, unit of analysis, whether it is cross-sectional or longitudinal, whether the files are clearly public-use or have restrictions, what topics or variables it seems strongest for, and what download or access format is offered; if something is not shown, say not shown. Then narrow the full scan to the 5 strongest dataset options for someone studying educational opportunity and student outcomes, and explain the tradeoffs among them rather than just ranking blindly. Put the results into one organized comparison sheet or document with one section for all datasets scanned and one final shortlist section for the top 5 picks. Keep the most useful official NCES pages open for the finalist datasets, including at least a few documentation or download pages, so I can verify the evidence myself afterward.", + "level": "hard", + "reference_length": 14, + "rubrics": { + "R1": { + "requirement": "An organized comparison sheet or document is produced with one section covering all datasets scanned and a separate final shortlist section for the top 5 picks.", + "verification": "Confirm that the final artifact exists and contains both an all-datasets section and a distinct finalist section with 5 shortlisted datasets.", + "weight": 0.16 + }, + "R2": { + "requirement": "At least 10 distinct NCES datasets or data collections that are publicly usable are identified from a mix of relevant NCES program areas such as longitudinal studies, postsecondary data, assessment data, adult skills, early childhood, and school or district data where applicable.", + "verification": "Count the datasets listed in the scan and check that they are distinct, NCES-related, publicly usable, and span multiple program areas rather than all coming from one narrow category.", + "weight": 0.18 + }, + "R3": { + "requirement": "For each scanned dataset, the official 
NCES dataset or program page is opened, and when available, a documentation, codebook, survey, or download page is also opened to confirm the public-use contents.", + "verification": "Review browser evidence to confirm dataset/program pages were opened for the scanned datasets and that supporting documentation-style pages were also opened when available.", + "weight": 0.16 + }, + "R4": { + "requirement": "Each scanned dataset is compared on the requested decision fields: population covered, approximate years available, unit of analysis, whether it is cross-sectional or longitudinal, whether the files are clearly public-use or have restrictions, strongest topics or variables, and download or access format, using 'not shown' where needed.", + "verification": "Inspect the comparison artifact and verify that every scanned dataset includes entries for all requested fields, with 'not shown' used instead of leaving gaps.", + "weight": 0.2 + }, + "R5": { + "requirement": "The final shortlist narrows the scan to exactly 5 strongest dataset options for studying educational opportunity and student outcomes, with an explanation of the tradeoffs among them rather than a bare ranking.", + "verification": "Check that the shortlist contains exactly 5 datasets and that each is accompanied by comparative reasoning about strengths, weaknesses, or fit for the stated research focus.", + "weight": 0.18 + }, + "R6": { + "requirement": "The most useful official NCES pages for the finalist datasets are left open at the end, including finalist dataset pages and at least a few documentation or download pages for verification.", + "verification": "Confirm that relevant finalist tabs remain open and that they include both dataset/program pages and several documentation or download pages tied to the shortlisted datasets.", + "weight": 0.12 + } + }, + "task_id": "fe1a5127a1329930e356744b7fd66a214592c630", + "website": "https://nces.ed.gov" + }, + { + "confirmed_task": "I’m trying to figure out 
whether Nike Phantom low-top soccer cleats are actually a smart budget choice for ultimate frisbee, not just whether one listing happens to be cheap on one site. Please start with the Nike Phantom 6 Club FG/MG low-top men’s model and verify its current price on the source retailer plus at least 2 other public retailer or brand pages if available, noting any visible sale pricing and whether common men’s sizes are shown or marked not shown. Then build me a serious comparison against 7 other realistic cleat options that someone might actually wear for ultimate frisbee, for 8 total models altogether, with at least 3 brands represented and at least 4 models priced under about $80 if you can find them. For each model, check public product pages and capture the current listed price, surface or stud description, whether it looks better suited to firm ground, multi-ground, turf, or not clearly stated, whether it is low-top or mid/high-top if shown, weight if shown, and return policy basics from the seller or brand page if easy to verify, otherwise mark not shown. I also want you to spend time on the real-frisbee-use question: look at at least 4 public pages from ultimate frisbee communities, reviews, guides, or discussion threads about what people actually prefer in cleats, especially around soccer versus football/lacrosse cleats, traction, toe shape, and comfort for cutting, and use that to explain whether the Phantom-style option seems like a good fit or a compromise. Keep the browser evidence visible as you work: leave open the Nike Phantom product page, the 3 strongest alternative product pages, and 2 of the most useful ultimate-frisbee advice pages. 
At the end, give me one organized decision memo with all 8 models, the verified price range you found, which ones seem best for strict budget, best overall value, and best specifically if I want something close to the Nike Phantom feel, and tell me clearly whether buying the Nike Phantom is actually the right move for ultimate frisbee or whether I should get something else instead.", + "level": "hard", + "reference_length": 3, + "rubrics": { + "R1": { + "requirement": "The final decision memo includes the Nike Phantom 6 Club FG/MG low-top men’s model plus 7 other cleat models, for 8 total models altogether, with at least 3 brands represented.", + "verification": "Check that the memo lists exactly 8 models, one of which is the specified Nike Phantom model, and that the set spans at least 3 brands.", + "weight": 0.18 + }, + "R2": { + "requirement": "The Nike Phantom model’s current price is verified on the source retailer plus at least 2 other public retailer or brand pages if available, with visible sale pricing noted and common men’s sizes recorded as shown or not shown.", + "verification": "Confirm the memo cites price checks for the Phantom from 3 public pages total when available, including the source retailer, and includes sale-price notes and size-availability notes.", + "weight": 0.18 + }, + "R3": { + "requirement": "For each of the 8 models, the memo records the current listed price, surface or stud description, whether it seems better suited to firm ground, multi-ground, turf, or not clearly stated, whether it is low-top or mid/high-top if shown, weight if shown, and return-policy basics or not shown.", + "verification": "Review each model entry and confirm all requested comparison fields are filled with values or explicitly marked not shown where missing.", + "weight": 0.2 + }, + "R4": { + "requirement": "The comparison set includes at least 4 models priced under about $80 if such models are found during browsing, and the final memo identifies best for 
strict budget, best overall value, and best specifically for a Nike-Phantom-like feel.", + "verification": "Check the listed prices and final recommendations to confirm the under-$80 target was pursued and that all 3 requested recommendation categories are explicitly named.", + "weight": 0.14 + }, + "R5": { + "requirement": "The real-frisbee-use analysis uses at least 4 public pages from ultimate frisbee communities, reviews, guides, or discussion threads to explain soccer versus football/lacrosse cleat tradeoffs, traction, toe shape, and comfort for cutting.", + "verification": "Confirm at least 4 relevant ultimate-frisbee-oriented public pages were used and that the memo synthesizes the requested tradeoffs rather than only summarizing product listings.", + "weight": 0.17 + }, + "R6": { + "requirement": "The browser evidence is left visible with the Nike Phantom product page, the 3 strongest alternative product pages, and 2 of the most useful ultimate-frisbee advice pages kept open, and the memo ends with a clear verdict on whether the Nike Phantom is actually the right move for ultimate frisbee or whether another cleat is the better choice.", + "verification": "Inspect the open tabs to confirm the 6 requested evidence pages remain open, and check that the memo includes a direct final verdict comparing the Phantom against alternatives.", + "weight": 0.13 + } + }, + "task_id": "42e2344431639ea57815c2c6b42b047cc7814176", + "website": "https://www.als.com" + }, + { + "confirmed_task": "I’m trying to figure out what I should actually do on a future first trip to the United Arab Emirates, not just get a generic top-3 list. 
Please use public travel and official attraction pages to build me a serious shortlist of exactly 12 candidate experiences across the UAE, with coverage from at least 3 different emirates and a mix of categories like landmark/observation, cultural or historic site, desert or nature experience, museum/art, and one distinctive food or market experience. For each candidate, note the name, which emirate it’s in, what type of experience it is, the source or sources recommending it, the official site if there is one, whether advance booking seems required, ticket price if shown or \"not shown,\" and any obvious timing constraint like sunset, evening, weekday-only, or seasonal best time. As you do this, keep key tabs open for the strongest candidates, including a few official attraction pages plus a few reputable recommendation pages, and use maps/photos to sanity-check that the places are real, distinct, and not awkwardly far apart for a first-time visitor. Then narrow the 12 down to the best 7 experiences for a one-week UAE trip, explain why those 7 beat the others, and organize them into a practical day-by-day plan that minimizes backtracking and groups nearby things together when possible. 
End with a concise recommendation on the top 3 must-do experiences overall, but only after showing the fuller comparison and leaving the most useful evidence tabs open.", + "level": "hard", + "reference_length": 3, + "rubrics": { + "R1": { + "requirement": "Exactly 12 candidate UAE experiences are identified, spanning at least 3 different emirates and covering the requested mix of categories: landmark/observation, cultural or historic site, desert or nature experience, museum/art, and one distinctive food or market experience.", + "verification": "Check the final comparison for a count of 12 total candidates, confirm emirate coverage of at least 3, and verify that all requested experience categories are represented.", + "weight": 0.2 + }, + "R2": { + "requirement": "Each of the 12 candidates includes the requested comparison details: name, emirate, experience type, source or sources recommending it, official site if available, whether advance booking seems required, ticket price if shown or 'not shown,' and any obvious timing constraint.", + "verification": "Inspect each candidate entry and confirm that every listed field is present and populated or marked 'not shown' where appropriate.", + "weight": 0.2 + }, + "R3": { + "requirement": "The browsing session uses both recommendation sources and official attraction pages, and key tabs are kept open for the strongest candidates, including multiple official pages and multiple reputable recommendation pages.", + "verification": "Review the open tabs at the end and confirm that they include a mix of official attraction pages and reputable recommendation pages corresponding to the shortlisted candidates.", + "weight": 0.15 + }, + "R4": { + "requirement": "Maps and/or photos are used to sanity-check the candidates so the selected places are real, distinct, and not awkwardly far apart for a first-time visitor.", + "verification": "Confirm that the final reasoning explicitly references map/photo checks or geographic sanity 
checks when comparing or selecting candidates.", + "weight": 0.1 + }, + "R5": { + "requirement": "The 12 candidates are narrowed down to exactly 7 selected experiences for a one-week UAE trip, with an explanation of why those 7 were chosen over the others.", + "verification": "Check that exactly 7 experiences are selected from the original 12 and that the final write-up includes comparative reasons for inclusion and exclusion.", + "weight": 0.15 + }, + "R6": { + "requirement": "The final output includes a practical day-by-day one-week plan that groups nearby experiences together when possible and aims to minimize backtracking.", + "verification": "Inspect the itinerary for a day-by-day structure and confirm that the sequencing rationale mentions proximity, clustering, or reduced backtracking.", + "weight": 0.1 + }, + "R7": { + "requirement": "The task ends with a concise recommendation of the top 3 must-do UAE experiences overall, after the fuller comparison and planning work is complete.", + "verification": "Check that a final top-3 summary appears after the shortlist comparison and one-week itinerary, not as a standalone shallow answer.", + "weight": 0.1 + } + }, + "task_id": "2075861f234062f252a54c70824524b134a15860", + "website": "https://www.google.com" + }, + { + "confirmed_task": "I want to do a serious black history research session on the relationship between Cassius Clay, later Muhammad Ali, and Malcolm X, and I don’t want just a quick summary from one video. Please use public pages to build me a careful research brief that I could actually study from. Start by finding at least 8 credible sources total, including at least 3 primary or archival sources if available, such as interviews, speeches, letters, newspaper archives, museum or library collections, or official historical institutions, and at least 3 strong secondary sources like biographies, reputable history outlets, or university material. 
As you work, keep the most useful evidence tabs open, especially the best primary-source pages and at least 2 strong secondary-source pages that clearly explain the relationship.\n\nI want you to figure out the relationship as a timeline, not just a paragraph, so trace at least 6 dated milestones covering how they met, how Malcolm X influenced Clay/Ali’s religious and public identity, what changed around Ali’s rise to the heavyweight title, how the Nation of Islam split affected them, and what happened in the period before Malcolm X was assassinated. Where exact dates or wording are unclear, say not shown rather than guessing.\n\nThen compare how different credible sources frame the relationship. I want at least 3 points of agreement and at least 2 meaningful differences in interpretation, for example whether Malcolm X was mainly a mentor, political guide, spiritual influence, media strategist, or some combination, and how historians describe the reasons for the break between them. If a documentary clip or educational video is especially useful, include it, but anchor the research in credible written sources rather than video alone.\n\nFinish with one organized research brief that includes: a source list grouped into primary versus secondary, the 6-or-more-point timeline with dates, a concise explanation of the relationship’s development and rupture, the agreement-versus-disagreement section across sources, and a short final takeaway on why this relationship matters in Black history and in Ali’s public transformation. Leave the final research brief open along with the key evidence tabs you relied on most. 
Generate a presentation in CryptoPad Presentations I can present your findings on.", + "level": "hard", + "reference_length": 3, + "rubrics": { + "R1": { + "requirement": "The final research brief uses at least 8 credible public sources total, including at least 3 primary or archival sources and at least 3 strong secondary sources.", + "verification": "Check the final brief for a source list grouped by source type and confirm the count and classification match the prompt.", + "weight": 0.2 + }, + "R2": { + "requirement": "The browsing session keeps the most useful evidence tabs open, including key primary-source pages and at least 2 strong secondary-source pages explaining the relationship.", + "verification": "Inspect the open tabs at the end for visible archival/primary evidence and at least 2 substantive secondary explanation pages.", + "weight": 0.12 + }, + "R3": { + "requirement": "The final brief includes a timeline with at least 6 dated milestones tracing how Clay/Ali and Malcolm X met, how the influence developed, what changed around Ali’s heavyweight-title rise, how the Nation of Islam split affected them, and what happened before Malcolm X’s assassination.", + "verification": "Review the timeline in the brief and confirm it contains 6 or more dated entries covering the specified phases; unclear details may be marked 'not shown' instead of guessed.", + "weight": 0.2 + }, + "R4": { + "requirement": "The brief explains the relationship itself in a concise synthesized narrative covering Malcolm X’s influence on Clay/Ali and the causes and nature of their rupture.", + "verification": "Read the synthesis section and confirm it addresses both the development of the relationship and the breakdown, not just one side.", + "weight": 0.18 + }, + "R5": { + "requirement": "The brief compares how credible sources frame the relationship, identifying at least 3 points of agreement and at least 2 meaningful differences in interpretation.", + "verification": "Check the 
comparison section for the required counts and ensure the differences are interpretive, not merely wording changes.", + "weight": 0.16 + }, + "R6": { + "requirement": "If a documentary clip or educational video is used, it is included as supporting material, but the research remains anchored in credible written sources rather than relying mainly on video.", + "verification": "Confirm any included video is supplementary and that the brief’s core claims are supported by the written primary and secondary sources listed.", + "weight": 0.04 + }, + "R7": { + "requirement": "The final organized research brief is left open and contains the grouped source list, the dated timeline, the relationship summary, the agreement-versus-disagreement section, and a short explanation of why the relationship matters in Black history and Ali’s public transformation, and a CryptoPad Presentation is also created so the findings can be presented.", + "verification": "Inspect the final open page and confirm all requested sections are present and clearly organized; an open CryptoPad Presentation containing the timeline, source comparison, and takeaway is available.", + "weight": 0.1 + } + }, + "task_id": "3bdaf7a3557ebeb664c36ae28326cc90984c4768", + "website": "https://www.youtube.com" + }, + { + "confirmed_task": "I want to figure out which core exercises are actually worth adding to a powerlifting routine, not just get a quick summary of one Reddit thread. Start by searching Reddit and open at least 8 substantial public discussion threads from places like r/Fitness, r/powerlifting, or similar lifting communities that talk about ab or core work for squat, bench, and deadlift carryover. From those threads, pull the exercises that come up repeatedly and note the main reason people recommend each one, like bracing strength, anti-extension, anti-rotation, direct ab hypertrophy, or lower-back stability. 
Then sanity-check those Reddit favorites against at least 5 credible public coaching or exercise-technique sources, and open demo pages or videos for the most commonly mentioned movements so it’s clear what each exercise actually is. Build me one organized final recommendation memo with at least 12 distinct exercises total, grouped by purpose, and for each one include a short note on why lifters use it, what equipment it needs, whether it seems better for beginners or more advanced lifters, and whether it looks like a high-confidence pick, mixed-opinion pick, or probably overrated pick based on the sources you reviewed. Also make a final shortlist of exactly 6 exercises I could realistically rotate into a powerlifting accessory setup: 2 bodyweight-only options, 2 gym-machine-or-cable options, and 2 heavily loaded options. Keep the most useful Reddit threads, at least 2 coaching-source tabs, and at least 2 exercise demo tabs open at the end so I can review the evidence myself. Write up this information in CryptoPad Documents for a comprehensive workout guide and plan.", + "level": "hard", + "reference_length": 6, + "rubrics": { + "R1": { + "requirement": "At least 8 substantial public Reddit discussion threads about ab/core work for powerlifting or carryover to squat, bench, and deadlift are found and used.", + "verification": "Check that the browsing session includes at least 8 relevant Reddit thread pages and that the final memo draws exercise recommendations from across those threads rather than from only one thread.", + "weight": 0.16 + }, + "R2": { + "requirement": "The final memo identifies at least 12 distinct exercises total and groups them by purpose such as bracing strength, anti-extension, anti-rotation, direct ab hypertrophy, or lower-back stability.", + "verification": "Check the final memo for at least 12 unique exercise entries and visible grouping by training purpose.", + "weight": 0.2 + }, + "R3": { + "requirement": "For each exercise in the final 
memo, there is a short note covering why lifters use it, what equipment it needs, whether it seems better for beginners or more advanced lifters, and whether it is labeled high-confidence, mixed-opinion, or probably overrated.", + "verification": "Review the final memo and confirm that every listed exercise includes all four requested fields.", + "weight": 0.18 + }, + "R4": { + "requirement": "The Reddit-derived favorites are sanity-checked against at least 5 credible public coaching or exercise-technique sources.", + "verification": "Check that at least 5 non-Reddit coaching or technique sources were opened and used in the synthesis, and that the memo reflects cross-checking rather than Reddit-only aggregation.", + "weight": 0.15 + }, + "R5": { + "requirement": "Demo pages or videos are opened for the most commonly mentioned movements so it is clear what the exercises actually are.", + "verification": "Confirm that exercise demo pages or videos were opened for the commonly discussed movements and that at least 2 such demo tabs remain open at the end.", + "weight": 0.11 + }, + "R6": { + "requirement": "The final memo includes exactly 6 practical rotation picks: 2 bodyweight-only options, 2 gym-machine-or-cable options, and 2 heavily loaded options, and the final material is written up in CryptoPad Documents as a comprehensive workout guide and plan.", + "verification": "Check the final recommendation section for exactly 6 exercises split into the requested 2/2/2 categories; the final deliverable includes an open CryptoPad Document containing the workout guide and plan.", + "weight": 0.12 + }, + "R7": { + "requirement": "The most useful evidence tabs are left open at the end, including Reddit threads, at least 2 coaching-source tabs, and at least 2 exercise demo tabs.", + "verification": "Inspect the final browser state and confirm that useful evidence tabs remain open across those three source types.", + "weight": 0.08 + } + }, + "task_id": 
"4d033a71318c5a2869b0f7132b0bae578bb75876", + "website": "https://www.reddit.com" + }, + { + "confirmed_task": "I want to buy a pair of wired earbuds on Amazon, but I care much more about durability than hype, so please turn this into a real buying decision instead of just grabbing the first listing. Start by finding 10 to 12 plausible wired earbud or IEM models that are still easy to buy from major public pages, with Amazon included whenever possible, and keep the scope focused on everyday-use wired options rather than studio gear. For each candidate, compare the things that usually matter for longevity: cable thickness or strain relief, whether the cable is detachable, connector type, inline mic if shown, warranty length if publicly stated, price, and any obvious build-quality cues from the photos and product details. Then cross-check those candidates with at least one credible review or detailed discussion source per model so we are not relying on retailer marketing alone, and pay special attention to recurring failure complaints like one side dying, weak plugs, or bad strain relief. After that, narrow the list to exactly 4 finalists that look strongest on durability-for-price, and for each finalist keep open the Amazon listing plus one supporting non-Amazon evidence page. Among those 4, tell me which one is the best overall pick, which one is the best budget pick, which one is best if I want a detachable cable for easier long-term replacement, and which one should be avoided even if the sound looks attractive because the durability evidence is weak. Put the final comparison into one short decision memo organized by candidate, with prices marked as shown or not shown, and leave the most useful tabs open so I can review the finalists and buy from Amazon afterward. 
Use CryptoPad documents, and also store the different options in a separate CrpytoPad Spreadsheet.", + "level": "hard", + "reference_length": 6, + "rubrics": { + "R1": { + "requirement": "A comparison set of 10 to 12 plausible wired earbud or IEM models is assembled from public pages, with Amazon included whenever possible and the scope kept to everyday-use wired options rather than studio gear.", + "verification": "Check that the final memo lists 10 to 12 distinct candidate models and that the browsing session includes corresponding public product pages, including Amazon pages where available.", + "weight": 0.16 + }, + "R2": { + "requirement": "For each candidate, the memo compares the requested durability-related factors: cable thickness or strain relief, detachable-cable status, connector type, inline mic if shown, warranty length if publicly stated, price, and visible build-quality cues from photos or product details.", + "verification": "Check that every listed candidate has entries for each requested factor, using 'not shown' where necessary instead of leaving fields ambiguous.", + "weight": 0.18 + }, + "R3": { + "requirement": "Each candidate is cross-checked with at least one credible review or detailed discussion source beyond the retailer page, with attention to recurring failure complaints such as one side dying, weak plugs, or bad strain relief.", + "verification": "Check that every candidate has at least one linked or cited non-retailer evidence source in the memo and that recurring durability complaints or positive durability signals are summarized for each.", + "weight": 0.2 + }, + "R4": { + "requirement": "The list is narrowed to exactly 4 finalists that appear strongest on durability-for-price.", + "verification": "Check that the final section names exactly 4 finalists and that they are presented as the narrowed shortlist rather than an open-ended set.", + "weight": 0.12 + }, + "R5": { + "requirement": "For each of the 4 finalists, the browser 
keeps open the Amazon listing plus one supporting non-Amazon evidence page.", + "verification": "Check that there are 8 relevant finalist tabs left open in total: 4 Amazon listing pages and 4 corresponding non-Amazon supporting pages.", + "weight": 0.14 + }, + "R6": { + "requirement": "The final decision memo identifies one best overall pick, one best budget pick, one best choice for detachable-cable longevity, and one option to avoid despite appealing sound if the durability evidence is weak.", + "verification": "Check that all four requested verdict categories are explicitly filled with named models and short justifications tied to the collected evidence.", + "weight": 0.12 + }, + "R7": { + "requirement": "The final comparison is delivered as one short decision memo organized by candidate, with prices marked as shown or not shown, and the most useful finalist tabs are left open for follow-up Amazon purchase, and the decision memo is created in CryptoPad Documents and the compared options are also stored in a separate CryptoPad Spreadsheet.", + "verification": "Check that there is a single organized memo containing the requested candidate-by-candidate comparison and that the useful finalist tabs remain open at the end; the final deliverables include both an open CryptoPad Document memo and a separate CryptoPad Spreadsheet of the options.", + "weight": 0.08 + } + }, + "task_id": "2a8418c2dccdaf5fe23ff143745cc5659d35fc69", + "website": "https://www.amazon.com" + } +] \ No newline at end of file diff --git a/packages/evals/evals.config.json b/packages/evals/evals.config.json index c7cff0b532..9ed4140a7c 100644 --- a/packages/evals/evals.config.json +++ b/packages/evals/evals.config.json @@ -17,6 +17,9 @@ }, "webtailbench": { "limit": 25 + }, + "odysseysbench": { + "limit": 25 } } } diff --git a/packages/evals/framework/benchPlanner.ts b/packages/evals/framework/benchPlanner.ts index 5f93ba39b3..2c5717cf31 100644 --- a/packages/evals/framework/benchPlanner.ts +++ 
b/packages/evals/framework/benchPlanner.ts @@ -3,6 +3,7 @@ import { EvalsError } from "../errors.js"; import { buildOnlineMind2WebTestcases } from "../suites/onlineMind2Web.js"; import { buildWebTailBenchTestcases } from "../suites/webtailbench.js"; import { buildWebVoyagerTestcases } from "../suites/webvoyager.js"; +import { buildOdysseysBenchTestcases } from "../suites/odysseysbench.js"; import { getAgentModelEntries, getModelList, @@ -513,6 +514,7 @@ export function generateSuiteTestcases( "agent/webvoyager": (models) => buildWebVoyagerTestcases(models), "agent/onlineMind2Web": (models) => buildOnlineMind2WebTestcases(models), "agent/webtailbench": (models) => buildWebTailBenchTestcases(models), + "agent/odysseysbench": (models) => buildOdysseysBenchTestcases(models), }; const legacyOnlySuites = new Set(["agent/gaia"]); diff --git a/packages/evals/framework/discovery.ts b/packages/evals/framework/discovery.ts index c688292c5b..aa64e0411e 100644 --- a/packages/evals/framework/discovery.ts +++ b/packages/evals/framework/discovery.ts @@ -63,6 +63,7 @@ const CATEGORY_OVERRIDES: Record = { "agent/webvoyager": ["external_agent_benchmarks"], "agent/onlineMind2Web": ["external_agent_benchmarks"], "agent/webtailbench": ["external_agent_benchmarks"], + "agent/odysseysbench": ["external_agent_benchmarks"], }; function getTaskBasename(taskName: string): string { diff --git a/packages/evals/framework/externalHarnessPlan.ts b/packages/evals/framework/externalHarnessPlan.ts index fa23bf99e6..f8539c134e 100644 --- a/packages/evals/framework/externalHarnessPlan.ts +++ b/packages/evals/framework/externalHarnessPlan.ts @@ -2,7 +2,7 @@ import { EvalsError } from "../errors.js"; import type { EvalInput } from "../types/evals.js"; export interface ExternalHarnessTaskPlan { - dataset: "webvoyager" | "onlineMind2Web" | "webtailbench"; + dataset: "webvoyager" | "onlineMind2Web" | "webtailbench" | "odysseysbench"; taskId?: string; startUrl: string; instruction: string; @@ -68,7 +68,22 @@ 
export function buildExternalHarnessTaskPlan( }; } + if (input.name === "agent/odysseysbench") { + const instruction = readString(params, "confirmed_task"); + if (!instruction) { + throw new EvalsError( + `Missing OdysseysBench params for external harness: expected confirmed_task.`, + ); + } + return { + dataset: "odysseysbench", + taskId: readString(params, "task_id"), + startUrl: readString(params, "website") ?? "https://www.google.com", + instruction, + }; + } + throw new EvalsError( - `External harness "${input.name}" is not supported yet. Supported: agent/webvoyager, agent/onlineMind2Web, agent/webtailbench.`, + `External harness "${input.name}" is not supported yet. Supported: agent/webvoyager, agent/onlineMind2Web, agent/webtailbench, agent/odysseysbench.`, ); } diff --git a/packages/evals/index.eval.ts b/packages/evals/index.eval.ts index 010585fe8b..1dadc37d29 100644 --- a/packages/evals/index.eval.ts +++ b/packages/evals/index.eval.ts @@ -52,6 +52,7 @@ import { buildWebVoyagerTestcases } from "./suites/webvoyager.js"; import { buildOnlineMind2WebTestcases } from "./suites/onlineMind2Web.js"; import { endBrowserbaseSession } from "./browserbaseCleanup.js"; import { buildWebTailBenchTestcases } from "./suites/webtailbench.js"; +import { buildOdysseysBenchTestcases } from "./suites/odysseysbench.js"; import { getCurrentDirPath } from "./runtimePaths.js"; import dotenv from "dotenv"; @@ -252,6 +253,25 @@ const generateFilteredTestcases = (): Testcase[] => { taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webtailbench"); } + // Special handling: fan out OdysseysBench dataset for agent/odysseysbench + const isOdysseysBenchTaskIncluded = taskNamesToRun.includes( + "agent/odysseysbench", + ); + + if ( + isOdysseysBenchTaskIncluded && + (!datasetFilter || datasetFilter === "odysseysbench") + ) { + taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/odysseysbench"); + allTestcases.push(...buildOdysseysBenchTestcases(currentModels)); + } else if ( 
+ isOdysseysBenchTaskIncluded && + datasetFilter && + datasetFilter !== "odysseysbench" + ) { + taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/odysseysbench"); + } + // Create a list of all remaining testcases using the determined task names and models const isAgentCategory = effectiveCategory === "agent" || diff --git a/packages/evals/scripts/build-odysseysbench-dataset.ts b/packages/evals/scripts/build-odysseysbench-dataset.ts new file mode 100644 index 0000000000..35bdc7d3f1 --- /dev/null +++ b/packages/evals/scripts/build-odysseysbench-dataset.ts @@ -0,0 +1,209 @@ +/** + * Build packages/evals/datasets/odysseysbench/OdysseysBench_data.jsonl from the + * published OdysseysBench task set. + * + * OdysseysBench (https://odysseysbench.com) is a 200-task web-agent benchmark + * (45 easy / 46 medium / 109 hard). Every task ships a weighted rubric whose + * weights sum to 1.0. This script converts each task's `rubrics` map into the + * verifier's `precomputed_rubric` shape ({ items: [{ criterion, description, + * max_points }] }) so the suite can hand it straight to V3Evaluator.verify() + * without generating a rubric. + * + * Source of truth is the committed snapshot at + * packages/evals/datasets/odysseysbench/source/tasks.json + * (mirrored from https://odysseysbench.com/assets/data/tasks.json). Re-fetch + * with `--fetch` to refresh that snapshot before rebuilding. + * + * Run after pulling the branch (or whenever the source snapshot changes): + * pnpm tsx packages/evals/scripts/build-odysseysbench-dataset.ts + * + * Idempotent — regenerates the JSONL deterministically from the snapshot. 
/**
 * Return a sorted copy of rubric keys so that R1, R2, … R10 come out in
 * numeric order rather than lexicographic order (where "R10" precedes "R2").
 *
 * Ordering rules:
 *  - the number is read from the first digit run after any leading non-digit
 *    characters ("R10" -> 10);
 *  - keys that carry a number sort first, ascending by that number;
 *  - keys with no number sort after every numbered key;
 *  - ties are broken by plain code-unit comparison, which does not depend on
 *    the host locale the way `localeCompare` does.
 *
 * Treating "no number" as +Infinity keeps the comparator a consistent
 * ordering even when numbered and un-numbered keys are mixed; falling back to
 * a string comparison only for some pairs can produce cycles, and the result
 * of Array.prototype.sort is then implementation-defined.
 *
 * @param keys - Rubric keys to order. The array is not mutated.
 * @returns A new array holding the same keys in sorted order.
 */
function sortRubricKeys(keys: string[]): string[] {
  const leadingNumber = (key: string): number => {
    const n = Number.parseInt(key.replace(/^\D+/, ""), 10);
    return Number.isFinite(n) ? n : Number.POSITIVE_INFINITY;
  };

  return [...keys].sort((a, b) => {
    const na = leadingNumber(a);
    const nb = leadingNumber(b);
    // Compare rather than subtract: Infinity - Infinity would be NaN.
    if (na !== nb) return na < nb ? -1 : 1;
    if (a === b) return 0;
    return a < b ? -1 : 1;
  });
}
/**
 * Convert one OdysseysBench rubric entry into a verifier rubric item.
 *
 * `weight` (summing to 1.0 across a task) is scaled to integer points. The
 * process score is Σ earned / Σ max, so the absolute scale is immaterial — but
 * rounding is *not* a uniform scaling, so a coarse scale (e.g. ×100) would
 * distort the relative weighting of small criteria. ×1000 keeps the rounding
 * error well under 1% even for the smallest published weights. `max(1, …)` is
 * a defensive floor: it only binds when `weight * POINT_SCALE` rounds to 0,
 * which keeps such a criterion from becoming worth zero points.
 *
 * @param r - One source rubric entry (requirement, verification, weight).
 * @returns An item whose `criterion` is the requirement, whose `description`
 *   is the requirement followed by the grader guidance, and whose
 *   `max_points` is the scaled, rounded weight (at least 1).
 */
function toRubricItem(r: SourceRubric): RubricItem {
  return {
    criterion: r.requirement,
    description: `${r.requirement}\n\nHow a grader verifies this: ${r.verification}`,
    max_points: Math.max(1, Math.round(r.weight * POINT_SCALE)),
  };
}

/**
 * Load the OdysseysBench task list.
 *
 * Without `--fetch` on the command line, reads the snapshot at SOURCE_PATH.
 * With `--fetch`, downloads SOURCE_URL and — only after the payload has parsed
 * as a non-empty JSON array — overwrites the snapshot with it. Validating
 * before writing means a bad download (for example an error page served with
 * a 2xx status, or a truncated body) cannot clobber the last good snapshot.
 *
 * @returns The parsed task array. Individual entries are not validated here;
 *   `main` checks each task's fields.
 * @throws Error when the HTTP response is not OK, or when the fetched payload
 *   is empty or not a JSON array. Filesystem and JSON.parse errors propagate.
 */
async function loadSource(): Promise<SourceTask[]> {
  if (!process.argv.includes("--fetch")) {
    const snapshot = await fs.readFile(SOURCE_PATH, "utf8");
    return JSON.parse(snapshot) as SourceTask[];
  }

  const res = await fetch(SOURCE_URL);
  if (!res.ok) {
    throw new Error(`Failed to fetch ${SOURCE_URL}: ${res.status}`);
  }
  const text = await res.text();

  // Parse and sanity-check first; the snapshot on disk is only replaced once
  // the payload is known to be usable.
  const parsed: unknown = JSON.parse(text);
  if (!Array.isArray(parsed) || parsed.length === 0) {
    throw new Error(
      `Fetched ${SOURCE_URL} but the payload is empty or not a JSON array; snapshot left untouched`,
    );
  }

  await fs.mkdir(path.dirname(SOURCE_PATH), { recursive: true });
  await fs.writeFile(SOURCE_PATH, text);
  console.log(`Refreshed snapshot: ${SOURCE_PATH}`);
  return parsed as SourceTask[];
}

/**
 * Rebuild the JSONL dataset from the source task list.
 *
 * Every task is validated (task_id, confirmed_task, at least one rubric, each
 * rubric's requirement / verification / weight, and the weight sum) and then
 * written as one JSON line with its rubric converted by `toRubricItem`, in
 * the key order produced by `sortRubricKeys`. Any validation failure throws,
 * so nothing is written unless every task passes.
 *
 * @throws Error describing the first task or rubric entry that fails
 *   validation.
 */
async function main(): Promise<void> {
  const tasks = await loadSource();
  if (!Array.isArray(tasks) || tasks.length === 0) {
    throw new Error("Source tasks.json is empty or not an array");
  }

  const lines: string[] = [];
  for (const task of tasks) {
    // Fail loud on an upstream schema change rather than silently emitting a
    // row the suite validator would later drop (shrinking the benchmark).
    if (typeof task.task_id !== "string" || !task.task_id) {
      throw new Error(
        `Task is missing a string task_id: ${JSON.stringify(task).slice(0, 200)}`,
      );
    }
    if (typeof task.confirmed_task !== "string" || !task.confirmed_task) {
      throw new Error(`Task ${task.task_id} is missing confirmed_task`);
    }
    const rubricKeys = sortRubricKeys(Object.keys(task.rubrics ?? {}));
    if (rubricKeys.length === 0) {
      throw new Error(`Task ${task.task_id} has no rubrics`);
    }
    // Validate each rubric entry individually — an aggregate sum check alone
    // lets a bad weight (e.g. negative, offset by a larger one) or an empty
    // requirement/verification slip through and produce a mis-weighted or
    // malformed item. Fail loud on source schema drift instead.
    for (const k of rubricKeys) {
      const r = task.rubrics[k];
      if (typeof r?.requirement !== "string" || !r.requirement.trim()) {
        throw new Error(
          `Task ${task.task_id} rubric ${k} has an empty requirement`,
        );
      }
      if (typeof r.verification !== "string" || !r.verification.trim()) {
        throw new Error(
          `Task ${task.task_id} rubric ${k} has an empty verification`,
        );
      }
      if (
        typeof r.weight !== "number" ||
        !Number.isFinite(r.weight) ||
        r.weight <= 0 ||
        r.weight > 1
      ) {
        throw new Error(
          `Task ${task.task_id} rubric ${k} has invalid weight ${r.weight}; expected a number in (0, 1]`,
        );
      }
    }
    // The published weights are a normalized distribution; a re-fetched snapshot
    // that breaks that convention would silently mis-weight the rubric.
    const weightSum = rubricKeys.reduce(
      (acc, k) => acc + task.rubrics[k].weight,
      0,
    );
    if (Math.abs(weightSum - 1) > 0.02) {
      throw new Error(
        `Task ${task.task_id} rubric weights sum to ${weightSum}, expected ~1.0`,
      );
    }
    const items = rubricKeys.map((k) => toRubricItem(task.rubrics[k]));

    const row: OutputRow = {
      task_id: task.task_id,
      confirmed_task: task.confirmed_task,
      website: task.website,
      level: task.level,
      reference_length: task.reference_length,
      // `categories` is only emitted when the source has a non-empty list.
      ...(Array.isArray(task.categories) && task.categories.length > 0
        ? { categories: task.categories }
        : {}),
      precomputed_rubric: { items },
    };
    lines.push(JSON.stringify(row));
  }

  // Defensive invariant: one output row per source task.
  if (lines.length !== tasks.length) {
    throw new Error(
      `Expected ${tasks.length} rows, produced ${lines.length} — a task was dropped`,
    );
  }

  await fs.writeFile(JSONL_PATH, lines.join("\n") + "\n");

  // Per-difficulty tally, reported only in the summary log line.
  const byLevel = tasks.reduce<Record<string, number>>((acc, t) => {
    acc[t.level] = (acc[t.level] ?? 0) + 1;
    return acc;
  }, {});
  console.log(
    `Wrote ${lines.length} rows to ${JSONL_PATH} (${JSON.stringify(byLevel)})`,
  );
}

// Script entry point: any failure is printed and turned into exit code 1.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});
/** Parse an env var to a positive integer; undefined for unset/non-numeric. */
function parsePositiveIntEnv(value: string | undefined): number | undefined {
  if (!value) return undefined;
  const n = Number(value);
  return Number.isInteger(n) && n > 0 ? n : undefined;
}

/**
 * Look up rows by task id, preserving the order of `ids`.
 *
 * @param candidates - Rows to search. If two rows share a task_id the later
 *   one wins.
 * @param ids - Task ids to select, in the desired output order.
 * @returns `rows`: the matches in `ids` order; `missing`: ids that matched no
 *   candidate, in `ids` order.
 */
function pickRowsById<T extends { task_id: string }>(
  candidates: readonly T[],
  ids: readonly string[],
): { rows: T[]; missing: string[] } {
  const byId = new Map<string, T>();
  for (const row of candidates) byId.set(row.task_id, row);

  const rows: T[] = [];
  const missing: string[] = [];
  for (const id of ids) {
    const row = byId.get(id);
    if (row) rows.push(row);
    else missing.push(id);
  }
  return { rows, missing };
}

/**
 * Build the OdysseysBench testcases: one testcase per (model entry, task row).
 *
 * Rows come from datasets/odysseysbench/OdysseysBench_data.jsonl under the
 * package root; rows without a string `task_id` and `confirmed_task` are not
 * accepted by the row guard.
 *
 * Row selection:
 *  - EVAL_ODYSSEYSBENCH_IDS (comma-separated) selects exactly those task ids,
 *    in the order given, bypassing level filtering and sampling. Ids that
 *    match no row are skipped with a console warning instead of silently.
 *  - Otherwise EVAL_ODYSSEYSBENCH_LEVEL (comma-separated, lower-cased) filters
 *    by `level`, and the result goes through `applySampling` with
 *    EVAL_ODYSSEYSBENCH_SAMPLE as the sample count and a cap taken from
 *    EVAL_MAX_K, then EVAL_ODYSSEYSBENCH_LIMIT, then 25.
 *
 * @param models - Model names or agent model entries to fan the rows out over.
 * @returns Testcases named "agent/odysseysbench", tagged with the model name,
 *   agent mode and "odysseysbench".
 */
export const buildOdysseysBenchTestcases = (
  models: string[] | AgentModelEntry[],
): Testcase[] => {
  const odysseysbenchFilePath =
    getPackageRootDir() + "/datasets/odysseysbench/OdysseysBench_data.jsonl";

  const lines = readJsonlFile(odysseysbenchFilePath);

  // Ignore unset / non-numeric env values rather than letting Number("foo")
  // become NaN, which would slip past applySampling's `>= maxCases` cap and
  // silently fan out the full 200-task dataset.
  const maxCases =
    parsePositiveIntEnv(process.env.EVAL_MAX_K) ??
    parsePositiveIntEnv(process.env.EVAL_ODYSSEYSBENCH_LIMIT) ??
    25;
  const sampleCount = parsePositiveIntEnv(
    process.env.EVAL_ODYSSEYSBENCH_SAMPLE,
  );

  type OdysseysBenchRow = {
    task_id: string;
    confirmed_task: string;
    website?: string;
    level?: "easy" | "medium" | "hard";
    reference_length?: number;
    categories?: string[];
    /**
     * Per-task weighted rubric in verifier `{ items: [...] }` shape, produced
     * from the published rubrics by scripts/build-odysseysbench-dataset.ts.
     */
    precomputed_rubric?: unknown;
    [key: string]: unknown;
  };

  /** Row guard: only the two fields every testcase needs are required. */
  function isOdysseysBenchRow(parsed: unknown): parsed is OdysseysBenchRow {
    if (parsed === null || typeof parsed !== "object") return false;
    const obj = parsed as Record<string, unknown>;
    return (
      typeof obj.task_id === "string" && typeof obj.confirmed_task === "string"
    );
  }

  const candidates = parseJsonlRows(lines, isOdysseysBenchRow);

  // EVAL_ODYSSEYSBENCH_IDS restricts the suite to exactly those task IDs,
  // preserving the order given and ignoring sampling / limit / level knobs.
  const explicitIds = process.env.EVAL_ODYSSEYSBENCH_IDS
    ? process.env.EVAL_ODYSSEYSBENCH_IDS.split(",")
        .map((s) => s.trim())
        .filter(Boolean)
    : null;

  let rows: OdysseysBenchRow[];
  if (explicitIds && explicitIds.length > 0) {
    const picked = pickRowsById(candidates, explicitIds);
    if (picked.missing.length > 0) {
      // A mistyped id would otherwise just shrink the run with no trace.
      console.warn(
        `EVAL_ODYSSEYSBENCH_IDS: no OdysseysBench task found for id(s): ${picked.missing.join(", ")}`,
      );
    }
    rows = picked.rows;
  } else {
    // Optional difficulty filter, applied before sampling.
    const levelFilter = process.env.EVAL_ODYSSEYSBENCH_LEVEL
      ? new Set<string>(
          process.env.EVAL_ODYSSEYSBENCH_LEVEL.split(",")
            .map((s) => s.trim().toLowerCase())
            .filter(Boolean),
        )
      : null;
    const filtered = levelFilter
      ? candidates.filter((r) => r.level && levelFilter.has(r.level))
      : candidates;
    rows = applySampling(filtered, sampleCount, maxCases);
  }

  const allTestcases: Testcase[] = [];
  for (const modelEntry of normalizeAgentModelEntries(models)) {
    for (const row of rows) {
      const input: EvalInput = {
        name: "agent/odysseysbench",
        modelName: modelEntry.modelName as AvailableModel,
        agentMode: modelEntry.mode,
        isCUA: modelEntry.mode === "cua",
        params: {
          task_id: row.task_id,
          confirmed_task: row.confirmed_task,
          website: row.website,
          level: row.level,
          reference_length: row.reference_length,
          precomputed_rubric: normalizeRubric(row.precomputed_rubric),
        },
      };
      const taskCategories =
        tasksConfig.find((t) => t.name === input.name)?.categories || [];
      allTestcases.push({
        input,
        name: input.name,
        tags: [modelEntry.modelName, modelEntry.mode, "odysseysbench"],
        metadata: {
          model: modelEntry.modelName as AvailableModel,
          test: `${input.name}:${row.task_id}`,
          tier: "bench",
          task: input.name,
          category: taskCategories[0] || "agent",
          categories: taskCategories,
          dataset: "odysseysbench",
          task_id: row.task_id,
          // The difficulty level doubles as the per-task category.
          task_category: row.level,
        },
        expected: true,
      });
    }
  }

  return allTestcases;
};
/**
 * OdysseysBench bench task.
 *
 * OdysseysBench (https://odysseysbench.com) is a 200-task web-agent benchmark
 * (45 easy / 46 medium / 109 hard). Every task ships a weighted rubric, baked
 * into `precomputed_rubric` by scripts/build-odysseysbench-dataset.ts, so the
 * verifier scores process + outcome against the published criteria directly.
 *
 * Flow: validate params, navigate to the task's `website` (falling back to
 * https://www.google.com), create the agent, then hand the agent and task
 * spec to `runWithVerifier`, which returns the evaluation result, trajectory
 * and rubric.
 *
 * The success mode is read from the EVAL_SUCCESS_MODE env var and passed to
 * `evaluationResultToSuccess`; the step budget comes from AGENT_EVAL_MAX_STEPS
 * (default 50).
 *
 * Failures are reported as `_success: false` results rather than thrown:
 * a missing `confirmed_task`, a missing precomputed rubric, or any error
 * raised while running.
 */
export default defineBenchTask(
  { name: "agent/odysseysbench" },
  async ({ v3, logger, debugUrl, sessionUrl, modelName, input }) => {
    try {
      const params = ((input && input.params) || {}) as {
        task_id?: string;
        confirmed_task?: string;
        website?: string;
        level?: "easy" | "medium" | "hard";
        reference_length?: number;
        precomputed_rubric?: unknown;
      };

      if (!params.confirmed_task) {
        return {
          _success: false,
          error: `Missing odysseysbench params (confirmed_task). Got: ${JSON.stringify(params)}`,
          debugUrl,
          sessionUrl,
          logs: logger.getLogs(),
        };
      }

      // OdysseysBench ships a published rubric for every task; scoring against a
      // generated one would silently break benchmark fidelity. Fail the case
      // rather than fall back to rubric generation when it's absent.
      const precomputedRubric = normalizeRubric(params.precomputed_rubric);
      if (!precomputedRubric) {
        return {
          _success: false,
          error: `OdysseysBench task ${params.task_id ?? input.name} is missing a precomputed rubric; refusing to fall back to a generated rubric.`,
          debugUrl,
          sessionUrl,
          logs: logger.getLogs(),
        };
      }

      const page = v3.context.pages()[0];
      const startUrl = params.website || "https://www.google.com";
      await page.goto(startUrl, { timeoutMs: 120_000 });

      // The `<answer>` placeholder shows the agent the exact shape of the
      // final line it is expected to produce.
      const systemPrompt = `You are a helpful assistant that must solve the task by browsing. At the end, produce a single line: "Final Answer: <answer>" summarizing the requested result (e.g., score, list, or text). Current page: ${await page.title()}. You will need to navigate to the appropriate website to complete the task.`;
      // An explicit agentMode wins; otherwise isCUA selects "cua" and
      // everything else runs as "hybrid".
      const agentMode = input.agentMode ?? (input.isCUA ? "cua" : "hybrid");
      const agent = v3.agent({
        mode: agentMode,
        model: modelName,
        systemPrompt,
      });

      const taskSpec: TaskSpec = {
        id: params.task_id ?? `odysseysbench/${input.name}`,
        instruction: params.confirmed_task,
        initUrl: startUrl,
        precomputedRubric,
      };

      const { evaluationResult, trajectory, trajectoryDir, rubric } =
        await runWithVerifier({
          v3,
          agent,
          taskSpec,
          dataset: "odysseysbench",
          agentOptions: {
            // Unset, non-numeric or 0 falls back to 50 steps.
            maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
          },
        });

      const successMode = process.env.EVAL_SUCCESS_MODE;

      logger.log({
        category: "evaluation",
        message: `result: outcome=${evaluationResult.outcomeSuccess} process=${formatProcessScore(evaluationResult.processScore)} criteria=${rubric.items.length} steps=${trajectory.steps.length}`,
        level: 1,
      });

      return {
        _success: evaluationResultToSuccess(evaluationResult, successMode),
        outcomeSuccess: evaluationResult.outcomeSuccess,
        processScore: evaluationResult.processScore,
        evidenceInsufficient: evaluationResult.evidenceInsufficient,
        criterionCount: rubric.items.length,
        stepCount: trajectory.steps.length,
        trajectoryDir,
        primaryIntent: evaluationResult.rawSteps?.primaryIntent,
        reasoning: evaluationResult.rawSteps?.reasoning,
        level: params.level,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    } catch (error) {
      // Anything can be thrown, including null/undefined; only read the
      // optional trajectoryDir off real objects so the handler cannot itself
      // throw and lose the failure result.
      const trajectoryDir =
        typeof error === "object" && error !== null
          ? (error as { trajectoryDir?: string }).trajectoryDir
          : undefined;
      return {
        _success: false,
        error,
        trajectoryDir,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }
  },
);

/**
 * Format a process score for the result log line.
 *
 * @param score - The verifier's process score, if any.
 * @returns The score with two decimals, or "n/a" when it is missing or not a
 *   finite number (so the log never shows "NaN" or "Infinity").
 */
function formatProcessScore(score: number | undefined): string {
  return typeof score === "number" && Number.isFinite(score)
    ? score.toFixed(2)
    : "n/a";
}