<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/">
  <channel>
    <title>Evals on Shane Caldwell</title>
    <link>https://hackbot.dad/tags/evals/</link>
    <description>Recent content in Evals on Shane Caldwell</description>
    <image>
      <title>Shane Caldwell</title>
      <url>https://hackbot.dad/</url>
      <link>https://hackbot.dad/</link>
    </image>
    <generator>Hugo -- 0.146.2</generator>
    <language>en-us</language>
    <lastBuildDate>Mon, 20 Apr 2026 00:00:00 +0000</lastBuildDate>
    <atom:link href="https://hackbot.dad/tags/evals/index.xml" rel="self" type="application/rss+xml" />
    <item>
      <title>No Autonomy Without Scalable Oversight</title>
      <link>https://hackbot.dad/writing/no-autonomy-without-scalable-oversight/</link>
      <pubDate>Mon, 20 Apr 2026 00:00:00 +0000</pubDate>
      <guid>https://hackbot.dad/writing/no-autonomy-without-scalable-oversight/</guid>
      <description>What to expect as we enter the Year of The Judge</description>
    </item>
    <item>
      <title>ProofJudge: Can we align vibe-proving with human taste?</title>
      <link>https://hackbot.dad/writing/proofjudge-and-taste/</link>
      <pubDate>Sun, 29 Mar 2026 00:00:00 +0000</pubDate>
      <guid>https://hackbot.dad/writing/proofjudge-and-taste/</guid>
      <description>Towards measuring alignment with human taste in autoformalization with judge agents.</description>
    </item>
    <item>
      <title>The Tests All Pass</title>
      <link>https://hackbot.dad/writing/tests-all-pass/</link>
      <pubDate>Sat, 14 Mar 2026 00:00:00 +0000</pubDate>
      <guid>https://hackbot.dad/writing/tests-all-pass/</guid>
      <description>METR's SWE-bench analysis shows us taste isn't verifiable.</description>
    </item>
    <item>
      <title>Offsec Evals: Growing Up In The Dark Forest</title>
      <link>https://hackbot.dad/writing/offsec-evals-dark-forest/</link>
      <pubDate>Tue, 28 Oct 2025 00:00:00 +0000</pubDate>
      <guid>https://hackbot.dad/writing/offsec-evals-dark-forest/</guid>
      <description>If you contribute a public benchmark, are you giving free capability to your competitors?</description>
    </item>
    <item>
      <title>GPT-5 is Good, Actually: The Agony and Ecstasy of Public Benchmarks</title>
      <link>https://hackbot.dad/writing/agony-and-ecstasy-evals/</link>
      <pubDate>Sun, 17 Aug 2025 00:00:00 +0000</pubDate>
      <guid>https://hackbot.dad/writing/agony-and-ecstasy-evals/</guid>
      <description>An attempt to explain why benchmarks are either bad or secret, and why the bar charts don't matter so much.</description>
    </item>
  </channel>
</rss>
