{"date":"2026-06-23T08:55:19Z","drift":[{"model":"openai/gpt-5.4-mini","now":2.25,"prompt":"spatial-1","type":"SCORE_DROP","was":3.67},{"model":"anthropic/claude-sonnet-4-6","now":4.75,"prompt":"common-sense-1","type":"SCORE_RISE","was":3.67},{"model":"anthropic/claude-haiku-4-5","now":false,"prompt":"common-sense-1","type":"REGRESSION","was":true},{"model":"gemini/gemini-2.5-flash","now":1.75,"prompt":"causality-1","type":"SCORE_DROP","was":3.5}],"headline":"claude-haiku-4-5 lost common-sense-1. gpt-5.4-mini dropped on spatial-1; gemini-2.5-flash dropped on causality-1. gpt-5.4-mini failing spatial-1; gemini-2.5-flash failing causality-1. claude-sonnet-4-6 scores rising.","previous":{"anthropic/claude-haiku-4-5":{"ambiguity-1":{"correct":true,"score":4.17},"causality-1":{"correct":true,"score":4.83},"code-1":{"correct":true,"score":4.67},"common-sense-1":{"correct":true,"score":3.33},"logic-1":{"correct":true,"score":5},"math-1":{"correct":true,"score":5},"spatial-1":{"correct":true,"score":5}},"anthropic/claude-opus-4-6":{"ambiguity-1":{"correct":true,"score":5},"causality-1":{"correct":true,"score":5},"code-1":{"correct":true,"score":4.67},"common-sense-1":{"correct":true,"score":4.33},"logic-1":{"correct":true,"score":5},"math-1":{"correct":true,"score":5},"spatial-1":{"correct":true,"score":5}},"anthropic/claude-sonnet-4-6":{"ambiguity-1":{"correct":true,"score":4.33},"causality-1":{"correct":true,"score":5},"code-1":{"correct":true,"score":4.67},"common-sense-1":{"correct":true,"score":3.67},"logic-1":{"correct":true,"score":4.83},"math-1":{"correct":true,"score":5},"spatial-1":{"correct":true,"score":5}},"gemini/gemini-2.5-flash":{"ambiguity-1":{"correct":true,"score":4.33},"causality-1":{"correct":false,"score":3.5},"code-1":{"correct":true,"score":4.83},"common-sense-1":{"correct":true,"score":4.33},"logic-1":{"correct":true,"score":4.83},"math-1":{"correct":true,"score":5},"spatial-1":{"correct":true,"score":5}},"gemini/gemini-2.5-pro":{"ambiguity-1":{"correct":true,"score":4.33},"causality-1":{"correct":true,"score":4.67},"code-1":{"correct":true,"score":4.83},"common-sense-1":{"correct":true,"score":5},"logic-1":{"correct":true,"score":5},"math-1":{"correct":true,"score":5},"spatial-1":{"correct":true,"score":5}},"ollama/llama3":{"ambiguity-1":{"correct":null,"score":null},"causality-1":{"correct":null,"score":null},"code-1":{"correct":null,"score":null},"common-sense-1":{"correct":null,"score":null},"logic-1":{"correct":null,"score":null},"math-1":{"correct":null,"score":null},"spatial-1":{"correct":null,"score":null}},"openai/gpt-5.4":{"ambiguity-1":{"correct":true,"score":4.5},"causality-1":{"correct":true,"score":4.83},"code-1":{"correct":true,"score":4.83},"common-sense-1":{"correct":true,"score":4.33},"logic-1":{"correct":true,"score":4.83},"math-1":{"correct":true,"score":4.67},"spatial-1":{"correct":true,"score":5}},"openai/gpt-5.4-mini":{"ambiguity-1":{"correct":true,"score":4.67},"causality-1":{"correct":true,"score":4.67},"code-1":{"correct":true,"score":4.67},"common-sense-1":{"correct":true,"score":4.33},"logic-1":{"correct":true,"score":4.83},"math-1":{"correct":true,"score":5},"spatial-1":{"correct":false,"score":3.67}}},"run_id":"2026-06-23T13-55-19","scorecard":{"anthropic/claude-haiku-4-5":{"ambiguity-1":{"correct":true,"score":4.5},"causality-1":{"correct":true,"score":4.75},"code-1":{"correct":true,"score":4.5},"common-sense-1":{"correct":false,"score":3},"logic-1":{"correct":true,"score":5},"math-1":{"correct":true,"score":5},"spatial-1":{"correct":true,"score":5}},"anthropic/claude-opus-4-6":{"ambiguity-1":{"correct":true,"score":5},"causality-1":{"correct":true,"score":5},"code-1":{"correct":true,"score":4.5},"common-sense-1":{"correct":true,"score":4.5},"logic-1":{"correct":true,"score":5},"math-1":{"correct":true,"score":5},"spatial-1":{"correct":true,"score":5}},"anthropic/claude-sonnet-4-6":{"ambiguity-1":{"correct":true,"score":4.75},"causality-1":{"correct":true,"score":5},"code-1":{"correct":true,"score":4.5},"common-sense-1":{"correct":true,"score":4.75},"logic-1":{"correct":true,"score":5},"math-1":{"correct":true,"score":5},"spatial-1":{"correct":true,"score":5}},"gemini/gemini-2.5-flash":{"ambiguity-1":{"correct":true,"score":4.5},"causality-1":{"correct":false,"score":1.75},"code-1":{"correct":true,"score":4.75},"common-sense-1":{"correct":true,"score":3.75},"logic-1":{"correct":true,"score":4.67},"math-1":{"correct":true,"score":5},"spatial-1":{"correct":true,"score":5}},"gemini/gemini-2.5-pro":{"ambiguity-1":{"correct":true,"score":4.75},"causality-1":{"correct":true,"score":5},"code-1":{"correct":true,"score":4.75},"common-sense-1":{"correct":true,"score":4.75},"logic-1":{"correct":true,"score":5},"math-1":{"correct":true,"score":5},"spatial-1":{"correct":true,"score":5}},"ollama/llama3":{"ambiguity-1":{"correct":null,"score":null},"causality-1":{"correct":null,"score":null},"code-1":{"correct":null,"score":null},"common-sense-1":{"correct":null,"score":null},"logic-1":{"correct":null,"score":null},"math-1":{"correct":null,"score":null},"spatial-1":{"correct":null,"score":null}},"openai/gpt-5.4":{"ambiguity-1":{"correct":true,"score":4.5},"causality-1":{"correct":true,"score":4.75},"code-1":{"correct":true,"score":4.75},"common-sense-1":{"correct":true,"score":4.75},"logic-1":{"correct":true,"score":5},"math-1":{"correct":true,"score":5},"spatial-1":{"correct":true,"score":5}},"openai/gpt-5.4-mini":{"ambiguity-1":{"correct":true,"score":4.5},"causality-1":{"correct":true,"score":5},"code-1":{"correct":true,"score":4.5},"common-sense-1":{"correct":true,"score":4.5},"logic-1":{"correct":true,"score":4.83},"math-1":{"correct":true,"score":4.83},"spatial-1":{"correct":false,"score":2.25}}},"status":{"anthropic/claude-haiku-4-5":"down","anthropic/claude-opus-4-6":"stable","anthropic/claude-sonnet-4-6":"up","gemini/gemini-2.5-flash":"down","gemini/gemini-2.5-pro":"stable","openai/gpt-5.4":"stable","openai/gpt-5.4-mini":"down"}}