[
  {
    "id": 1,
    "name": "Claude Opus 4.7 (thinking)",
    "company": "Anthropic",
    "apiName": "claude-opus-4-7-thinking",
    "release_date": "2026-04-01",
    "elo": 1502,
    "raw": {
      "gpqa_diamond": 0.9420000000000001,
      "hle": 0.396,
      "arc_agi_2": 0.683,
      "swe_bench_verified": 0.82,
      "livebench": 0.7691,
      "mmmu_pro": null,
      "tau_bench_retail": null,
      "tau_bench_airline": null,
      "aider_polyglot": null,
      "aa_intelligence_index": 57,
      "osworld": 0.78,
      "browsecomp": 0.793,
      "swe_bench_pro": 0.643,
      "frontier_math": 0.43799999999999994,
      "terminal_bench": 0.6854
    },
    "sources": {
      "lmsys_arena_elo": "https://lmarena.ai/leaderboard",
      "gpqa_diamond": "https://benchlm.ai/benchmarks/gpqaDiamond",
      "hle": "https://artificialanalysis.ai/evaluations/humanitys-last-exam",
      "arc_agi_2": "https://arcprize.org/leaderboard",
      "livebench": "https://livebench.ai",
      "aa_intelligence_index": "https://artificialanalysis.ai/",
      "osworld": "https://benchlm.ai/benchmarks/osWorld",
      "browsecomp": "https://www.vellum.ai/blog/claude-opus-4-7-benchmarks-explained",
      "swe_bench_pro": "https://benchlm.ai/benchmarks/swePro",
      "swe_bench_verified": "https://www.vals.ai/benchmarks/swebench",
      "frontier_math": "https://benchlm.ai/benchmarks/frontierMath",
      "terminal_bench": "https://www.vals.ai/benchmarks/terminal-bench-2"
    },
    "eval_dates": {
      "lmsys_arena_elo": "2026-05-02",
      "gpqa_diamond": "2026-05-11",
      "hle": "2026-06-05",
      "arc_agi_2": "2026-04-16",
      "livebench": "2026-01-08",
      "aa_intelligence_index": "2026-04-30",
      "osworld": "2026-05-11",
      "browsecomp": "2026-04-16",
      "swe_bench_pro": "2026-05-11",
      "swe_bench_verified": "2026-04-30",
      "frontier_math": "2026-05-11",
      "terminal_bench": "2026-05-06"
    },
    "pricing": {
      "input_per_mtok": 5.0,
      "output_per_mtok": 25.0,
      "cached_input_per_mtok": 0.5,
      "currency": "USD",
      "source": "https://platform.claude.com/docs/en/about-claude/pricing",
      "as_of": "2026-06-04",
      "tier_note": "Standard tier; same as 4.8"
    }
  },
  {
    "id": 2,
    "name": "Claude Opus 4.7",
    "company": "Anthropic",
    "apiName": "claude-opus-4-7",
    "release_date": "2026-04-01",
    "elo": 1492,
    "manual_tier_override_reason": "Lifted 2026-05-23 (v1.5.1) after Round 26 AA harvest. Override original premise ('no base-distinct cells') no longer holds. AA's 'claude-opus-4-7-non-reasoning' slug provides T1 base-distinct evaluations: gpqa_diamond=0.885, aa_intelligence_index=51.8. Combined with existing arc_agi_2 + simple_bench cells, base variant now meets PROVISIONAL/RANKED thresholds. HLE not added pending methodology config audit.",
    "raw": {
      "gpqa_diamond": 0.885,
      "hle": null,
      "arc_agi_2": 0.683,
      "swe_bench_verified": null,
      "livebench": null,
      "mmmu_pro": null,
      "tau_bench_retail": null,
      "tau_bench_airline": null,
      "aider_polyglot": null,
      "aa_intelligence_index": 51.8,
      "simple_bench": 0.617,
      "mcp_atlas": 0.791,
      "hil_bench": 0.2767
    },
    "sources": {
      "lmsys_arena_elo": "https://lmarena.ai/leaderboard",
      "arc_agi_2": "https://arcprize.org/leaderboard",
      "simple_bench": "https://simple-bench.com/",
      "mcp_atlas": "https://scale.com/leaderboard",
      "hil_bench": "https://scale.com/leaderboard",
      "gpqa_diamond": "https://artificialanalysis.ai/models/claude-opus-4-7-non-reasoning",
      "aa_intelligence_index": "https://artificialanalysis.ai/models/claude-opus-4-7-non-reasoning"
    },
    "eval_dates": {
      "lmsys_arena_elo": "2026-05-02",
      "arc_agi_2": "2026-04-16",
      "simple_bench": "2026-05-04",
      "mcp_atlas": "2026-04-16",
      "hil_bench": "2026-04-16",
      "gpqa_diamond": "2026-05-23",
      "aa_intelligence_index": "2026-05-23"
    },
    "pricing": {
      "input_per_mtok": 5.0,
      "output_per_mtok": 25.0,
      "cached_input_per_mtok": 0.5,
      "currency": "USD",
      "source": "https://platform.claude.com/docs/en/about-claude/pricing",
      "as_of": "2026-06-04",
      "tier_note": "Standard tier; base billed same as thinking"
    }
  },
  {
    "id": 5,
    "name": "ChatGPT 5.5 (Pro)",
    "company": "OpenAI",
    "apiName": "gpt-5.5-pro",
    "release_date": "2026-04-01",
    "elo": 1488,
    "raw": {
      "gpqa_diamond": 0.936,
      "hle": null,
      "arc_agi_2": 0.846,
      "swe_bench_verified": null,
      "livebench": null,
      "mmmu_pro": 0.821,
      "tau_bench_retail": null,
      "tau_bench_airline": null,
      "aider_polyglot": null,
      "aa_intelligence_index": null,
      "osworld": null,
      "browsecomp": 0.901,
      "swe_bench_pro": null,
      "frontier_math": 0.524,
      "simple_bench": 0.769
    },
    "sources": {
      "lmsys_arena_elo": "https://lmarena.ai/leaderboard",
      "gpqa_diamond": "https://www.vellum.ai/llm-leaderboard",
      "arc_agi_2": "https://arcprize.org/leaderboard",
      "mmmu_pro": "https://mmmu-benchmark.github.io/",
      "browsecomp": "https://openai.com/index/introducing-gpt-5-5/",
      "frontier_math": "https://benchlm.ai/benchmarks/frontierMath",
      "simple_bench": "https://simple-bench.com/"
    },
    "eval_dates": {
      "lmsys_arena_elo": "2026-05-02",
      "gpqa_diamond": "2026-04-23",
      "arc_agi_2": "2026-04-23",
      "mmmu_pro": "2026-03-05",
      "browsecomp": "2026-05-11",
      "frontier_math": "2026-05-11",
      "simple_bench": "2026-05-04"
    },
    "manual_tier_override": "awaiting_verification",
    "manual_tier_override_reason": "Pro variant has only 1 T1 source with explicit 'Pro' row labeling (arcprize.org). OpenAI's release page marks Pro as '-' on multiple agency benchmarks (osworld, swe_bench_pro). Score withheld pending more independent Pro-specific evaluations.",
    "pricing": {
      "input_per_mtok": 30.0,
      "output_per_mtok": 180.0,
      "cached_input_per_mtok": null,
      "currency": "USD",
      "source": "https://developers.openai.com/api/docs/pricing",
      "as_of": "2026-06-04",
      "tier_note": "Premium Pro tier; >272K context uplift"
    }
  },
  {
    "id": 6,
    "name": "ChatGPT 5.5",
    "company": "OpenAI",
    "apiName": "gpt-5.5",
    "release_date": "2026-04-01",
    "elo": 1474,
    "raw": {
      "gpqa_diamond": 0.935,
      "hle": 0.443,
      "arc_agi_2": 0.833,
      "swe_bench_verified": 0.826,
      "livebench": 0.8071,
      "mmmu_pro": 0.81,
      "tau_bench_retail": null,
      "tau_bench_airline": null,
      "aider_polyglot": null,
      "aa_intelligence_index": 60.2,
      "swe_bench_pro": 0.586,
      "osworld": 0.787,
      "browsecomp": 0.844,
      "frontier_math": 0.517,
      "simple_bench": 0.69,
      "hil_bench": 0.291,
      "mcp_atlas": 0.753,
      "terminal_bench": 0.732
    },
    "sources": {
      "lmsys_arena_elo": "https://lmarena.ai/leaderboard",
      "gpqa_diamond": "https://artificialanalysis.ai/evaluations/gpqa-diamond",
      "hle": "https://artificialanalysis.ai/evaluations/humanitys-last-exam",
      "arc_agi_2": "https://arcprize.org/leaderboard",
      "livebench": "https://livebench.ai",
      "swe_bench_pro": "https://benchlm.ai/benchmarks/swePro",
      "osworld": "https://benchlm.ai/benchmarks/osWorld",
      "browsecomp": "https://benchlm.ai/benchmarks/browseComp",
      "mmmu_pro": "https://artificialanalysis.ai/evaluations/mmmu-pro",
      "frontier_math": "https://benchlm.ai/benchmarks/frontierMath",
      "simple_bench": "https://simple-bench.com/",
      "hil_bench": "https://scale.com/leaderboard",
      "mcp_atlas": "https://openai.com/index/introducing-gpt-5-5/",
      "swe_bench_verified": "https://www.vals.ai/benchmarks/swebench",
      "terminal_bench": "https://www.vals.ai/benchmarks/terminal-bench-2",
      "aa_intelligence_index": "https://artificialanalysis.ai/models/gpt-5-5"
    },
    "eval_dates": {
      "lmsys_arena_elo": "2026-05-02",
      "gpqa_diamond": "2026-06-15",
      "hle": "2026-06-05",
      "arc_agi_2": "2026-04-23",
      "livebench": "2026-01-08",
      "swe_bench_pro": "2026-05-11",
      "osworld": "2026-05-11",
      "browsecomp": "2026-06-15",
      "mmmu_pro": "2026-06-15",
      "frontier_math": "2026-05-11",
      "simple_bench": "2026-05-04",
      "hil_bench": "2026-04-23",
      "mcp_atlas": "2026-05-11",
      "swe_bench_verified": "2026-04-30",
      "terminal_bench": "2026-05-06",
      "aa_intelligence_index": "2026-05-23"
    },
    "pricing": {
      "input_per_mtok": 5.0,
      "output_per_mtok": 30.0,
      "cached_input_per_mtok": 0.5,
      "currency": "USD",
      "source": "https://developers.openai.com/api/docs/pricing",
      "as_of": "2026-06-04",
      "tier_note": "Standard short context; >272K = 2x in / 1.5x out"
    }
  },
  {
    "id": 7,
    "name": "ChatGPT 5.4 (Pro)",
    "company": "OpenAI",
    "apiName": "gpt-5.4-pro",
    "release_date": "2026-03-01",
    "elo": 1475,
    "raw": {
      "gpqa_diamond": 0.944,
      "hle": null,
      "arc_agi_2": 0.833,
      "swe_bench_verified": null,
      "livebench": null,
      "mmmu_pro": 0.812,
      "tau_bench_retail": 0.801,
      "tau_bench_airline": null,
      "aider_polyglot": null,
      "aa_intelligence_index": null,
      "osworld": null,
      "browsecomp": 0.893,
      "swe_bench_pro": null,
      "frontier_math": 0.5,
      "multi_nrc": 0.6227,
      "multi_challenge": 0.6923,
      "vtb": 0.2917,
      "gaia": 0.505
    },
    "sources": {
      "lmsys_arena_elo": "https://lmarena.ai/leaderboard",
      "gpqa_diamond": "https://openai.com/index/introducing-gpt-5-4/",
      "arc_agi_2": "https://arcprize.org/leaderboard",
      "mmmu_pro": "https://mmmu-benchmark.github.io/",
      "browsecomp": "https://www.vellum.ai/blog/claude-opus-4-7-benchmarks-explained",
      "frontier_math": "https://benchlm.ai/benchmarks/frontierMath",
      "multi_nrc": "https://scale.com/leaderboard",
      "multi_challenge": "https://scale.com/leaderboard",
      "vtb": "https://scale.com/leaderboard",
      "tau_bench_retail": "https://benchlm.ai/benchmarks/tauBench",
      "gaia": "https://benchlm.ai/benchmarks/gaia"
    },
    "eval_dates": {
      "lmsys_arena_elo": "2026-05-02",
      "gpqa_diamond": "2026-03-05",
      "arc_agi_2": "2026-05-04",
      "mmmu_pro": "2026-03-05",
      "browsecomp": "2026-04-16",
      "frontier_math": "2026-05-11",
      "multi_nrc": "2026-03-05",
      "multi_challenge": "2026-03-05",
      "vtb": "2026-03-05",
      "tau_bench_retail": "2026-05-01",
      "gaia": "2026-05-11"
    },
    "pricing": {
      "input_per_mtok": 30.0,
      "output_per_mtok": 180.0,
      "cached_input_per_mtok": null,
      "currency": "USD",
      "source": "https://developers.openai.com/api/docs/pricing",
      "as_of": "2026-06-04",
      "tier_note": "Premium Pro tier; $60/$270 at >272K"
    }
  },
  {
    "id": 8,
    "name": "ChatGPT 5.4",
    "company": "OpenAI",
    "apiName": "gpt-5.4",
    "release_date": "2026-03-01",
    "elo": 1450,
    "raw": {
      "gpqa_diamond": 0.9279999999999999,
      "hle": 0.416,
      "arc_agi_2": 0.675,
      "swe_bench_verified": 0.782,
      "livebench": 0.8028,
      "mmmu_pro": 0.812,
      "tau_bench_retail": null,
      "tau_bench_airline": null,
      "aider_polyglot": null,
      "aa_intelligence_index": 56.8,
      "swe_bench_pro": 0.5770000000000001,
      "osworld": 0.75,
      "browsecomp": 0.827,
      "frontier_math": 0.476,
      "gaia": 0.48200000000000004,
      "terminal_bench": 0.5843
    },
    "sources": {
      "lmsys_arena_elo": "https://lmarena.ai/leaderboard",
      "gpqa_diamond": "https://benchlm.ai/benchmarks/gpqaDiamond",
      "arc_agi_2": "https://arcprize.org/leaderboard",
      "hle": "https://artificialanalysis.ai/evaluations/humanitys-last-exam",
      "swe_bench_pro": "https://benchlm.ai/benchmarks/swePro",
      "osworld": "https://benchlm.ai/benchmarks/osWorld",
      "browsecomp": "https://benchlm.ai/benchmarks/browseComp",
      "mmmu_pro": "https://benchlm.ai/benchmarks/mmmuPro",
      "frontier_math": "https://openai.com/index/introducing-gpt-5-4/",
      "livebench": "https://livebench.ai",
      "gaia": "https://benchlm.ai/benchmarks/gaia",
      "swe_bench_verified": "https://www.vals.ai/benchmarks/swebench",
      "terminal_bench": "https://www.vals.ai/benchmarks/terminal-bench-2",
      "aa_intelligence_index": "https://artificialanalysis.ai/models/gpt-5-4"
    },
    "eval_dates": {
      "lmsys_arena_elo": "2026-05-02",
      "gpqa_diamond": "2026-05-11",
      "arc_agi_2": "2026-03-04",
      "hle": "2026-06-05",
      "swe_bench_pro": "2026-05-11",
      "osworld": "2026-05-11",
      "browsecomp": "2026-06-15",
      "mmmu_pro": "2026-05-11",
      "frontier_math": "2026-03-05",
      "livebench": "2026-01-08",
      "gaia": "2026-05-11",
      "swe_bench_verified": "2026-04-30",
      "terminal_bench": "2026-05-06",
      "aa_intelligence_index": "2026-05-23"
    },
    "pricing": {
      "input_per_mtok": 2.5,
      "output_per_mtok": 15.0,
      "cached_input_per_mtok": 0.25,
      "currency": "USD",
      "source": "https://developers.openai.com/api/docs/pricing",
      "as_of": "2026-06-04",
      "tier_note": "Standard short context; long-context uplift"
    }
  },
  {
    "id": 9,
    "name": "Gemini 3.1 Pro (preview)",
    "company": "Google DeepMind",
    "apiName": "gemini-3.1-pro-preview",
    "release_date": "2026-02-01",
    "elo": 1487,
    "raw": {
      "gpqa_diamond": 0.941,
      "hle": 0.447,
      "arc_agi_2": 0.771,
      "swe_bench_verified": 0.788,
      "livebench": 0.7993,
      "mmmu_pro": 0.82,
      "tau_bench_retail": 0.768,
      "tau_bench_airline": null,
      "aider_polyglot": null,
      "aa_intelligence_index": 57.2,
      "osworld": null,
      "browsecomp": 0.859,
      "swe_bench_pro": 0.542,
      "aime_2025": 0.9813,
      "simple_bench": 0.796,
      "mcp_atlas": 0.782,
      "multi_nrc": 0.6474,
      "multi_challenge": 0.7137,
      "vtb": 0.2897,
      "gaia": 0.461,
      "terminal_bench": 0.6742
    },
    "sources": {
      "lmsys_arena_elo": "https://lmarena.ai/leaderboard",
      "gpqa_diamond": "https://artificialanalysis.ai/evaluations/gpqa-diamond",
      "hle": "https://artificialanalysis.ai/evaluations/humanitys-last-exam",
      "arc_agi_2": "https://arcprize.org/leaderboard",
      "livebench": "https://livebench.ai",
      "mmmu_pro": "https://artificialanalysis.ai/evaluations/mmmu-pro",
      "aa_intelligence_index": "https://artificialanalysis.ai/",
      "tau_bench_retail": "https://benchlm.ai/benchmarks/tauBench",
      "browsecomp": "https://evolink.ai/blog/claude-opus-4-6-vs-gemini-3-1-pro",
      "swe_bench_pro": "https://www.vellum.ai/blog/claude-opus-4-7-benchmarks-explained",
      "aime_2025": "https://www.vals.ai/benchmarks/aime",
      "simple_bench": "https://simple-bench.com/",
      "mcp_atlas": "https://scale.com/leaderboard",
      "multi_nrc": "https://scale.com/leaderboard",
      "multi_challenge": "https://scale.com/leaderboard",
      "vtb": "https://scale.com/leaderboard",
      "gaia": "https://benchlm.ai/benchmarks/gaia",
      "swe_bench_verified": "https://www.vals.ai/benchmarks/swebench",
      "terminal_bench": "https://www.vals.ai/benchmarks/terminal-bench-2"
    },
    "eval_dates": {
      "lmsys_arena_elo": "2026-05-02",
      "gpqa_diamond": "2026-06-15",
      "hle": "2026-06-05",
      "arc_agi_2": "2026-02-19",
      "livebench": "2026-01-08",
      "mmmu_pro": "2026-06-15",
      "aa_intelligence_index": "2026-06-15",
      "tau_bench_retail": "2026-05-02",
      "browsecomp": "2026-03-27",
      "swe_bench_pro": "2026-04-16",
      "aime_2025": "2026-04-16",
      "simple_bench": "2026-05-04",
      "mcp_atlas": "2026-05-04",
      "multi_nrc": "2026-05-04",
      "multi_challenge": "2026-05-04",
      "vtb": "2026-05-04",
      "gaia": "2026-05-11",
      "swe_bench_verified": "2026-04-30",
      "terminal_bench": "2026-05-06"
    },
    "pricing": {
      "input_per_mtok": 2.0,
      "output_per_mtok": 12.0,
      "cached_input_per_mtok": 0.2,
      "currency": "USD",
      "source": "https://ai.google.dev/gemini-api/docs/pricing",
      "as_of": "2026-06-04",
      "tier_note": "Paid tier <=200K; >200K = $4/$18; cache storage $4.50/Mtok/hr"
    }
  },
  {
    "id": 10,
    "name": "Gemini 3 Pro",
    "company": "Google DeepMind",
    "apiName": "gemini-3-pro",
    "release_date": "2026-01-01",
    "elo": 1486,
    "raw": {
      "gpqa_diamond": 0.908,
      "hle": null,
      "arc_agi_2": 0.311,
      "swe_bench_verified": 0.764,
      "livebench": 0.7339,
      "mmmu_pro": 0.81,
      "tau_bench_retail": null,
      "tau_bench_airline": null,
      "aider_polyglot": null,
      "aa_intelligence_index": 48.4,
      "gaia": 0.385,
      "aime_2025": 0.95
    },
    "sources": {
      "lmsys_arena_elo": "https://lmarena.ai/leaderboard",
      "gpqa_diamond": "https://artificialanalysis.ai/models/gemini-3-pro",
      "arc_agi_2": "https://arcprize.org/leaderboard",
      "swe_bench_verified": "https://www.vals.ai/benchmarks/swebench",
      "livebench": "https://livebench.ai",
      "mmmu_pro": "https://mmmu-benchmark.github.io/",
      "gaia": "https://benchlm.ai/benchmarks/gaia",
      "aa_intelligence_index": "https://artificialanalysis.ai/models/gemini-3-pro",
      "aime_2025": "https://matharena.ai"
    },
    "eval_dates": {
      "lmsys_arena_elo": "2026-05-02",
      "gpqa_diamond": "2026-05-23",
      "arc_agi_2": "2026-05-04",
      "swe_bench_verified": "2026-06-15",
      "livebench": "2026-01-08",
      "mmmu_pro": "2025-11-18",
      "gaia": "2026-05-11",
      "aa_intelligence_index": "2026-05-23",
      "aime_2025": "2026-06-15"
    },
    "pricing": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "cached_input_per_mtok": null,
      "currency": "USD",
      "no_public_price": true,
      "source": "https://ai.google.dev/gemini-api/docs/pricing",
      "as_of": "2026-06-04",
      "tier_note": "Superseded by Gemini 3.1 Pro; no distinct current pricing on Google's page"
    }
  },
  {
    "id": 11,
    "name": "Muse Spark",
    "company": "Meta",
    "apiName": "muse-spark",
    "release_date": null,
    "elo": 1487,
    "raw": {
      "gpqa_diamond": 0.884,
      "hle": 0.399,
      "arc_agi_2": 0.425,
      "swe_bench_verified": 0.744,
      "livebench": 0.804,
      "mmmu_pro": 0.81,
      "tau_bench_retail": null,
      "tau_bench_airline": null,
      "aider_polyglot": null,
      "aa_intelligence_index": 52.2,
      "frontier_math": 0.38,
      "swe_bench_pro": 0.524,
      "mcp_atlas": 0.822,
      "multi_challenge": 0.7552,
      "terminal_bench": 0.5955
    },
    "sources": {
      "lmsys_arena_elo": "https://lmarena.ai/leaderboard",
      "livebench": "https://livebench.ai",
      "mmmu_pro": "https://artificialanalysis.ai/evaluations/mmmu-pro",
      "aa_intelligence_index": "https://artificialanalysis.ai/",
      "hle": "https://artificialanalysis.ai/evaluations/humanitys-last-exam",
      "frontier_math": "https://ai.meta.com/blog/introducing-muse-spark-msl/",
      "swe_bench_pro": "https://benchlm.ai/benchmarks/swePro",
      "mcp_atlas": "https://scale.com/leaderboard",
      "multi_challenge": "https://scale.com/leaderboard",
      "gpqa_diamond": "https://artificialanalysis.ai/evaluations/gpqa-diamond",
      "arc_agi_2": "https://benchlm.ai/benchmarks/arcAgi2",
      "swe_bench_verified": "https://www.vals.ai/benchmarks/swebench",
      "terminal_bench": "https://www.vals.ai/benchmarks/terminal-bench-2"
    },
    "eval_dates": {
      "lmsys_arena_elo": "2026-05-02",
      "livebench": "2026-01-08",
      "mmmu_pro": "2026-06-15",
      "aa_intelligence_index": "2026-06-15",
      "hle": "2026-06-05",
      "frontier_math": "2026-04-08",
      "swe_bench_pro": "2026-05-11",
      "mcp_atlas": "2026-04-08",
      "multi_challenge": "2026-04-08",
      "gpqa_diamond": "2026-06-15",
      "arc_agi_2": "2026-05-11",
      "swe_bench_verified": "2026-06-05",
      "terminal_bench": "2026-05-06"
    },
    "pricing": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "cached_input_per_mtok": null,
      "currency": "USD",
      "no_public_price": true,
      "source": "https://artificialanalysis.ai/models/muse-spark",
      "as_of": "2026-06-04",
      "tier_note": "Private API preview; no public pricing"
    }
  },
  {
    "id": 12,
    "name": "DeepSeek V4 Pro",
    "company": "DeepSeek",
    "apiName": "deepseek-v4-pro",
    "release_date": null,
    "elo": 1443,
    "raw": {
      "gpqa_diamond": 0.888,
      "hle": 0.359,
      "arc_agi_2": null,
      "swe_bench_verified": 0.774,
      "livebench": 0.7358,
      "mmmu_pro": null,
      "tau_bench_retail": null,
      "tau_bench_airline": null,
      "aider_polyglot": null,
      "aa_intelligence_index": 51.5,
      "browsecomp": 0.834,
      "swe_bench_pro": 0.554,
      "terminal_bench": 0.679
    },
    "sources": {
      "lmsys_arena_elo": "https://lmarena.ai/leaderboard",
      "swe_bench_verified": "https://www.vals.ai/benchmarks/swebench",
      "livebench": "https://livebench.ai",
      "aa_intelligence_index": "https://artificialanalysis.ai/",
      "gpqa_diamond": "https://artificialanalysis.ai/evaluations/gpqa-diamond",
      "hle": "https://artificialanalysis.ai/evaluations/humanitys-last-exam",
      "browsecomp": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro",
      "swe_bench_pro": "https://benchlm.ai/benchmarks/swePro",
      "terminal_bench": "https://benchlm.ai/benchmarks/terminalBench"
    },
    "eval_dates": {
      "lmsys_arena_elo": "2026-05-02",
      "swe_bench_verified": "2026-06-05",
      "livebench": "2026-01-08",
      "aa_intelligence_index": "2026-06-15",
      "gpqa_diamond": "2026-06-15",
      "hle": "2026-06-05",
      "browsecomp": "2026-04-24",
      "swe_bench_pro": "2026-06-15",
      "terminal_bench": "2026-05-11"
    },
    "pricing": {
      "input_per_mtok": 0.435,
      "output_per_mtok": 0.87,
      "cached_input_per_mtok": 0.003625,
      "currency": "USD",
      "source": "https://api-docs.deepseek.com/quick_start/pricing",
      "as_of": "2026-06-04",
      "tier_note": "Cache-miss input"
    }
  },
  {
    "id": 14,
    "name": "GLM 5.1",
    "company": "other",
    "apiName": "glm-5.1",
    "release_date": null,
    "elo": 1475,
    "raw": {
      "gpqa_diamond": 0.867,
      "hle": 0.28,
      "arc_agi_2": null,
      "swe_bench_verified": 0.764,
      "livebench": 0.7018,
      "mmmu_pro": null,
      "tau_bench_retail": null,
      "tau_bench_airline": null,
      "aider_polyglot": null,
      "aa_intelligence_index": 51.4,
      "swe_bench_pro": null
    },
    "sources": {
      "lmsys_arena_elo": "https://lmarena.ai/leaderboard",
      "swe_bench_verified": "https://www.vals.ai/benchmarks/swebench",
      "livebench": "https://livebench.ai",
      "gpqa_diamond": "https://artificialanalysis.ai/evaluations/gpqa-diamond",
      "aa_intelligence_index": "https://artificialanalysis.ai/models/glm-5-1",
      "hle": "https://artificialanalysis.ai/evaluations/humanitys-last-exam"
    },
    "eval_dates": {
      "lmsys_arena_elo": "2026-05-02",
      "swe_bench_verified": "2026-06-05",
      "livebench": "2026-01-08",
      "gpqa_diamond": "2026-06-15",
      "aa_intelligence_index": "2026-05-23",
      "hle": "2026-06-05"
    },
    "pricing": {
      "input_per_mtok": 1.4,
      "output_per_mtok": 4.4,
      "cached_input_per_mtok": 0.26,
      "currency": "USD",
      "source": "https://docs.z.ai/guides/overview/pricing",
      "as_of": "2026-06-04",
      "tier_note": "Standard"
    }
  },
  {
    "id": 16,
    "name": "Kimi K2.6",
    "company": "Moonshot",
    "apiName": "kimi-k2.6",
    "release_date": "2026-04-20",
    "elo": null,
    "raw": {
      "gpqa_diamond": 0.911,
      "hle": 0.359,
      "arc_agi_2": null,
      "swe_bench_verified": 0.762,
      "livebench": 0.7217,
      "mmmu_pro": 0.79,
      "tau_bench_retail": null,
      "tau_bench_airline": null,
      "aider_polyglot": null,
      "aa_intelligence_index": 53.9,
      "osworld": 0.731,
      "browsecomp": 0.832,
      "swe_bench_pro": 0.586,
      "aime_2025": 0.991,
      "simple_bench": 0.468,
      "terminal_bench": 0.573
    },
    "sources": {
      "gpqa_diamond": "https://artificialanalysis.ai/evaluations/gpqa-diamond",
      "hle": "https://artificialanalysis.ai/evaluations/humanitys-last-exam",
      "swe_bench_verified": "https://www.vals.ai/benchmarks/swebench",
      "mmmu_pro": "https://artificialanalysis.ai/evaluations/mmmu-pro",
      "aa_intelligence_index": "https://artificialanalysis.ai/",
      "osworld": "https://os-world.github.io/",
      "browsecomp": "https://benchlm.ai/benchmarks/browseComp",
      "swe_bench_pro": "https://benchlm.ai/benchmarks/swePro",
      "aime_2025": "https://www.vellum.ai/llm-leaderboard",
      "livebench": "https://livebench.ai",
      "simple_bench": "https://simple-bench.com/",
      "terminal_bench": "https://www.vals.ai/benchmarks/terminal-bench-2"
    },
    "eval_dates": {
      "gpqa_diamond": "2026-06-15",
      "hle": "2026-06-05",
      "swe_bench_verified": "2026-06-05",
      "mmmu_pro": "2026-06-15",
      "aa_intelligence_index": "2026-06-15",
      "osworld": "2026-04-19",
      "browsecomp": "2026-06-15",
      "swe_bench_pro": "2026-05-11",
      "aime_2025": "2026-04-23",
      "livebench": "2026-01-08",
      "simple_bench": "2026-05-04",
      "terminal_bench": "2026-05-06"
    },
    "pricing": {
      "input_per_mtok": 0.95,
      "output_per_mtok": 4.0,
      "cached_input_per_mtok": 0.16,
      "currency": "USD",
      "source": "https://platform.moonshot.ai/",
      "as_of": "2026-06-04",
      "tier_note": "Cache-miss input"
    }
  },
  {
    "id": 17,
    "name": "Grok 4.3",
    "company": "xAI",
    "apiName": "grok-4.3",
    "release_date": "2026-04-30",
    "elo": null,
    "raw": {
      "gpqa_diamond": 0.9141,
      "hle": 0.35,
      "arc_agi_2": null,
      "swe_bench_verified": 0.714,
      "livebench": 0.6674,
      "mmmu_pro": 0.78,
      "tau_bench_retail": null,
      "tau_bench_airline": null,
      "aider_polyglot": null,
      "aa_intelligence_index": 53.2,
      "osworld": null,
      "browsecomp": null,
      "swe_bench_pro": null,
      "aime_2025": null,
      "terminal_bench": 0.4345
    },
    "sources": {
      "gpqa_diamond": "https://www.vals.ai/models/grok_grok-4.3",
      "swe_bench_verified": "https://www.vals.ai/models/grok_grok-4.3",
      "livebench": "https://livebench.ai",
      "mmmu_pro": "https://artificialanalysis.ai/evaluations/mmmu-pro",
      "aa_intelligence_index": "https://artificialanalysis.ai/",
      "terminal_bench": "https://www.vals.ai/benchmarks/terminal-bench-2",
      "hle": "https://artificialanalysis.ai/evaluations/humanitys-last-exam"
    },
    "eval_dates": {
      "gpqa_diamond": "2026-04-30",
      "swe_bench_verified": "2026-04-30",
      "livebench": "2026-01-08",
      "mmmu_pro": "2026-06-15",
      "aa_intelligence_index": "2026-06-15",
      "terminal_bench": "2026-05-06",
      "hle": "2026-06-05"
    },
    "pricing": {
      "input_per_mtok": 1.25,
      "output_per_mtok": 2.5,
      "cached_input_per_mtok": 0.2,
      "currency": "USD",
      "source": "https://x.ai/api",
      "as_of": "2026-06-04",
      "tier_note": "Standard; higher >200K context"
    }
  },
  {
    "id": 18,
    "name": "Qwen 3.7 Max",
    "company": "Alibaba",
    "apiName": "qwen-3-7-max",
    "release_date": "2026-05-19",
    "elo": 1474,
    "raw": {
      "gpqa_diamond": 0.923,
      "hle": 0.381,
      "arc_agi_2": null,
      "swe_bench_verified": 0.688,
      "swe_bench_pro": 0.606,
      "terminal_bench": 0.697,
      "aa_intelligence_index": 56.6,
      "mmmu_pro": null,
      "tau_bench_retail": null,
      "tau_bench_airline": null,
      "aider_polyglot": null,
      "osworld": null,
      "browsecomp": null,
      "livebench": 0.7429
    },
    "sources": {
      "gpqa_diamond": "https://artificialanalysis.ai/models/qwen3-7-max",
      "swe_bench_verified": "https://www.vals.ai/benchmarks/swebench",
      "swe_bench_pro": "https://artificialanalysis.ai/models/qwen3-7-max",
      "terminal_bench": "https://artificialanalysis.ai/models/qwen3-7-max",
      "aa_intelligence_index": "https://artificialanalysis.ai/models/qwen3-7-max",
      "hle": "https://artificialanalysis.ai/evaluations/humanitys-last-exam",
      "livebench": "https://livebench.ai"
    },
    "eval_dates": {
      "gpqa_diamond": "2026-05-23",
      "swe_bench_verified": "2026-06-15",
      "swe_bench_pro": "2026-05-23",
      "terminal_bench": "2026-05-23",
      "aa_intelligence_index": "2026-05-23",
      "hle": "2026-06-05",
      "livebench": "2026-06-15"
    },
    "pricing": {
      "input_per_mtok": 2.5,
      "output_per_mtok": 7.5,
      "cached_input_per_mtok": null,
      "currency": "USD",
      "source": "https://www.alibabacloud.com/help/en/model-studio/",
      "as_of": "2026-06-04",
      "tier_note": "Alibaba Cloud Model Studio standard; global ~$1.65/$4.95"
    }
  },
  {
    "id": 19,
    "name": "Cursor Composer 2.5",
    "company": "Cursor",
    "apiName": "cursor-composer-2-5",
    "release_date": "2026-05-18",
    "elo": null,
    "model_type": "system",
    "model_type_note": "System entry: scores reflect the full Cursor product (Composer 2.5 weights + Cursor's agent scaffolding + tools + environment), not weights alone. Comparable for end-user capability questions; not directly comparable to raw-API scores on the same benchmark.",
    "raw": {
      "gpqa_diamond": null,
      "hle": null,
      "arc_agi_2": null,
      "swe_bench_verified": null,
      "swe_bench_pro": 0.47,
      "terminal_bench": 0.693,
      "aa_intelligence_index": null,
      "mmmu_pro": null,
      "tau_bench_retail": null,
      "tau_bench_airline": null,
      "aider_polyglot": null,
      "osworld": null,
      "browsecomp": null,
      "livebench": null
    },
    "sources": {
      "swe_bench_pro": "https://artificialanalysis.ai/agents/coding-agents",
      "terminal_bench": "https://artificialanalysis.ai/agents/coding-agents"
    },
    "eval_dates": {
      "swe_bench_pro": "2026-05-22",
      "terminal_bench": "2026-05-22"
    }
  },
  {
    "id": 20,
    "name": "MiMo V2.5 Pro",
    "company": "Xiaomi",
    "apiName": "mimo-v2-5-pro",
    "release_date": "2026-04-22",
    "elo": null,
    "raw": {
      "gpqa_diamond": 0.857,
      "hle": 0.338,
      "arc_agi_2": null,
      "swe_bench_verified": 0.74,
      "swe_bench_pro": null,
      "terminal_bench": null,
      "aa_intelligence_index": 53.8,
      "mmmu_pro": null,
      "tau_bench_retail": null,
      "tau_bench_airline": null,
      "aider_polyglot": null,
      "osworld": null,
      "browsecomp": null,
      "livebench": null
    },
    "sources": {
      "gpqa_diamond": "https://artificialanalysis.ai/evaluations/gpqa-diamond",
      "aa_intelligence_index": "https://artificialanalysis.ai/models/mimo-v2-5-pro",
      "hle": "https://artificialanalysis.ai/evaluations/humanitys-last-exam",
      "swe_bench_verified": "https://www.vals.ai/benchmarks/swebench"
    },
    "eval_dates": {
      "gpqa_diamond": "2026-06-15",
      "aa_intelligence_index": "2026-05-23",
      "hle": "2026-06-05",
      "swe_bench_verified": "2026-06-15"
    }
  },
  {
    "id": 21,
    "name": "Claude Opus 4.8 (thinking)",
    "company": "Anthropic",
    "apiName": "claude-opus-4-8-thinking",
    "release_date": "2026-05-28",
    "elo": 1486,
    "raw": {
      "gpqa_diamond": 0.92,
      "hle": 0.457,
      "arc_agi_2": 0.721,
      "swe_bench_verified": 0.886,
      "swe_bench_pro": 0.692,
      "terminal_bench": 0.746,
      "aa_intelligence_index": 61.4,
      "mmmu_pro": null,
      "tau_bench_retail": null,
      "tau_bench_airline": null,
      "aider_polyglot": null,
      "osworld": 0.834,
      "browsecomp": 0.843,
      "livebench": 0.7722
    },
    "sources": {
      "gpqa_diamond": "https://artificialanalysis.ai/models/claude-opus-4-8",
      "aa_intelligence_index": "https://artificialanalysis.ai/models/claude-opus-4-8",
      "swe_bench_verified": "https://www.vals.ai/benchmarks/swebench",
      "swe_bench_pro": "https://benchlm.ai/benchmarks/swePro",
      "osworld": "https://benchlm.ai/benchmarks/osWorld",
      "hle": "https://artificialanalysis.ai/evaluations/humanitys-last-exam",
      "livebench": "https://livebench.ai",
      "arc_agi_2": "https://arcprize.org/leaderboard",
      "browsecomp": "https://benchlm.ai/benchmarks/browseComp",
      "terminal_bench": "https://benchlm.ai/benchmarks/terminalBench"
    },
    "eval_dates": {
      "gpqa_diamond": "2026-05-28",
      "aa_intelligence_index": "2026-05-28",
      "swe_bench_verified": "2026-05-28",
      "swe_bench_pro": "2026-06-15",
      "osworld": "2026-05-28",
      "hle": "2026-06-05",
      "livebench": "2026-06-05",
      "arc_agi_2": "2026-06-01",
      "browsecomp": "2026-06-15",
      "terminal_bench": "2026-06-15"
    },
    "pricing": {
      "input_per_mtok": 5.0,
      "output_per_mtok": 25.0,
      "cached_input_per_mtok": 0.5,
      "currency": "USD",
      "source": "https://platform.claude.com/docs/en/about-claude/pricing",
      "as_of": "2026-06-04",
      "tier_note": "Standard tier (cache write ~$6.25)"
    }
  },
  {
    "id": 22,
    "name": "Claude Fable 5 (thinking)",
    "company": "Anthropic",
    "apiName": "claude-fable-5-thinking",
    "release_date": "2026-06-09",
    "elo": 1510,
    "raw": {
      "gpqa_diamond": 0.9318,
      "hle": 0.53,
      "arc_agi_2": null,
      "aime_2025": null,
      "swe_bench_verified": 0.95,
      "swe_bench_pro": 0.8,
      "terminal_bench": 0.843,
      "osworld": 0.85,
      "browsecomp": 0.869,
      "livebench": 0.7831,
      "mmmu_pro": null,
      "tau_bench_retail": null,
      "tau_bench_airline": null,
      "aider_polyglot": null,
      "aa_intelligence_index": 64.9
    },
    "sources": {
      "gpqa_diamond": "https://www.vals.ai/benchmarks/gpqa-diamond",
      "swe_bench_verified": "https://www.vals.ai/benchmarks/swebench",
      "livebench": "https://livebench.ai",
      "swe_bench_pro": "https://benchlm.ai/models/claude-fable",
      "terminal_bench": "https://benchlm.ai/models/claude-fable",
      "osworld": "https://benchlm.ai/models/claude-fable",
      "browsecomp": "https://benchlm.ai/models/claude-fable",
      "hle": "https://artificialanalysis.ai/evaluations/humanitys-last-exam",
      "aa_intelligence_index": "https://artificialanalysis.ai/"
    },
    "eval_dates": {
      "gpqa_diamond": "2026-06-09",
      "swe_bench_verified": "2026-06-09",
      "livebench": "2026-06-09",
      "swe_bench_pro": "2026-06-09",
      "terminal_bench": "2026-06-09",
      "osworld": "2026-06-09",
      "browsecomp": "2026-06-09",
      "hle": "2026-06-09",
      "aa_intelligence_index": "2026-06-15"
    },
    "pricing": {
      "input_per_mtok": 10.0,
      "output_per_mtok": 50.0,
      "cached_input_per_mtok": 1.0,
      "currency": "USD",
      "source": "https://platform.claude.com/docs/en/about-claude/pricing",
      "as_of": "2026-06-11",
      "tier_note": "Confirmed on official page; 2x Opus 4.8. Cache read $1.00, write $12.50."
    },
    "manual_tier_override": "provisional",
    "manual_tier_override_reason": "Computed tier under the published thresholds is RANKED, but every currently ranked model's reasoning component includes an ARC-AGI-2 result and none has been published for Claude Fable 5 yet (arcprize has not evaluated it). Held at Provisional so the model is not ranked on an easier reasoning basket than its peers. The hold lifts the day arcprize publishes a Claude Fable 5 result."
  },
  {
    "id": 23,
    "name": "MiniMax M3",
    "company": "MiniMax",
    "apiName": "minimax-m3",
    "release_date": "2026-06-01",
    "elo": null,
    "raw": {
      "gpqa_diamond": 0.929,
      "hle": 0.371,
      "arc_agi_2": null,
      "aime_2025": null,
      "swe_bench_verified": 0.75,
      "swe_bench_pro": null,
      "terminal_bench": 0.66,
      "osworld": 0.752,
      "browsecomp": 0.835,
      "livebench": 0.7002,
      "mmmu_pro": 0.8,
      "tau_bench_retail": null,
      "tau_bench_airline": null,
      "aider_polyglot": null,
      "aa_intelligence_index": 54.7
    },
    "sources": {
      "gpqa_diamond": "https://artificialanalysis.ai/models/minimax-m3",
      "hle": "https://artificialanalysis.ai/evaluations/humanitys-last-exam",
      "mmmu_pro": "https://artificialanalysis.ai/models/minimax-m3",
      "swe_bench_verified": "https://www.vals.ai/benchmarks/swebench",
      "osworld": "https://os-world.github.io/",
      "aa_intelligence_index": "https://artificialanalysis.ai/models/minimax-m3",
      "livebench": "https://livebench.ai",
      "browsecomp": "https://benchlm.ai/benchmarks/browseComp",
      "terminal_bench": "https://benchlm.ai/benchmarks/terminalBench"
    },
    "eval_dates": {
      "gpqa_diamond": "2026-06-15",
      "hle": "2026-06-15",
      "mmmu_pro": "2026-06-15",
      "swe_bench_verified": "2026-06-15",
      "osworld": "2026-06-15",
      "aa_intelligence_index": "2026-06-15",
      "livebench": "2026-06-15",
      "browsecomp": "2026-06-15",
      "terminal_bench": "2026-06-15"
    },
    "pricing": {
      "input_per_mtok": 0.3,
      "output_per_mtok": 1.2,
      "cached_input_per_mtok": 0.06,
      "currency": "USD",
      "source": "https://www.minimax.io/platform",
      "as_of": "2026-06-15",
      "tier_note": "Pay-as-you-go standard (<=512k context); permanent 50% off. >512k context = $0.60/$2.40."
    }
  },
  {
    "id": 24,
    "name": "MiniMax M2.7",
    "company": "MiniMax",
    "apiName": "minimax-m2.7",
    "release_date": null,
    "elo": null,
    "raw": {
      "gpqa_diamond": 0.874,
      "hle": 0.281,
      "arc_agi_2": null,
      "aime_2025": null,
      "swe_bench_verified": 0.738,
      "swe_bench_pro": null,
      "terminal_bench": null,
      "osworld": null,
      "browsecomp": null,
      "livebench": null,
      "mmmu_pro": null,
      "tau_bench_retail": null,
      "tau_bench_airline": null,
      "aider_polyglot": null,
      "aa_intelligence_index": 49.6
    },
    "sources": {
      "gpqa_diamond": "https://artificialanalysis.ai/evaluations/gpqa-diamond",
      "hle": "https://artificialanalysis.ai/evaluations/humanitys-last-exam",
      "swe_bench_verified": "https://www.vals.ai/benchmarks/swebench",
      "aa_intelligence_index": "https://artificialanalysis.ai/"
    },
    "eval_dates": {
      "gpqa_diamond": "2026-06-15",
      "hle": "2026-06-15",
      "swe_bench_verified": "2026-06-15",
      "aa_intelligence_index": "2026-06-15"
    },
    "pricing": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "cached_input_per_mtok": null,
      "currency": "USD",
      "no_public_price": true,
      "source": "https://www.minimax.io/platform",
      "as_of": "2026-06-15",
      "tier_note": "Prior MiniMax flagship; pricing not yet harvested."
    }
  },
  {
    "id": 25,
    "name": "GLM 5.2",
    "company": "other",
    "apiName": "glm-5.2",
    "release_date": "2026-06-16",
    "elo": 1471,
    "manual_tier_override": null,
    "manual_tier_override_reason": null,
    "raw": {
      "gpqa_diamond": 0.8561,
      "hle": 0.4,
      "arc_agi_2": null,
      "aime_2025": null,
      "swe_bench_verified": 0.828,
      "swe_bench_pro": null,
      "terminal_bench": 0.6779,
      "osworld": null,
      "browsecomp": null,
      "livebench": 0.7624,
      "mmmu_pro": null,
      "tau_bench_retail": null,
      "tau_bench_airline": null,
      "aider_polyglot": null,
      "aa_intelligence_index": null
    },
    "sources": {
      "hle": "https://artificialanalysis.ai/evaluations/humanitys-last-exam",
      "gpqa_diamond": "https://www.vals.ai/benchmarks/gpqa-diamond",
      "swe_bench_verified": "https://www.vals.ai/benchmarks/swebench",
      "terminal_bench": "https://www.vals.ai/benchmarks/terminal-bench-2",
      "livebench": "https://livebench.ai",
      "lmsys_arena_elo": "https://lmarena.ai/leaderboard"
    },
    "eval_dates": {
      "hle": "2026-06-17",
      "gpqa_diamond": "2026-06-17",
      "swe_bench_verified": "2026-06-17",
      "terminal_bench": "2026-06-17",
      "livebench": "2026-06-19",
      "lmsys_arena_elo": "2026-06-19"
    },
    "pricing": {
      "input_per_mtok": 1.4,
      "output_per_mtok": 4.4,
      "cached_input_per_mtok": 0.26,
      "currency": "USD",
      "source": "https://docs.z.ai/guides/overview/pricing",
      "as_of": "2026-06-17",
      "tier_note": "Open-weight, 1M context. Same standard tier as GLM 5.1 (in/out per BenchLM launch listing)."
    }
  }
]