A comprehensive benchmarking tool for evaluating LLMs as agents
AgentBench is a comprehensive benchmarking tool developed by the Institute for Data Science at Tsinghua University, designed to evaluate the capabilities of large language models (LLMs) as agents. This tool integrates multiple evaluation dimensions, enabling researchers and developers to gain a deep understanding of LLM performance across different tasks, thereby promoting the development of LLM technology.
This is the machine-readable structured data for this agent. AI systems and search engines use this to understand the agent's capabilities.
[
{
"@context": "https://schema.org",
"@type": "SoftwareApplication",
"@id": "https://agentsignals.ai/agents/agentbench",
"name": "AgentBench",
"description": "AgentBench is a comprehensive benchmarking tool developed by the Institute for Data Science at Tsinghua University, designed to evaluate the capabilities of large language models (LLMs) as agents. This tool integrates multiple evaluation dimensions, enabling researchers and developers to gain a deep understanding of LLM performance across different tasks, thereby promoting the development of LLM technology.",
"url": "https://agentsignals.ai/agents/agentbench",
"applicationCategory": "Research",
"operatingSystem": "GitHub",
"sameAs": "https://github.com/THUDM/AgentBench",
"installUrl": "https://github.com/THUDM/AgentBench",
"offers": {
"@type": "Offer",
"price": "0",
"priceCurrency": "USD",
"description": "Free",
"availability": "https://schema.org/InStock"
},
"featureList": [
"Multi-dimensional evaluation metrics",
"Supports multiple large models",
"Open-source"
],
"datePublished": "2025-12-05T17:13:59.922031+00:00",
"dateModified": "2025-12-19T05:08:34.327767+00:00",
"publisher": {
"@type": "Organization",
"name": "Agent Signals",
"url": "https://agentsignals.ai"
}
},
{
"@context": "https://schema.org",
"@type": "BreadcrumbList",
"itemListElement": [
{
"@type": "ListItem",
"position": 1,
"name": "Home",
"item": "https://agentsignals.ai"
},
{
"@type": "ListItem",
"position": 2,
"name": "Agents",
"item": "https://agentsignals.ai/agents"
},
{
"@type": "ListItem",
"position": 3,
"name": "AgentBench",
"item": "https://agentsignals.ai/agents/agentbench"
}
]
},
{
"@context": "https://schema.org",
"@type": "FAQPage",
"mainEntity": [
{
"@type": "Question",
"name": "What is AgentBench?",
"acceptedAnswer": {
"@type": "Answer",
"text": "A comprehensive benchmarking tool for evaluating LLMs as agents"
}
},
{
"@type": "Question",
"name": "What features does AgentBench offer?",
"acceptedAnswer": {
"@type": "Answer",
"text": "Multi-dimensional evaluation metrics, Supports multiple large models, Open-source"
}
},
{
"@type": "Question",
"name": "What are the use cases for AgentBench?",
"acceptedAnswer": {
"@type": "Answer",
"text": "Academic Research, Model Optimization, Technical Evaluation"
}
},
{
"@type": "Question",
"name": "What are the advantages of AgentBench?",
"acceptedAnswer": {
"@type": "Answer",
"text": "Comprehensive evaluation system, Easy-to-use API, Community support and continuous updates"
}
},
{
"@type": "Question",
"name": "What are the limitations of AgentBench?",
"acceptedAnswer": {
"@type": "Answer",
"text": "Requires some technical background, Relatively high resource consumption"
}
}
]
}
]