AI Agent Observability and Debugging: Building a Transparent Agent System
This article explains why AI agents behave like black boxes, introduces a three‑pillar observability framework (tracing, metrics, logging), demonstrates practical tracing with LangSmith and LangFuse, shows how to instrument agents with custom metrics and evaluate their performance, and shares best‑practice guidelines for production‑ready debugging.
Why Observability?
Traditional software has a deterministic execution path and stack traces for error analysis, while agents rely on LLM‑generated reasoning, making their decision flow uncertain, token consumption opaque, and performance bottlenecks hard to locate. To debug agents effectively, we need to surface the full reasoning chain, token usage, tool calls, and latency.
Three Pillars of Agent Observability
┌─────────────────────────────────────────────────────────────────┐
│                Agent Observability Three Pillars                │
├─────────────────────────────────────────────────────────────────┤
│                                                                 │
│   ┌─────────────┐     ┌─────────────┐     ┌─────────────┐       │
│   │   Tracing   │     │   Metrics   │     │   Logging   │       │
│   │  链路追踪   │     │  指标监控   │     │  日志记录   │       │
│   └─────────────┘     └─────────────┘     └─────────────┘       │
│          │                   │                   │              │
│          ▼                   ▼                   ▼              │
│   • 完整调用链        • Token 消耗        • 思考过程            │
│   • 每步耗时          • 工具调用次数      • 决策日志            │
│   • 错误定位          • 成功率/延迟       • 异常记录            │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘
LangSmith Practical (Strongest Tracing Tool)
Installation & Configuration
pip install langsmith
# config.py
import os
from dotenv import load_dotenv
load_dotenv()
# Turn on LangSmith tracing and select the project that runs are filed under.
# setdefault() keeps any value already present in the environment (from the
# shell or from the load_dotenv() call above); plain assignment would replace
# a real API key loaded from .env with the placeholder below.
os.environ.setdefault("LANGCHAIN_TRACING_V2", "true")
os.environ.setdefault("LANGCHAIN_API_KEY", "your-langsmith-api-key")
os.environ.setdefault("LANGCHAIN_PROJECT", "ai-agent-tutorial")

# Basic Tracing Example
# langsmith_basic.py
from langchain_openai import ChatOpenAI
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain.tools import tool
from langchain.prompts import ChatPromptTemplate
llm = ChatOpenAI(model="gpt-4", temperature=0)
@tool
def get_weather(city: str) -> str:
    """获取天气"""
    # The docstring ("get the weather") is kept verbatim: LangChain's @tool
    # uses it as the tool description that is sent to the model.
    # Demo stub: every city gets the same canned report; nothing is looked up.
    canned_report = "晴,25°C"
    return f"{city}:{canned_report}"
@tool
def calculate(expr: str) -> str:
    """计算"""
    # The docstring ("calculate") is kept verbatim: LangChain's @tool uses it
    # as the tool description that is sent to the model.
    #
    # SECURITY(review): `expr` is text chosen by the model from user input, and
    # eval() runs it as arbitrary Python, not just arithmetic. This is tolerable
    # only in a local demo; swap in a restricted arithmetic parser before this
    # tool is exposed to untrusted users. Any exception raised by the
    # expression propagates to the caller.
    value = eval(expr)
    return str(value)
# Tools the agent is allowed to call.
tools = [get_weather, calculate]

# Prompt: fixed system instruction, the user's input, then a placeholder slot
# named agent_scratchpad.
prompt = ChatPromptTemplate.from_messages([
    ("system", "你是智能助手"),
    ("human", "{input}"),
    ("placeholder", "{agent_scratchpad}"),
])

agent = create_tool_calling_agent(llm, tools, prompt)
executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

# This call will be fully traced
result = executor.invoke({"input": "北京天气怎么样?再算100+200"})
print(result)

# Manual Trace Creation
# langsmith_trace.py
from langsmith import traceable
from langchain_openai import ChatOpenAI
@traceable(name="weather_agent", run_type="chain")
def weather_agent(city: str):
    """Ask the chat model to describe the weather in ``city``; return the reply text."""
    model = ChatOpenAI(model="gpt-4")
    question = f"描述{city}的天气"
    reply = model.invoke(question)
    return reply.content
@traceable(name="multi_step_agent")
def multi_step_task(city: str):
    """Run a two-step task: describe the weather in ``city``, then ask for travel advice.

    Returns the text content of the second model reply.
    """
    # Each step will be traced
    step1 = weather_agent(city)
    # This snippet never defines a module-level `llm` (the only one is a local
    # inside weather_agent), so create the model here instead of relying on a
    # global that does not exist.
    llm = ChatOpenAI(model="gpt-4")
    step2 = llm.invoke(f"基于{step1}给出出行建议")
    return step2.content
# Run the traced two-step task once and show the final advice.
result = multi_step_task("北京")
print(result)

# LangFuse Practical (Open‑Source Alternative)
Installation & Configuration
pip install langfuse
# langfuse_config.py
from langfuse.callback import CallbackHandler
import os
# Placeholder Langfuse credentials and endpoint; replace with real values.
os.environ["LANGFUSE_PUBLIC_KEY"] = "pk-xxx"
os.environ["LANGFUSE_SECRET_KEY"] = "sk-xxx"
os.environ["LANGFUSE_HOST"] = "https://cloud.langfuse.com"

# Created without arguments; presumably it reads the LANGFUSE_* variables set
# above -- confirm against the Langfuse documentation.
langfuse_handler = CallbackHandler()

# Integrating with LangChain
# langfuse_langchain.py
from langchain_openai import ChatOpenAI
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain.tools import tool
from langfuse.callback import CallbackHandler
langfuse_handler = CallbackHandler()


@tool
def get_weather(city: str) -> str:
    """获取天气"""
    # The docstring ("get the weather") was missing in the original snippet.
    # LangChain's @tool uses it as the tool description and rejects a function
    # that has neither a docstring nor an explicit description.
    return f"{city}:晴"


llm = ChatOpenAI(model="gpt-4")
# Call with callback
result = llm.invoke("北京天气怎么样?", config={"callbacks": [langfuse_handler]})

# The original snippet called `agent_executor` without ever creating it, which
# is a NameError. Build it the same way the earlier LangSmith example does.
from langchain.prompts import ChatPromptTemplate

tools = [get_weather]
prompt = ChatPromptTemplate.from_messages([
    ("system", "你是智能助手"),
    ("human", "{input}"),
    ("placeholder", "{agent_scratchpad}"),
])
agent = create_tool_calling_agent(llm, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools)

# Agent call with callback
agent_executor.invoke({"input": "查询天气"}, config={"callbacks": [langfuse_handler]})

# Custom Metrics
A dataclass AgentMetrics records total calls, successes, failures, token usage, tool call counts, durations, and a call history. Methods record_call, get_summary, and print_summary update the counters and display a concise report.
# metrics.py
from dataclasses import dataclass, field
from typing import List, Dict, Any
from datetime import datetime
import json
@dataclass
class AgentMetrics:
    """Agent metrics collector.

    Accumulates call counts, token counts, per-tool call counts and latency
    across calls to :meth:`record_call`, and keeps one history entry per call.
    """

    total_calls: int = 0
    successful_calls: int = 0
    failed_calls: int = 0
    total_tokens: int = 0
    prompt_tokens: int = 0
    completion_tokens: int = 0
    tool_calls: Dict[str, int] = field(default_factory=dict)  # tool name -> times recorded
    total_duration_ms: float = 0
    avg_duration_ms: float = 0  # refreshed on every record_call()
    call_history: List[Dict] = field(default_factory=list)  # one dict per record_call()

    def record_call(self, success: bool, duration_ms: float, tokens: Dict = None, tools: List[str] = None):
        """Record the outcome of one agent call.

        Args:
            success: Whether the call completed without error.
            duration_ms: Wall-clock duration of the call in milliseconds.
            tokens: Optional counts under the keys ``"total"``, ``"prompt"``
                and ``"completion"``; missing keys count as 0.
            tools: Optional names of the tools used; each occurrence
                increments that tool's counter.
        """
        self.total_calls += 1
        if success:
            self.successful_calls += 1
        else:
            self.failed_calls += 1

        self.total_duration_ms += duration_ms
        self.avg_duration_ms = self.total_duration_ms / self.total_calls

        if tokens:
            self.total_tokens += tokens.get("total", 0)
            self.prompt_tokens += tokens.get("prompt", 0)
            self.completion_tokens += tokens.get("completion", 0)

        for name in tools or ():
            self.tool_calls[name] = self.tool_calls.get(name, 0) + 1

        # Store copies so later mutation of the caller's dict/list cannot
        # rewrite history that has already been recorded.
        self.call_history.append({
            "timestamp": datetime.now().isoformat(),
            "success": success,
            "duration_ms": duration_ms,
            "tokens": dict(tokens) if tokens is not None else None,
            "tools": list(tools) if tools is not None else None,
        })

    def get_summary(self) -> Dict:
        """Return aggregate statistics; rates and averages are 0 before any call."""
        calls = self.total_calls
        return {
            "total_calls": calls,
            "success_rate": self.successful_calls / calls if calls > 0 else 0,
            "total_tokens": self.total_tokens,
            "avg_tokens_per_call": self.total_tokens / calls if calls > 0 else 0,
            "avg_duration_ms": self.avg_duration_ms,
            "tool_usage": self.tool_calls,
        }

    def print_summary(self):
        """Print the summary from :meth:`get_summary` to stdout."""
        summary = self.get_summary()
        # The leading "\n" had been split across two physical lines by the
        # extraction, which made the original statement a syntax error.
        print("\n" + "=" * 50)
        print("📊 Agent 指标摘要")
        print("=" * 50)
        print(f"总调用次数: {summary['total_calls']}")
        print(f"成功率: {summary['success_rate']*100:.1f}%")
        print(f"总 Token 消耗: {summary['total_tokens']}")
        print(f"平均 Token/调用: {summary['avg_tokens_per_call']:.0f}")
        print(f"平均耗时: {summary['avg_duration_ms']:.0f}ms")
        print(f"工具调用: {summary['tool_usage']}")


# Module-level collector imported by the instrumented agent.
agent_metrics = AgentMetrics()

# Agent Wrapped with Metrics
# metrics_agent.py
import time
from metrics import agent_metrics
from langchain_openai import ChatOpenAI
from langchain.tools import tool
from langchain.agents import AgentExecutor, create_tool_calling_agent
class ObservableAgent:
    """Agent wrapper that reports every call to the shared ``agent_metrics`` collector."""

    def __init__(self, llm, tools, prompt):
        """Build a tool-calling agent and its executor from the given parts."""
        self.llm = llm
        self.tools = tools
        self.agent = create_tool_calling_agent(llm, tools, prompt)
        # return_intermediate_steps makes the executor report which tools it
        # actually ran, so invoke() can record real usage.
        self.executor = AgentExecutor(
            agent=self.agent,
            tools=tools,
            return_intermediate_steps=True,
        )

    def invoke(self, input_text: str) -> str:
        """Run the agent on ``input_text`` and record metrics for the call.

        Exceptions raised by the executor are caught: the call is recorded as
        failed and an error string is returned instead of raising.

        Returns:
            The agent's ``"output"`` text, or ``"错误: <exception>"`` on failure.
        """
        started = time.perf_counter()
        tool_names = []
        tokens = {}
        success = True
        try:
            result = self.executor.invoke({"input": input_text})
            # Count only the tools that ran. The original appended the name of
            # every registered tool on every call, inflating all counters.
            tool_names = [
                action.tool for action, _ in result.get("intermediate_steps", [])
            ]
            # Rough estimate of about 4 characters per token; not real usage.
            prompt_estimate = len(input_text) // 4
            completion_estimate = len(str(result.get("output", ""))) // 4
            tokens = {
                # The original computed len(input) + len(result) // 4, so
                # "total" was not the sum of the two parts below.
                "total": prompt_estimate + completion_estimate,
                "prompt": prompt_estimate,
                "completion": completion_estimate,
            }
        except Exception as e:
            success = False
            result = {"output": f"错误: {e}"}

        duration = (time.perf_counter() - started) * 1000
        agent_metrics.record_call(
            success=success,
            duration_ms=duration,
            tokens=tokens,
            tools=tool_names,
        )
        return result.get("output", "")

    def get_metrics(self):
        """Return the summary dict of the shared ``agent_metrics`` collector."""
        return agent_metrics.get_summary()

# Evaluation Framework
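A small test suite defines TestCase objects with input, expected output, expected tools, and a token limit. The Evaluator runs each case through the agent, checks that the expected output text appears in the agent's answer (the expected tools and the token limit are stored on each case but are not checked by this implementation), and aggregates total, passed, failed, and pass‑rate statistics.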
# evaluation.py
from typing import List, Dict
from dataclasses import dataclass
@dataclass
class TestCase:
    """A single evaluation case for an agent."""

    # The name starts with "Test", so pytest would try to collect this class
    # wherever it is imported into a test module; opt out explicitly.
    __test__ = False

    input: str  # prompt sent to the agent
    expected_output: str  # text expected in the answer; "" means no text check
    expected_tools: List[str] = None  # tool names the case expects, if any
    max_tokens: int = 500  # token limit for the case
# Baseline cases: a weather lookup, an arithmetic question, and a combined
# request whose empty expected_output means no answer text is required.
EVAL_DATASET = [
    TestCase(
        input="北京天气怎么样?",
        expected_output="晴",
        expected_tools=["get_weather"],
    ),
    TestCase(
        input="100+200等于多少?",
        expected_output="300",
        expected_tools=["calculate"],
    ),
    TestCase(
        input="查天气再算个数学题",
        expected_output="",
        expected_tools=["get_weather", "calculate"],
    ),
]
class Evaluator:
    """Runs test cases through an agent and summarises pass/fail results.

    ``agent`` must provide ``invoke(text) -> str``. Results accumulate on
    ``self.results`` across calls to :meth:`evaluate`.
    """

    def __init__(self, agent):
        self.agent = agent
        self.results = []

    def evaluate(self, test_cases: "List[TestCase]") -> Dict:
        """Run every case through the agent and return the running summary.

        A case whose ``agent.invoke`` raises is recorded as failed, with the
        exception text under an extra ``"error"`` key, so one crashing case
        does not discard the results of the others.
        """
        for case in test_cases:
            error = None
            try:
                actual = self.agent.invoke(case.input)
            except Exception as exc:
                actual = ""
                error = f"{type(exc).__name__}: {exc}"

            entry = {
                "input": case.input,
                "expected": case.expected_output,
                "actual": actual,
                "passed": error is None and self._check_result(actual, case),
            }
            if error is not None:
                entry["error"] = error
            self.results.append(entry)
        return self._summary()

    def _check_result(self, actual: str, case: "TestCase") -> bool:
        """Return True when the case has no expected text or it occurs in ``actual``.

        Only ``expected_output`` is checked; no other field of the case is
        read here.
        """
        if not case.expected_output:
            return True
        return case.expected_output in actual

    def _summary(self) -> Dict:
        """Aggregate all recorded results; ``pass_rate`` is 0 when there are none."""
        total = len(self.results)
        passed = sum(1 for entry in self.results if entry["passed"])
        return {
            "total": total,
            "passed": passed,
            "failed": total - passed,
            "pass_rate": passed / total if total > 0 else 0,
            "details": self.results,
        }

# Full Observable Agent
The final implementation wraps the agent in a single ObservableAgent class that adds tracing and summary statistics. It records each step (name, input, output, duration), tracks the total duration and a success flag for every trace (a total_tokens field is declared on the trace but is not filled in by this implementation), and provides methods to export traces to JSON, print a summary, and run an interactive REPL.
# observable_agent.py
import time, json
from datetime import datetime
from dataclasses import dataclass, field
from typing import List, Dict, Any
from langchain_openai import ChatOpenAI
from langchain.tools import tool
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain.prompts import ChatPromptTemplate
@dataclass
class TraceStep:
    """One recorded step inside a trace."""

    step_id: int  # 1-based position of the step within its trace
    name: str
    input: str
    output: str
    duration_ms: float
    timestamp: str  # ISO-8601 string taken when the step was recorded
    metadata: Dict = field(default_factory=dict)
@dataclass
class Trace:
    """A complete trace of one agent request."""

    trace_id: str
    user_input: str
    # Quoted so the class can be defined without TraceStep already in scope.
    steps: "List[TraceStep]" = field(default_factory=list)
    final_output: str = ""
    total_duration_ms: float = 0
    total_tokens: int = 0
    success: bool = True
class ObservableAgent:
    """Tool-calling agent that records a ``Trace`` for every :meth:`invoke` call.

    Finished traces are kept in ``self.traces``. They can be exported to JSON,
    summarised on stdout, or explored through :meth:`interactive_mode`.
    """

    def __init__(self, model: str = "gpt-4", verbose: bool = True):
        """Create the chat model, the demo tools and the agent executor.

        Args:
            model: Model name passed to ``ChatOpenAI``.
            verbose: When true, trace progress is printed and the executor is
                created with ``verbose=True``.
        """
        self.llm = ChatOpenAI(model=model, temperature=0)
        self.verbose = verbose
        self.current_trace = None
        self.traces: "List[Trace]" = []
        self._setup_tools()
        self._setup_agent()

    def _setup_tools(self):
        """Define the demo tools and store them on ``self.tools``."""
        # The tool docstrings below are kept verbatim: LangChain's @tool uses
        # them as the tool descriptions that are sent to the model.

        @tool
        def get_weather(city: str) -> str:
            """获取天气"""
            weathers = {"北京": "晴25°C", "上海": "多云22°C"}
            return weathers.get(city, f"{city}:晴20°C")

        @tool
        def calculate(expr: str) -> str:
            """计算"""
            # SECURITY(review): `expr` is text chosen by the model from user
            # input and eval() runs it as arbitrary Python. Tolerable only in
            # a local demo; use a restricted arithmetic parser before exposing
            # this tool to untrusted users.
            try:
                return str(eval(expr))
            except Exception:
                # Was a bare `except:`, which also swallowed KeyboardInterrupt.
                return "计算错误"

        self.tools = [get_weather, calculate]

    def _setup_agent(self):
        """Build the prompt, the tool-calling agent and its executor."""
        prompt = ChatPromptTemplate.from_messages([
            ("system", "你是智能助手,可以使用工具。"),
            ("human", "{input}"),
            ("placeholder", "{agent_scratchpad}"),
        ])
        self.agent = create_tool_calling_agent(self.llm, self.tools, prompt)
        self.executor = AgentExecutor(agent=self.agent, tools=self.tools, verbose=self.verbose)

    def _start_trace(self, user_input: str) -> str:
        """Open a new trace for ``user_input`` and return its 8-character id."""
        import uuid

        trace_id = str(uuid.uuid4())[:8]
        self.current_trace = Trace(trace_id=trace_id, user_input=user_input)
        if self.verbose:
            print(f"\n🔍 [Trace: {trace_id}] 开始追踪")
        return trace_id

    def _add_step(self, name: str, input_data: str, output_data: str, duration_ms: float):
        """Append a step to the current trace, numbered from 1."""
        step = TraceStep(
            step_id=len(self.current_trace.steps) + 1,
            name=name,
            input=input_data,
            output=output_data,
            duration_ms=duration_ms,
            timestamp=datetime.now().isoformat(),
        )
        self.current_trace.steps.append(step)
        if self.verbose:
            print(f" 📍 Step {step.step_id}: {name} ({duration_ms:.0f}ms)")

    def _end_trace(self, final_output: str, success: bool = True):
        """Close the current trace and append it to ``self.traces``.

        ``total_duration_ms`` must already be set on the trace by the caller.
        """
        self.current_trace.final_output = final_output
        self.current_trace.success = success
        self.traces.append(self.current_trace)
        if self.verbose:
            print(f"\n✅ [Trace: {self.current_trace.trace_id}] 追踪完成")
            print(f" 总步骤: {len(self.current_trace.steps)}")
            print(f" 总耗时: {self.current_trace.total_duration_ms:.0f}ms")

    def invoke(self, user_input: str) -> str:
        """Run the agent on ``user_input`` and record a trace of the call.

        Exceptions from the executor are caught: the trace is stored with
        ``success=False`` and an error string is returned instead of raising.

        Returns:
            The agent's ``"output"`` text, or ``"错误: <exception>"`` on failure.
        """
        self._start_trace(user_input)
        started = time.perf_counter()
        try:
            step_started = time.perf_counter()
            result = self.executor.invoke({"input": user_input})
            step_ms = (time.perf_counter() - step_started) * 1000

            output = result.get("output", "")
            self._add_step(
                name="agent_execution",
                input_data=user_input,
                output_data=output,
                duration_ms=step_ms,
            )
            self.current_trace.total_duration_ms = (time.perf_counter() - started) * 1000
            self._end_trace(output, success=True)
            return output
        except Exception as e:
            self.current_trace.total_duration_ms = (time.perf_counter() - started) * 1000
            self._end_trace(str(e), success=False)
            return f"错误: {e}"

    def get_traces(self) -> "List[Trace]":
        """Return the list of finished traces (the live list, not a copy)."""
        return self.traces

    def export_traces(self, filepath: str):
        """Write all finished traces to ``filepath`` as UTF-8 JSON.

        Each step is exported with its name, duration and timestamp only.
        """
        data = [
            {
                "trace_id": trace.trace_id,
                "user_input": trace.user_input,
                "final_output": trace.final_output,
                "total_duration_ms": trace.total_duration_ms,
                "success": trace.success,
                "steps": [
                    {"name": s.name, "duration_ms": s.duration_ms, "timestamp": s.timestamp}
                    for s in trace.steps
                ],
            }
            for trace in self.traces
        ]
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"✅ 追踪已导出到 {filepath}")

    def print_summary(self):
        """Print request count, success rate and latency statistics to stdout."""
        total = len(self.traces)
        if total == 0:
            print("暂无追踪数据")
            return

        durations = [t.total_duration_ms for t in self.traces]
        success_count = sum(1 for t in self.traces if t.success)
        avg_duration = sum(durations) / total
        print("\n" + "=" * 50)
        print("📊 Agent 运行摘要")
        print("=" * 50)
        print(f"总请求数: {total}")
        print(f"成功率: {success_count/total*100:.1f}%")
        print(f"平均耗时: {avg_duration:.0f}ms")
        print(f"最大耗时: {max(durations):.0f}ms")
        print(f"最小耗时: {min(durations):.0f}ms")

    def interactive_mode(self):
        """Run a REPL on this agent until the user exits.

        Commands: ``exit``/``quit`` to leave, ``summary`` to print statistics,
        ``export`` to write traces to a timestamped JSON file. Anything else
        is sent to the agent. Ctrl-C or end-of-input also ends the session.
        """
        # Use this instance. The original built a second
        # ObservableAgent(verbose=True) here, which ignored the caller's model
        # and verbosity and created the model and executor twice.
        agent = self
        print("\n" + "=" * 60)
        print("🔍 可观测性 Agent")
        print("=" * 60)
        print("输入 'exit' 退出")
        print("输入 'summary' 查看统计")
        print("输入 'export' 导出追踪")
        print("-" * 60)
        while True:
            try:
                user_input = input("\n👤 你: ").strip()
                command = user_input.lower()
                if command in ["exit", "quit"]:
                    print("👋 再见!")
                    break
                if command == "summary":
                    agent.print_summary()
                    continue
                if command == "export":
                    filename = f"traces_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
                    agent.export_traces(filename)
                    continue
                if not user_input:
                    continue
                response = agent.invoke(user_input)
                print(f"\n🤖 Agent: {response}")
            except (KeyboardInterrupt, EOFError):
                # EOFError: stdin closed (Ctrl-D or piped input ran out).
                print("\n👋 再见!")
                break
# Script entry point: start the interactive REPL with default settings.
if __name__ == "__main__":
    ObservableAgent().interactive_mode()

# Best‑Practice Summary
Always enable tracing, even in production (sampled if needed).
Record key metrics: token usage, latency, success rate.
Establish a baseline evaluation suite and run it after each change.
Set alert thresholds for success‑rate drops or latency spikes.
Periodically analyze logs to discover failure patterns.
Next Episode Preview
AI Agent 智能体从入门到实战(九):生产级 Agent 架构设计 – covering high‑availability design, async queues, caching strategies, and cost optimization.
Signed-in readers can open the original source through BestHub's protected redirect.
This article has been distilled and summarized from source material, then republished for learning and reference. If you believe it infringes your rights, please contact us and we will review it promptly.
Coder Trainee
Experienced in Java and Python, we share and learn together. For submissions or collaborations, DM us.
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.
