Debugging AI Agents
Identify, diagnose, and resolve agent issues using the debugging tools and techniques built into AgenticAnts.
Overview
Debugging AI agents requires specialized tools and techniques to identify and resolve issues. This guide covers:
- Common Issues - Typical problems and their solutions
- Debugging Tools - Available debugging features
- Trace Analysis - Analyzing agent traces for issues
- Performance Debugging - Identifying performance bottlenecks
- Error Analysis - Understanding and resolving errors
Common Issues
1. Agent Not Responding
import os
from agenticants import AgenticAnts
ants = AgenticAnts(api_key=os.getenv('AGENTICANTS_API_KEY'))
class AgentDebugger:
def __init__(self):
self.ants = ants
def debug_agent_not_responding(self, agent_name: str):
"""Debug agent not responding issues"""
print(f"🔍 Debugging Agent: {agent_name}")
print("=" * 50)
# Check agent status
agent_status = self.ants.agents.get_status(agent_name)
print(f"Agent Status: {agent_status['status']}")
if agent_status['status'] != 'active':
print(f"❌ Agent is not active. Status: {agent_status['status']}")
return self._fix_agent_status(agent_name)
# Check recent traces
recent_traces = self.ants.traces.query({
"agent": agent_name,
"status": "error",
"period": "last_1h"
})
if recent_traces:
print(f"❌ Found {len(recent_traces)} errors in the last hour")
for trace in recent_traces[:5]: # Show last 5 errors
print(f" Error: {trace.error}")
print(f" Time: {trace.timestamp}")
print(f" Input: {trace.input[:100]}...")
else:
print("✅ No recent errors found")
# Check performance metrics
performance = self.ants.metrics.get_agent_metrics(agent_name, period="last_1h")
print(f"Average Response Time: {performance['avg_response_time']}ms")
print(f"Success Rate: {performance['success_rate']}%")
if performance['avg_response_time'] > 30000: # 30 seconds
print("⚠️ High response time detected")
return self._debug_performance_issues(agent_name)
return {"status": "healthy", "issues": []}
def _fix_agent_status(self, agent_name: str):
"""Fix agent status issues"""
print(f"🔧 Fixing agent status for {agent_name}")
# Restart agent
restart_result = self.ants.agents.restart(agent_name)
if restart_result['success']:
print("✅ Agent restarted successfully")
else:
print(f"❌ Failed to restart agent: {restart_result['error']}")
return restart_result
def _debug_performance_issues(self, agent_name: str):
"""Debug performance issues"""
print(f"🐌 Debugging performance issues for {agent_name}")
# Get performance bottlenecks
bottlenecks = self.ants.sre.find_bottlenecks(agent_name)
print("Performance Bottlenecks:")
for bottleneck in bottlenecks:
print(f" {bottleneck['operation']}: {bottleneck['avg_time']}ms")
print(f" Occurrences: {bottleneck['count']}")
print(f" Optimization: {bottleneck['optimization']}")
        return bottlenecks
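A minimal usage sketch (the agent name "support-agent" is a placeholder for one of your own agents):
agent_debugger = AgentDebugger()
report = agent_debugger.debug_agent_not_responding("support-agent")
print(report)  # {"status": "healthy", "issues": []} when no problems are found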
2. High Error Rates

import { AgenticAnts } from '@agenticants/sdk'
const ants = new AgenticAnts({ apiKey: process.env.AGENTICANTS_API_KEY })
class ErrorRateDebugger {
  private ants: AgenticAnts = ants
async debugHighErrorRates(agentName: string) {
console.log(`🔍 Debugging High Error Rates for ${agentName}`)
console.log('='.repeat(50))
// Get error metrics
const errorMetrics = await this.ants.metrics.getErrorMetrics({
agent: agentName,
period: 'last_24h'
})
console.log(`Error Rate: ${errorMetrics.errorRate}%`)
console.log(`Total Errors: ${errorMetrics.totalErrors}`)
console.log(`Success Rate: ${errorMetrics.successRate}%`)
if (errorMetrics.errorRate > 5) {
console.log('❌ High error rate detected')
// Analyze error types
const errorTypes = await this.ants.metrics.getErrorTypes({
agent: agentName,
period: 'last_24h'
})
console.log('\nError Types:')
errorTypes.forEach(errorType => {
console.log(` ${errorType.type}: ${errorType.count} occurrences`)
console.log(` Percentage: ${errorType.percentage}%`)
})
// Get recent errors
const recentErrors = await this.ants.traces.query({
agent: agentName,
status: 'error',
period: 'last_1h'
})
console.log('\nRecent Errors:')
recentErrors.slice(0, 5).forEach(error => {
console.log(` Error: ${error.error}`)
console.log(` Time: ${error.timestamp}`)
console.log(` Input: ${error.input.substring(0, 100)}...`)
console.log(' ---')
})
return this.analyzeErrorPatterns(recentErrors)
}
console.log('✅ Error rate is within acceptable range')
return { status: 'healthy', errorRate: errorMetrics.errorRate }
}
private async analyzeErrorPatterns(errors: any[]) {
console.log('🔍 Analyzing Error Patterns')
console.log('='.repeat(50))
// Group errors by type
const errorGroups = this.groupErrorsByType(errors)
console.log('Error Patterns:')
Object.entries(errorGroups).forEach(([type, errorList]) => {
console.log(`\n${type}: ${errorList.length} occurrences`)
// Find common patterns
const patterns = this.findCommonPatterns(errorList)
patterns.forEach(pattern => {
console.log(` Pattern: ${pattern.description}`)
console.log(` Frequency: ${pattern.frequency}`)
console.log(` Solution: ${pattern.solution}`)
})
})
return errorGroups
}
private groupErrorsByType(errors: any[]): Record<string, any[]> {
const groups: Record<string, any[]> = {}
errors.forEach(error => {
const type = error.errorType || 'unknown'
if (!groups[type]) {
groups[type] = []
}
groups[type].push(error)
})
return groups
}
private findCommonPatterns(errors: any[]): any[] {
// Implement pattern recognition logic
return [
{
description: 'Timeout errors',
frequency: 'high',
solution: 'Increase timeout or optimize performance'
}
]
}
}
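A usage sketch, assuming the shared `ants` client from above and a placeholder agent name:
const rateDebugger = new ErrorRateDebugger()
const result = await rateDebugger.debugHighErrorRates('support-agent')
console.log(result)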
3. Performance Issues

class PerformanceDebugger:
def __init__(self):
self.ants = ants
def debug_performance_issues(self, agent_name: str):
"""Debug performance issues"""
print(f"🐌 Debugging Performance Issues for {agent_name}")
print("=" * 50)
# Get performance metrics
performance = self.ants.metrics.get_agent_metrics(agent_name, period="last_24h")
print(f"Average Response Time: {performance['avg_response_time']}ms")
print(f"P95 Response Time: {performance['p95_response_time']}ms")
print(f"P99 Response Time: {performance['p99_response_time']}ms")
print(f"Throughput: {performance['throughput']} req/s")
# Check for performance issues
issues = []
if performance['avg_response_time'] > 5000:
issues.append("High average response time")
if performance['p95_response_time'] > 10000:
issues.append("High P95 response time")
if performance['throughput'] < 10:
issues.append("Low throughput")
if issues:
print(f"❌ Performance issues detected: {', '.join(issues)}")
return self._analyze_performance_bottlenecks(agent_name)
else:
print("✅ Performance is within acceptable range")
return {"status": "healthy", "issues": []}
def _analyze_performance_bottlenecks(self, agent_name: str):
"""Analyze performance bottlenecks"""
print("🔍 Analyzing Performance Bottlenecks")
print("=" * 50)
# Get bottleneck analysis
bottlenecks = self.ants.sre.find_bottlenecks(agent_name)
print("Bottlenecks:")
for bottleneck in bottlenecks:
print(f"\n{bottleneck['operation']}:")
print(f" Average time: {bottleneck['avg_time']}ms")
print(f" Occurrences: {bottleneck['count']}")
print(f" Total time: {bottleneck['total_time']}ms")
print(f" Optimization: {bottleneck['optimization']}")
# Get slow traces
slow_traces = self.ants.traces.query({
"agent": agent_name,
"duration": ">5000", # > 5 seconds
"period": "last_24h"
})
if slow_traces:
print(f"\nSlow Traces ({len(slow_traces)} found):")
for trace in slow_traces[:5]: # Show first 5
print(f" Duration: {trace.duration}ms")
print(f" Time: {trace.timestamp}")
print(f" Input: {trace.input[:100]}...")
print(" ---")
        return bottlenecks
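A quick sketch of running the debugger on its own (placeholder agent name):
perf_debugger = PerformanceDebugger()
perf_debugger.debug_performance_issues("support-agent")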
Debugging Tools
1. Trace Analysis
class TraceAnalyzer:
def __init__(self):
self.ants = ants
def analyze_trace(self, trace_id: str):
"""Analyze a specific trace"""
print(f"🔍 Analyzing Trace: {trace_id}")
print("=" * 50)
# Get trace details
trace = self.ants.traces.get(trace_id)
print(f"Trace Name: {trace.name}")
print(f"Status: {trace.status}")
print(f"Duration: {trace.duration}ms")
print(f"Start Time: {trace.start_time}")
print(f"End Time: {trace.end_time}")
# Analyze spans
if trace.spans:
print(f"\nSpans ({len(trace.spans)}):")
for span in trace.spans:
print(f" {span.name}: {span.duration}ms")
if span.status == "error":
print(f" ❌ Error: {span.error}")
else:
print(f" ✅ Status: {span.status}")
# Analyze metadata
if trace.metadata:
print(f"\nMetadata:")
for key, value in trace.metadata.items():
print(f" {key}: {value}")
# Check for issues
issues = self._identify_trace_issues(trace)
if issues:
print(f"\nIssues Found:")
for issue in issues:
print(f" ❌ {issue}")
else:
print(f"\n✅ No issues found")
return trace
def _identify_trace_issues(self, trace):
"""Identify issues in trace"""
issues = []
# Check duration
if trace.duration > 30000: # 30 seconds
issues.append("Long duration")
# Check status
if trace.status == "error":
issues.append("Trace failed")
# Check spans
for span in trace.spans:
if span.status == "error":
issues.append(f"Span error: {span.name}")
if span.duration > 10000: # 10 seconds
issues.append(f"Long span duration: {span.name}")
        return issues
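A short usage sketch; the trace ID is a placeholder and should come from your own trace list:
analyzer = TraceAnalyzer()
trace = analyzer.analyze_trace("trace_abc123")  # placeholder trace ID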
2. Error Analysis

class ErrorAnalyzer {
  private ants: AgenticAnts = ants
async analyzeErrors(agentName: string, period: string = 'last_24h') {
console.log(`🔍 Analyzing Errors for ${agentName}`)
console.log('='.repeat(50))
// Get error summary
const errorSummary = await this.ants.metrics.getErrorSummary({
agent: agentName,
period: period
})
console.log(`Total Errors: ${errorSummary.totalErrors}`)
console.log(`Error Rate: ${errorSummary.errorRate}%`)
console.log(`Success Rate: ${errorSummary.successRate}%`)
// Get error breakdown
const errorBreakdown = await this.ants.metrics.getErrorBreakdown({
agent: agentName,
period: period
})
console.log('\nError Breakdown:')
errorBreakdown.forEach(error => {
console.log(` ${error.type}: ${error.count} (${error.percentage}%)`)
})
// Get recent errors
const recentErrors = await this.ants.traces.query({
agent: agentName,
status: 'error',
period: 'last_1h'
})
console.log('\nRecent Errors:')
recentErrors.slice(0, 10).forEach(error => {
console.log(` ${error.timestamp}: ${error.error}`)
console.log(` Input: ${error.input.substring(0, 100)}...`)
console.log(' ---')
})
return {
summary: errorSummary,
breakdown: errorBreakdown,
recent: recentErrors
}
}
async analyzeErrorPatterns(errors: any[]) {
console.log('🔍 Analyzing Error Patterns')
console.log('='.repeat(50))
// Group errors by type
const errorGroups = this.groupErrorsByType(errors)
console.log('Error Groups:')
Object.entries(errorGroups).forEach(([type, errorList]) => {
console.log(`\n${type}: ${errorList.length} occurrences`)
// Analyze patterns
      const patterns = this.findErrorPatterns(errorList)
patterns.forEach(pattern => {
console.log(` Pattern: ${pattern.description}`)
console.log(` Frequency: ${pattern.frequency}`)
console.log(` Solution: ${pattern.solution}`)
})
})
return errorGroups
}
private groupErrorsByType(errors: any[]): Record<string, any[]> {
const groups: Record<string, any[]> = {}
errors.forEach(error => {
const type = error.errorType || 'unknown'
if (!groups[type]) {
groups[type] = []
}
groups[type].push(error)
})
return groups
}
  private findErrorPatterns(errors: any[]): any[] {
// Implement pattern analysis logic
return [
{
description: 'Timeout errors',
frequency: 'high',
solution: 'Increase timeout or optimize performance'
}
]
}
}
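A brief usage sketch with a placeholder agent name:
const errorAnalyzer = new ErrorAnalyzer()
const report = await errorAnalyzer.analyzeErrors('support-agent', 'last_24h')
await errorAnalyzer.analyzeErrorPatterns(report.recent)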
3. Performance Analysis

class PerformanceAnalyzer:
def __init__(self):
self.ants = ants
def analyze_performance(self, agent_name: str, period: str = "last_24h"):
"""Analyze performance metrics"""
print(f"📊 Performance Analysis for {agent_name}")
print("=" * 50)
# Get performance metrics
performance = self.ants.metrics.get_agent_metrics(agent_name, period=period)
print(f"Request Rate: {performance['request_rate']} req/s")
print(f"Average Response Time: {performance['avg_response_time']}ms")
print(f"P95 Response Time: {performance['p95_response_time']}ms")
print(f"P99 Response Time: {performance['p99_response_time']}ms")
print(f"Error Rate: {performance['error_rate']}%")
print(f"Success Rate: {performance['success_rate']}%")
# Analyze performance trends
trends = self.ants.metrics.get_performance_trends(agent_name, period=period)
print(f"\nPerformance Trends:")
for trend in trends[-7:]: # Last 7 data points
print(f" {trend['timestamp']}: {trend['avg_response_time']}ms avg, {trend['error_rate']}% errors")
# Identify performance issues
issues = self._identify_performance_issues(performance)
if issues:
print(f"\nPerformance Issues:")
for issue in issues:
print(f" ⚠️ {issue}")
return performance
def _identify_performance_issues(self, performance):
"""Identify performance issues"""
issues = []
if performance['avg_response_time'] > 5000:
issues.append("High average response time")
if performance['p95_response_time'] > 10000:
issues.append("High P95 response time")
if performance['error_rate'] > 5:
issues.append("High error rate")
if performance['request_rate'] < 1:
issues.append("Low request rate")
        return issues
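For example (placeholder agent name):
analyzer = PerformanceAnalyzer()
metrics = analyzer.analyze_performance("support-agent", period="last_24h")
print(f"Error rate: {metrics['error_rate']}%")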
Debugging Workflows
1. Systematic Debugging
class SystematicDebugger:
def __init__(self):
self.ants = ants
def debug_agent_systematically(self, agent_name: str):
"""Debug agent systematically"""
print(f"🔍 Systematic Debugging for {agent_name}")
print("=" * 50)
# Step 1: Check agent status
print("Step 1: Checking Agent Status")
agent_status = self.ants.agents.get_status(agent_name)
print(f"Status: {agent_status['status']}")
if agent_status['status'] != 'active':
print("❌ Agent is not active. Fixing...")
self.ants.agents.restart(agent_name)
return
# Step 2: Check recent errors
print("\nStep 2: Checking Recent Errors")
recent_errors = self.ants.traces.query({
"agent": agent_name,
"status": "error",
"period": "last_1h"
})
if recent_errors:
print(f"❌ Found {len(recent_errors)} errors in the last hour")
self._analyze_errors(recent_errors)
else:
print("✅ No recent errors found")
# Step 3: Check performance
print("\nStep 3: Checking Performance")
performance = self.ants.metrics.get_agent_metrics(agent_name, period="last_1h")
if performance['avg_response_time'] > 5000:
print("❌ High response time detected")
self._analyze_performance_issues(agent_name)
else:
print("✅ Performance is acceptable")
# Step 4: Check resource usage
print("\nStep 4: Checking Resource Usage")
resources = self.ants.sre.get_resource_usage(agent_name)
if resources['cpu_usage'] > 80:
print("❌ High CPU usage detected")
if resources['memory_usage'] > 80:
print("❌ High memory usage detected")
print("✅ Systematic debugging complete")
def _analyze_errors(self, errors):
"""Analyze errors"""
print("Error Analysis:")
for error in errors[:5]: # Show first 5 errors
print(f" Error: {error.error}")
print(f" Time: {error.timestamp}")
print(f" Input: {error.input[:100]}...")
print(" ---")
def _analyze_performance_issues(self, agent_name: str):
"""Analyze performance issues"""
print("Performance Issue Analysis:")
bottlenecks = self.ants.sre.find_bottlenecks(agent_name)
for bottleneck in bottlenecks:
print(f" {bottleneck['operation']}: {bottleneck['avg_time']}ms")
print(f" Optimization: {bottleneck['optimization']}")2. Automated Debugging
2. Automated Debugging

class AutomatedDebugger {
  private ants: AgenticAnts = ants
async setupAutomatedDebugging() {
console.log('🤖 Setting Up Automated Debugging')
console.log('='.repeat(50))
// Configure automated debugging rules
await this.ants.debugging.configureRules({
highErrorRate: {
threshold: 5, // 5% error rate
action: 'analyze_errors',
notification: true
},
highLatency: {
threshold: 5000, // 5 seconds
action: 'analyze_performance',
notification: true
},
lowThroughput: {
threshold: 1, // 1 req/s
action: 'analyze_bottlenecks',
notification: false
}
})
console.log('✅ Automated debugging configured')
}
async runAutomatedDebugging(agentName: string) {
console.log(`🤖 Running Automated Debugging for ${agentName}`)
console.log('='.repeat(50))
// Check all debugging rules
const results = await this.ants.debugging.checkRules(agentName)
    for (const result of results) {
console.log(`\nRule: ${result.rule}`)
console.log(`Status: ${result.status}`)
if (result.triggered) {
console.log(`❌ Rule triggered: ${result.message}`)
console.log(`Action: ${result.action}`)
// Execute action
        await this.executeDebuggingAction(result.action, agentName)
} else {
console.log(`✅ Rule not triggered`)
}
    }
return results
}
private async executeDebuggingAction(action: string, agentName: string) {
console.log(`Executing action: ${action}`)
switch (action) {
case 'analyze_errors':
await this.analyzeErrors(agentName)
break
case 'analyze_performance':
await this.analyzePerformance(agentName)
break
case 'analyze_bottlenecks':
await this.analyzeBottlenecks(agentName)
break
}
}
private async analyzeErrors(agentName: string) {
console.log('Analyzing errors...')
// Implement error analysis
}
private async analyzePerformance(agentName: string) {
console.log('Analyzing performance...')
// Implement performance analysis
}
private async analyzeBottlenecks(agentName: string) {
console.log('Analyzing bottlenecks...')
// Implement bottleneck analysis
}
}
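A usage sketch, assuming the rules only need to be configured once:
const autoDebugger = new AutomatedDebugger()
await autoDebugger.setupAutomatedDebugging()  // one-time rule configuration
const results = await autoDebugger.runAutomatedDebugging('support-agent')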
Best Practices
1. Debugging Checklist
class DebuggingChecklist:
def __init__(self):
self.ants = ants
def run_debugging_checklist(self, agent_name: str):
"""Run comprehensive debugging checklist"""
print(f"✅ Debugging Checklist for {agent_name}")
print("=" * 50)
checklist = [
("Agent Status", self.check_agent_status),
("Recent Errors", self.check_recent_errors),
("Performance Metrics", self.check_performance_metrics),
("Resource Usage", self.check_resource_usage),
("Configuration", self.check_configuration),
("Dependencies", self.check_dependencies)
]
results = {}
for item, check_function in checklist:
try:
result = check_function(agent_name)
results[item] = result
status = "✅ PASS" if result['passed'] else "❌ FAIL"
print(f"{item}: {status}")
if not result['passed']:
print(f" Issues: {result['issues']}")
except Exception as e:
results[item] = {"passed": False, "issues": [str(e)]}
print(f"{item}: ❌ ERROR - {e}")
return results
def check_agent_status(self, agent_name: str):
"""Check agent status"""
status = self.ants.agents.get_status(agent_name)
return {"passed": status['status'] == 'active', "issues": []}
def check_recent_errors(self, agent_name: str):
"""Check recent errors"""
errors = self.ants.traces.query({
"agent": agent_name,
"status": "error",
"period": "last_1h"
})
return {"passed": len(errors) == 0, "issues": [f"{len(errors)} errors found"]}
def check_performance_metrics(self, agent_name: str):
"""Check performance metrics"""
performance = self.ants.metrics.get_agent_metrics(agent_name, period="last_1h")
issues = []
if performance['avg_response_time'] > 5000:
issues.append("High response time")
if performance['error_rate'] > 5:
issues.append("High error rate")
return {"passed": len(issues) == 0, "issues": issues}
def check_resource_usage(self, agent_name: str):
"""Check resource usage"""
resources = self.ants.sre.get_resource_usage(agent_name)
issues = []
if resources['cpu_usage'] > 80:
issues.append("High CPU usage")
if resources['memory_usage'] > 80:
issues.append("High memory usage")
return {"passed": len(issues) == 0, "issues": issues}
def check_configuration(self, agent_name: str):
"""Check configuration"""
config = self.ants.agents.get_configuration(agent_name)
return {"passed": config['valid'], "issues": []}
def check_dependencies(self, agent_name: str):
"""Check dependencies"""
dependencies = self.ants.agents.get_dependencies(agent_name)
issues = []
for dep in dependencies:
if not dep['healthy']:
issues.append(f"Unhealthy dependency: {dep['name']}")
return {"passed": len(issues) == 0, "issues": issues}2. Debugging Strategy
2. Debugging Strategy

class DebuggingStrategy {
  private ants: AgenticAnts = ants
async implementDebuggingStrategy(agentName: string) {
console.log(`🎯 Implementing Debugging Strategy for ${agentName}`)
console.log('='.repeat(50))
// Phase 1: Initial Assessment
console.log('\nPhase 1: Initial Assessment')
const assessment = await this.performInitialAssessment(agentName)
// Phase 2: Issue Identification
console.log('\nPhase 2: Issue Identification')
const issues = await this.identifyIssues(agentName)
// Phase 3: Root Cause Analysis
console.log('\nPhase 3: Root Cause Analysis')
const rootCauses = await this.analyzeRootCauses(issues)
// Phase 4: Solution Implementation
console.log('\nPhase 4: Solution Implementation')
const solutions = await this.implementSolutions(rootCauses)
// Phase 5: Verification
console.log('\nPhase 5: Verification')
const verification = await this.verifySolutions(agentName)
return {
assessment,
issues,
rootCauses,
solutions,
verification
}
}
private async performInitialAssessment(agentName: string) {
console.log('Performing initial assessment...')
// Implement initial assessment
return { status: 'completed' }
}
private async identifyIssues(agentName: string) {
console.log('Identifying issues...')
// Implement issue identification
return { issues: [] }
}
private async analyzeRootCauses(issues: any[]) {
console.log('Analyzing root causes...')
// Implement root cause analysis
return { rootCauses: [] }
}
private async implementSolutions(rootCauses: any[]) {
console.log('Implementing solutions...')
// Implement solution implementation
return { solutions: [] }
}
private async verifySolutions(agentName: string) {
console.log('Verifying solutions...')
// Implement solution verification
return { verified: true }
}
}
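A sketch of running the full strategy (placeholder agent name):
const strategy = new DebuggingStrategy()
const { issues, verification } = await strategy.implementDebuggingStrategy('support-agent')
console.log(issues, verification)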
Troubleshooting
Common Debugging Issues
Issue: Agent not responding
def debug_agent_not_responding():
# Check agent status
status = ants.agents.get_status("agent_name")
print(f"Agent status: {status}")
# Check recent traces
traces = ants.traces.query({
"agent": "agent_name",
"period": "last_1h"
})
print(f"Recent traces: {len(traces)}")
# Check performance
performance = ants.metrics.get_agent_metrics("agent_name")
print(f"Performance: {performance}")Issue: High error rates
async function debugHighErrorRates() {
const errors = await ants.traces.query({
agent: 'agent_name',
status: 'error',
period: 'last_24h'
})
console.log(`Total errors: ${errors.length}`)
// Group by error type
const errorGroups = groupErrorsByType(errors)
console.log('Error groups:', errorGroups)
}

Next Steps
- Production - Learn production best practices
- Cost Optimization - Reduce costs with our optimization guide
- SRE - Deep dive into SRE capabilities
Example Projects
- Debugging Customer Support Bot - GitHub Repository
- Performance Debugging - GitHub Repository
- Error Analysis System - GitHub Repository
Congratulations! 🎉 You now have comprehensive debugging tools and techniques to effectively troubleshoot and resolve AI agent issues.