Docs/Guides/Debugging

Debugging AI Agents

Debug and troubleshoot agent issues effectively with comprehensive debugging tools and techniques.

Overview

Debugging AI agents requires specialized tools and techniques to identify and resolve issues. This guide covers:

  • Common Issues - Typical problems and their solutions
  • Debugging Tools - Available debugging features
  • Trace Analysis - Analyzing agent traces for issues
  • Performance Debugging - Identifying performance bottlenecks
  • Error Analysis - Understanding and resolving errors

Common Issues

1. Agent Not Responding

python
import os

from agenticants import AgenticAnts

# Shared client; the API key is read from the environment.
ants = AgenticAnts(api_key=os.getenv('AGENTICANTS_API_KEY'))


class AgentDebugger:
    """Walk through the checks for an agent that is not responding."""

    def __init__(self):
        self.ants = ants

    def debug_agent_not_responding(self, agent_name: str):
        """Debug agent not responding issues.

        Checks, in order: the agent's status, error traces from the last
        hour, and the last hour's metrics. Progress is printed as it goes.

        Returns:
            The restart result when the agent was not active, the bottleneck
            list when the average response time exceeds 30 seconds, and
            otherwise ``{"status": "healthy", "issues": []}``.
        """
        print(f"🔍 Debugging Agent: {agent_name}")
        print("=" * 50)

        # Check agent status
        agent_status = self.ants.agents.get_status(agent_name)
        print(f"Agent Status: {agent_status['status']}")

        if agent_status['status'] != 'active':
            print(f"❌ Agent is not active. Status: {agent_status['status']}")
            return self._fix_agent_status(agent_name)

        # Check recent traces
        recent_traces = self.ants.traces.query({
            "agent": agent_name,
            "status": "error",
            "period": "last_1h"
        })

        if recent_traces:
            print(f"❌ Found {len(recent_traces)} errors in the last hour")
            for trace in recent_traces[:5]:  # Show at most 5 errors
                print(f" Error: {trace.error}")
                print(f" Time: {trace.timestamp}")
                print(f" Input: {trace.input[:100]}...")
        else:
            print("✅ No recent errors found")

        # Check performance metrics
        performance = self.ants.metrics.get_agent_metrics(agent_name, period="last_1h")
        print(f"Average Response Time: {performance['avg_response_time']}ms")
        print(f"Success Rate: {performance['success_rate']}%")

        if performance['avg_response_time'] > 30000:  # 30 seconds
            print("⚠️ High response time detected")
            return self._debug_performance_issues(agent_name)

        return {"status": "healthy", "issues": []}

    def _fix_agent_status(self, agent_name: str):
        """Restart the agent and return the restart result unchanged."""
        print(f"🔧 Fixing agent status for {agent_name}")

        # Restart agent
        restart_result = self.ants.agents.restart(agent_name)

        if restart_result['success']:
            print("✅ Agent restarted successfully")
        else:
            print(f"❌ Failed to restart agent: {restart_result['error']}")

        return restart_result

    def _debug_performance_issues(self, agent_name: str):
        """Print the bottlenecks found for the agent and return them."""
        print(f"🐌 Debugging performance issues for {agent_name}")

        # Get performance bottlenecks
        bottlenecks = self.ants.sre.find_bottlenecks(agent_name)

        print("Performance Bottlenecks:")
        for bottleneck in bottlenecks:
            print(f" {bottleneck['operation']}: {bottleneck['avg_time']}ms")
            print(f" Occurrences: {bottleneck['count']}")
            print(f" Optimization: {bottleneck['optimization']}")

        return bottlenecks

2. High Error Rates

typescript
// NOTE(review): `AgenticAnts` has to be imported from the SDK; the import line
// is not part of this snippet.
const ants = new AgenticAnts({ apiKey: process.env.AGENTICANTS_API_KEY })

/** Investigates an agent whose error rate is above the 5% threshold. */
class ErrorRateDebugger {
  private ants: AgenticAnts

  /**
   * @param client Client used for every query. Defaults to the module-level
   *   `ants` instance so that `new ErrorRateDebugger()` keeps working.
   */
  constructor(client: AgenticAnts = ants) {
    // The field used to be declared but never assigned, so every
    // `this.ants.*` call failed on `undefined`.
    this.ants = client
  }

  /**
   * Prints the last 24 hours of error metrics for the agent. When the error
   * rate is above 5% it also prints the error types and up to five errors
   * from the last hour.
   *
   * @returns The grouped errors when the rate is high, otherwise
   *   `{ status: 'healthy', errorRate }`.
   */
  async debugHighErrorRates(agentName: string) {
    console.log(`🔍 Debugging High Error Rates for ${agentName}`)
    console.log('='.repeat(50))

    // Get error metrics
    const errorMetrics = await this.ants.metrics.getErrorMetrics({
      agent: agentName,
      period: 'last_24h'
    })

    console.log(`Error Rate: ${errorMetrics.errorRate}%`)
    console.log(`Total Errors: ${errorMetrics.totalErrors}`)
    console.log(`Success Rate: ${errorMetrics.successRate}%`)

    if (errorMetrics.errorRate > 5) {
      console.log('❌ High error rate detected')

      // Analyze error types
      const errorTypes = await this.ants.metrics.getErrorTypes({
        agent: agentName,
        period: 'last_24h'
      })

      console.log('\nError Types:')
      errorTypes.forEach(errorType => {
        console.log(` ${errorType.type}: ${errorType.count} occurrences`)
        console.log(` Percentage: ${errorType.percentage}%`)
      })

      // Get recent errors
      const recentErrors = await this.ants.traces.query({
        agent: agentName,
        status: 'error',
        period: 'last_1h'
      })

      console.log('\nRecent Errors:')
      recentErrors.slice(0, 5).forEach(error => {
        console.log(` Error: ${error.error}`)
        console.log(` Time: ${error.timestamp}`)
        console.log(` Input: ${error.input.substring(0, 100)}...`)
        console.log(' ---')
      })

      return this.analyzeErrorPatterns(recentErrors)
    }

    console.log('✅ Error rate is within acceptable range')
    return { status: 'healthy', errorRate: errorMetrics.errorRate }
  }

  /** Groups the errors by type, prints the patterns per group, and returns the groups. */
  private async analyzeErrorPatterns(errors: any[]) {
    console.log('🔍 Analyzing Error Patterns')
    console.log('='.repeat(50))

    // Group errors by type
    const errorGroups = this.groupErrorsByType(errors)

    console.log('Error Patterns:')
    Object.entries(errorGroups).forEach(([type, errorList]) => {
      console.log(`\n${type}: ${errorList.length} occurrences`)

      // Find common patterns
      const patterns = this.findCommonPatterns(errorList)
      patterns.forEach(pattern => {
        console.log(` Pattern: ${pattern.description}`)
        console.log(` Frequency: ${pattern.frequency}`)
        console.log(` Solution: ${pattern.solution}`)
      })
    })

    return errorGroups
  }

  /** Buckets errors by `errorType`; errors without one go under 'unknown'. */
  private groupErrorsByType(errors: any[]): Record<string, any[]> {
    const groups: Record<string, any[]> = {}

    errors.forEach(error => {
      const type = error.errorType || 'unknown'
      if (!groups[type]) {
        groups[type] = []
      }
      groups[type].push(error)
    })

    return groups
  }

  /** Placeholder: returns one fixed pattern regardless of the input. */
  private findCommonPatterns(errors: any[]): any[] {
    // Implement pattern recognition logic
    return [
      {
        description: 'Timeout errors',
        frequency: 'high',
        solution: 'Increase timeout or optimize performance'
      }
    ]
  }
}

3. Performance Issues

python
class PerformanceDebugger:
    """Report on an agent's latency and throughput over the last 24 hours."""

    def __init__(self):
        self.ants = ants

    def debug_performance_issues(self, agent_name: str):
        """Print the agent's metrics and flag any that cross a threshold.

        Returns the bottleneck list when at least one threshold is crossed,
        otherwise ``{"status": "healthy", "issues": []}``.
        """
        print(f"🐌 Debugging Performance Issues for {agent_name}")
        print("=" * 50)

        stats = self.ants.metrics.get_agent_metrics(agent_name, period="last_24h")

        # (label, metric key, unit suffix) for each summary line
        summary_rows = (
            ("Average Response Time", "avg_response_time", "ms"),
            ("P95 Response Time", "p95_response_time", "ms"),
            ("P99 Response Time", "p99_response_time", "ms"),
            ("Throughput", "throughput", " req/s"),
        )
        for label, key, unit in summary_rows:
            print(f"{label}: {stats[key]}{unit}")

        # (threshold crossed?, message) pairs, evaluated in reporting order
        threshold_checks = (
            (stats['avg_response_time'] > 5000, "High average response time"),
            (stats['p95_response_time'] > 10000, "High P95 response time"),
            (stats['throughput'] < 10, "Low throughput"),
        )
        findings = [message for crossed, message in threshold_checks if crossed]

        if not findings:
            print("✅ Performance is within acceptable range")
            return {"status": "healthy", "issues": []}

        print(f"❌ Performance issues detected: {', '.join(findings)}")
        return self._analyze_performance_bottlenecks(agent_name)

    def _analyze_performance_bottlenecks(self, agent_name: str):
        """Print bottlenecks and up to five slow traces; return the bottlenecks."""
        print("🔍 Analyzing Performance Bottlenecks")
        print("=" * 50)

        found = self.ants.sre.find_bottlenecks(agent_name)

        print("Bottlenecks:")
        for entry in found:
            print(f"\n{entry['operation']}:")
            print(f" Average time: {entry['avg_time']}ms")
            print(f" Occurrences: {entry['count']}")
            print(f" Total time: {entry['total_time']}ms")
            print(f" Optimization: {entry['optimization']}")

        # Traces slower than 5 seconds in the last 24 hours
        slow_query = {
            "agent": agent_name,
            "duration": ">5000",
            "period": "last_24h"
        }
        slow = self.ants.traces.query(slow_query)

        if slow:
            print(f"\nSlow Traces ({len(slow)} found):")
            for item in slow[:5]:
                print(f" Duration: {item.duration}ms")
                print(f" Time: {item.timestamp}")
                print(f" Input: {item.input[:100]}...")
                print(" ---")

        return found

Debugging Tools

1. Trace Analysis

python
class TraceAnalyzer:
    """Print a readable breakdown of a single trace and flag problems in it."""

    def __init__(self):
        self.ants = ants

    def analyze_trace(self, trace_id: str):
        """Analyze a specific trace.

        Fetches the trace, prints its summary, spans, metadata, and any
        issues found, then returns the trace object.
        """
        print(f"🔍 Analyzing Trace: {trace_id}")
        print("=" * 50)

        # Get trace details
        trace = self.ants.traces.get(trace_id)

        print(f"Trace Name: {trace.name}")
        print(f"Status: {trace.status}")
        print(f"Duration: {trace.duration}ms")
        print(f"Start Time: {trace.start_time}")
        print(f"End Time: {trace.end_time}")

        # Analyze spans
        if trace.spans:
            print(f"\nSpans ({len(trace.spans)}):")
            for span in trace.spans:
                print(f" {span.name}: {span.duration}ms")
                if span.status == "error":
                    print(f" ❌ Error: {span.error}")
                else:
                    print(f" ✅ Status: {span.status}")

        # Analyze metadata
        if trace.metadata:
            print("\nMetadata:")
            for key, value in trace.metadata.items():
                print(f" {key}: {value}")

        # Check for issues
        issues = self._identify_trace_issues(trace)
        if issues:
            print("\nIssues Found:")
            for issue in issues:
                print(f" ❌ {issue}")
        else:
            print("\n✅ No issues found")

        return trace

    def _identify_trace_issues(self, trace):
        """Return a list of issue descriptions for the trace.

        Flags a trace longer than 30 seconds, a trace with status "error",
        and any span that errored or ran longer than 10 seconds.
        """
        issues = []

        # Check duration
        if trace.duration > 30000:  # 30 seconds
            issues.append("Long duration")

        # Check status
        if trace.status == "error":
            issues.append("Trace failed")

        # Check spans. analyze_trace treats spans as optional, so tolerate a
        # missing/empty value here instead of raising TypeError on None.
        for span in trace.spans or []:
            if span.status == "error":
                issues.append(f"Span error: {span.name}")
            if span.duration > 10000:  # 10 seconds
                issues.append(f"Long span duration: {span.name}")

        return issues

2. Error Analysis

typescript
/** Summarises an agent's errors and groups them into patterns. */
class ErrorAnalyzer {
  private ants: AgenticAnts

  /**
   * @param client Client used for every query. Defaults to the module-level
   *   `ants` instance, mirroring the Python examples in this guide.
   */
  constructor(client: AgenticAnts = ants) {
    // The field used to be declared but never assigned.
    this.ants = client
  }

  /**
   * Prints the error summary and breakdown for `period`, plus up to ten
   * errors from the last hour.
   *
   * @returns `{ summary, breakdown, recent }` with the three query results.
   */
  async analyzeErrors(agentName: string, period: string = 'last_24h') {
    console.log(`🔍 Analyzing Errors for ${agentName}`)
    console.log('='.repeat(50))

    // Get error summary
    const errorSummary = await this.ants.metrics.getErrorSummary({
      agent: agentName,
      period: period
    })

    console.log(`Total Errors: ${errorSummary.totalErrors}`)
    console.log(`Error Rate: ${errorSummary.errorRate}%`)
    console.log(`Success Rate: ${errorSummary.successRate}%`)

    // Get error breakdown
    const errorBreakdown = await this.ants.metrics.getErrorBreakdown({
      agent: agentName,
      period: period
    })

    console.log('\nError Breakdown:')
    errorBreakdown.forEach(error => {
      console.log(` ${error.type}: ${error.count} (${error.percentage}%)`)
    })

    // Get recent errors (always the last hour, independent of `period`)
    const recentErrors = await this.ants.traces.query({
      agent: agentName,
      status: 'error',
      period: 'last_1h'
    })

    console.log('\nRecent Errors:')
    recentErrors.slice(0, 10).forEach(error => {
      console.log(` ${error.timestamp}: ${error.error}`)
      console.log(` Input: ${error.input.substring(0, 100)}...`)
      console.log(' ---')
    })

    return {
      summary: errorSummary,
      breakdown: errorBreakdown,
      recent: recentErrors
    }
  }

  /** Groups the errors by type, prints the patterns per group, and returns the groups. */
  async analyzeErrorPatterns(errors: any[]) {
    console.log('🔍 Analyzing Error Patterns')
    console.log('='.repeat(50))

    // Group errors by type
    const errorGroups = this.groupErrorsByType(errors)

    console.log('Error Groups:')
    Object.entries(errorGroups).forEach(([type, errorList]) => {
      console.log(`\n${type}: ${errorList.length} occurrences`)

      // Analyze patterns. The helper used to share this method's name, which
      // is a duplicate member and made this call recurse into itself.
      const patterns = this.findCommonPatterns(errorList)
      patterns.forEach(pattern => {
        console.log(` Pattern: ${pattern.description}`)
        console.log(` Frequency: ${pattern.frequency}`)
        console.log(` Solution: ${pattern.solution}`)
      })
    })

    return errorGroups
  }

  /** Buckets errors by `errorType`; errors without one go under 'unknown'. */
  private groupErrorsByType(errors: any[]): Record<string, any[]> {
    const groups: Record<string, any[]> = {}

    errors.forEach(error => {
      const type = error.errorType || 'unknown'
      if (!groups[type]) {
        groups[type] = []
      }
      groups[type].push(error)
    })

    return groups
  }

  /** Placeholder: returns one fixed pattern regardless of the input. */
  private findCommonPatterns(errors: any[]): any[] {
    // Implement pattern analysis logic
    return [
      {
        description: 'Timeout errors',
        frequency: 'high',
        solution: 'Increase timeout or optimize performance'
      }
    ]
  }
}

3. Performance Analysis

python
class PerformanceAnalyzer:
    """Print an agent's metrics, recent trend points, and threshold warnings."""

    def __init__(self):
        self.ants = ants

    def analyze_performance(self, agent_name: str, period: str = "last_24h"):
        """Print the metrics report for the agent and return the metrics."""
        print(f"📊 Performance Analysis for {agent_name}")
        print("=" * 50)

        metrics = self.ants.metrics.get_agent_metrics(agent_name, period=period)

        # (label, metric key, unit suffix) for each report line
        report_rows = (
            ("Request Rate", "request_rate", " req/s"),
            ("Average Response Time", "avg_response_time", "ms"),
            ("P95 Response Time", "p95_response_time", "ms"),
            ("P99 Response Time", "p99_response_time", "ms"),
            ("Error Rate", "error_rate", "%"),
            ("Success Rate", "success_rate", "%"),
        )
        for label, key, unit in report_rows:
            print(f"{label}: {metrics[key]}{unit}")

        history = self.ants.metrics.get_performance_trends(agent_name, period=period)

        print("\nPerformance Trends:")
        # Only the last 7 data points are shown
        for point in history[-7:]:
            print(f" {point['timestamp']}: {point['avg_response_time']}ms avg, {point['error_rate']}% errors")

        warnings = self._identify_performance_issues(metrics)
        if warnings:
            print("\nPerformance Issues:")
            for warning in warnings:
                print(f" ⚠️ {warning}")

        return metrics

    def _identify_performance_issues(self, performance):
        """Return the messages for every threshold the metrics cross."""
        checks = (
            (performance['avg_response_time'] > 5000, "High average response time"),
            (performance['p95_response_time'] > 10000, "High P95 response time"),
            (performance['error_rate'] > 5, "High error rate"),
            (performance['request_rate'] < 1, "Low request rate"),
        )
        return [message for crossed, message in checks if crossed]

Debugging Workflows

1. Systematic Debugging

python
class SystematicDebugger:
    """Run a fixed four-step inspection of an agent, printing each step."""

    def __init__(self):
        self.ants = ants

    def debug_agent_systematically(self, agent_name: str):
        """Check status, recent errors, performance, and resource usage in order.

        An agent that is not active is restarted and the run stops there.
        Nothing is returned; all findings are printed.
        """
        print(f"🔍 Systematic Debugging for {agent_name}")
        print("=" * 50)

        # Step 1: status
        print("Step 1: Checking Agent Status")
        current = self.ants.agents.get_status(agent_name)
        print(f"Status: {current['status']}")

        is_active = current['status'] == 'active'
        if not is_active:
            print("❌ Agent is not active. Fixing...")
            self.ants.agents.restart(agent_name)
            return

        # Step 2: error traces from the last hour
        print("\nStep 2: Checking Recent Errors")
        error_query = {
            "agent": agent_name,
            "status": "error",
            "period": "last_1h"
        }
        failures = self.ants.traces.query(error_query)

        if not failures:
            print("✅ No recent errors found")
        else:
            print(f"❌ Found {len(failures)} errors in the last hour")
            self._analyze_errors(failures)

        # Step 3: response time over the last hour
        print("\nStep 3: Checking Performance")
        metrics = self.ants.metrics.get_agent_metrics(agent_name, period="last_1h")

        if metrics['avg_response_time'] <= 5000:
            print("✅ Performance is acceptable")
        else:
            print("❌ High response time detected")
            self._analyze_performance_issues(agent_name)

        # Step 4: resource usage, each flagged above 80
        print("\nStep 4: Checking Resource Usage")
        usage = self.ants.sre.get_resource_usage(agent_name)

        resource_alerts = (
            ("cpu_usage", "❌ High CPU usage detected"),
            ("memory_usage", "❌ High memory usage detected"),
        )
        for key, alert in resource_alerts:
            if usage[key] > 80:
                print(alert)

        print("✅ Systematic debugging complete")

    def _analyze_errors(self, errors):
        """Print the error, time, and truncated input of the first five errors."""
        print("Error Analysis:")
        for failure in errors[:5]:
            print(f" Error: {failure.error}")
            print(f" Time: {failure.timestamp}")
            print(f" Input: {failure.input[:100]}...")
            print(" ---")

    def _analyze_performance_issues(self, agent_name: str):
        """Print each bottleneck's operation, average time, and suggested optimization."""
        print("Performance Issue Analysis:")
        for entry in self.ants.sre.find_bottlenecks(agent_name):
            print(f" {entry['operation']}: {entry['avg_time']}ms")
            print(f" Optimization: {entry['optimization']}")

2. Automated Debugging

typescript
/** Configures debugging rules and runs the action attached to each triggered rule. */
class AutomatedDebugger {
  private ants: AgenticAnts

  /**
   * @param client Client used for every call. Defaults to the module-level
   *   `ants` instance, mirroring the Python examples in this guide.
   */
  constructor(client: AgenticAnts = ants) {
    // The field used to be declared but never assigned.
    this.ants = client
  }

  /** Registers the three rules (error rate, latency, throughput) with their thresholds. */
  async setupAutomatedDebugging() {
    console.log('🤖 Setting Up Automated Debugging')
    console.log('='.repeat(50))

    // Configure automated debugging rules
    await this.ants.debugging.configureRules({
      highErrorRate: {
        threshold: 5, // 5% error rate
        action: 'analyze_errors',
        notification: true
      },
      highLatency: {
        threshold: 5000, // 5 seconds
        action: 'analyze_performance',
        notification: true
      },
      lowThroughput: {
        threshold: 1, // 1 req/s
        action: 'analyze_bottlenecks',
        notification: false
      }
    })

    console.log('✅ Automated debugging configured')
  }

  /**
   * Checks every rule for the agent and runs the action of each triggered
   * rule, one at a time.
   *
   * @returns The rule results as returned by `checkRules`.
   */
  async runAutomatedDebugging(agentName: string) {
    console.log(`🤖 Running Automated Debugging for ${agentName}`)
    console.log('='.repeat(50))

    // Check all debugging rules
    const results = await this.ants.debugging.checkRules(agentName)

    // for...of rather than forEach: forEach ignores the promise returned by an
    // async callback, so actions would still be running (and their failures
    // unhandled) after this method had already returned.
    for (const result of results) {
      console.log(`\nRule: ${result.rule}`)
      console.log(`Status: ${result.status}`)

      if (result.triggered) {
        console.log(`❌ Rule triggered: ${result.message}`)
        console.log(`Action: ${result.action}`)

        // Execute action
        await this.executeDebuggingAction(result.action, agentName)
      } else {
        console.log(`✅ Rule not triggered`)
      }
    }

    return results
  }

  /** Dispatches an action name to its handler; unrecognised names do nothing. */
  private async executeDebuggingAction(action: string, agentName: string) {
    console.log(`Executing action: ${action}`)

    switch (action) {
      case 'analyze_errors':
        await this.analyzeErrors(agentName)
        break
      case 'analyze_performance':
        await this.analyzePerformance(agentName)
        break
      case 'analyze_bottlenecks':
        await this.analyzeBottlenecks(agentName)
        break
    }
  }

  /** Placeholder handler for 'analyze_errors'. */
  private async analyzeErrors(agentName: string) {
    console.log('Analyzing errors...')
    // Implement error analysis
  }

  /** Placeholder handler for 'analyze_performance'. */
  private async analyzePerformance(agentName: string) {
    console.log('Analyzing performance...')
    // Implement performance analysis
  }

  /** Placeholder handler for 'analyze_bottlenecks'. */
  private async analyzeBottlenecks(agentName: string) {
    console.log('Analyzing bottlenecks...')
    // Implement bottleneck analysis
  }
}

Best Practices

1. Debugging Checklist

python
class DebuggingChecklist:
    """Run a fixed list of health checks against an agent and report each one."""

    def __init__(self):
        self.ants = ants

    def run_debugging_checklist(self, agent_name: str):
        """Run comprehensive debugging checklist.

        Every check returns ``{"passed": bool, "issues": list}``. A check that
        raises is recorded as failed with the exception text as its issue.

        Returns:
            A dict mapping each check's display name to its result.
        """
        print(f"✅ Debugging Checklist for {agent_name}")
        print("=" * 50)

        checklist = [
            ("Agent Status", self.check_agent_status),
            ("Recent Errors", self.check_recent_errors),
            ("Performance Metrics", self.check_performance_metrics),
            ("Resource Usage", self.check_resource_usage),
            ("Configuration", self.check_configuration),
            ("Dependencies", self.check_dependencies),
        ]

        results = {}
        for item, check_function in checklist:
            try:
                result = check_function(agent_name)
                results[item] = result
                status = "✅ PASS" if result['passed'] else "❌ FAIL"
                print(f"{item}: {status}")
                if not result['passed']:
                    print(f" Issues: {result['issues']}")
            except Exception as e:
                # Deliberately broad: one failing check must not stop the
                # rest, and the failure is recorded rather than swallowed.
                results[item] = {"passed": False, "issues": [str(e)]}
                print(f"{item}: ❌ ERROR - {e}")

        return results

    def check_agent_status(self, agent_name: str):
        """Pass when the agent's status is 'active'; otherwise report the status."""
        status = self.ants.agents.get_status(agent_name)
        passed = status['status'] == 'active'
        # A failing check now says why, instead of an empty issue list.
        issues = [] if passed else [f"Agent status is '{status['status']}'"]
        return {"passed": passed, "issues": issues}

    def check_recent_errors(self, agent_name: str):
        """Pass when no error traces were recorded in the last hour."""
        errors = self.ants.traces.query({
            "agent": agent_name,
            "status": "error",
            "period": "last_1h"
        })
        passed = len(errors) == 0
        # A passing check no longer carries a "0 errors found" issue.
        issues = [] if passed else [f"{len(errors)} errors found"]
        return {"passed": passed, "issues": issues}

    def check_performance_metrics(self, agent_name: str):
        """Flag an average response time above 5000 or an error rate above 5."""
        performance = self.ants.metrics.get_agent_metrics(agent_name, period="last_1h")
        issues = []

        if performance['avg_response_time'] > 5000:
            issues.append("High response time")
        if performance['error_rate'] > 5:
            issues.append("High error rate")

        return {"passed": len(issues) == 0, "issues": issues}

    def check_resource_usage(self, agent_name: str):
        """Flag CPU or memory usage above 80."""
        resources = self.ants.sre.get_resource_usage(agent_name)
        issues = []

        if resources['cpu_usage'] > 80:
            issues.append("High CPU usage")
        if resources['memory_usage'] > 80:
            issues.append("High memory usage")

        return {"passed": len(issues) == 0, "issues": issues}

    def check_configuration(self, agent_name: str):
        """Pass when the configuration's 'valid' flag is truthy."""
        config = self.ants.agents.get_configuration(agent_name)
        issues = [] if config['valid'] else ["Configuration is invalid"]
        return {"passed": config['valid'], "issues": issues}

    def check_dependencies(self, agent_name: str):
        """Report every dependency whose 'healthy' flag is falsy."""
        dependencies = self.ants.agents.get_dependencies(agent_name)
        issues = [
            f"Unhealthy dependency: {dep['name']}"
            for dep in dependencies
            if not dep['healthy']
        ]
        return {"passed": len(issues) == 0, "issues": issues}

2. Debugging Strategy

typescript
/** Skeleton of a five-phase debugging run; each phase is a placeholder to fill in. */
class DebuggingStrategy {
  private ants: AgenticAnts

  /**
   * @param client Client available to the phase implementations. Defaults to
   *   the module-level `ants` instance, mirroring the Python examples in this guide.
   */
  constructor(client: AgenticAnts = ants) {
    // The field used to be declared but never assigned.
    this.ants = client
  }

  /**
   * Runs the five phases in order, logging a header before each.
   *
   * @returns `{ assessment, issues, rootCauses, solutions, verification }`,
   *   each holding the object returned by its phase.
   */
  async implementDebuggingStrategy(agentName: string) {
    console.log(`🎯 Implementing Debugging Strategy for ${agentName}`)
    console.log('='.repeat(50))

    // Phase 1: Initial Assessment
    console.log('\nPhase 1: Initial Assessment')
    const assessment = await this.performInitialAssessment(agentName)

    // Phase 2: Issue Identification
    console.log('\nPhase 2: Issue Identification')
    const issues = await this.identifyIssues(agentName)

    // Phase 3: Root Cause Analysis
    // Each phase returns a wrapper object, while the next phase takes a plain
    // array, so unwrap the list instead of passing the wrapper.
    console.log('\nPhase 3: Root Cause Analysis')
    const rootCauses = await this.analyzeRootCauses(issues.issues)

    // Phase 4: Solution Implementation
    console.log('\nPhase 4: Solution Implementation')
    const solutions = await this.implementSolutions(rootCauses.rootCauses)

    // Phase 5: Verification
    console.log('\nPhase 5: Verification')
    const verification = await this.verifySolutions(agentName)

    return {
      assessment,
      issues,
      rootCauses,
      solutions,
      verification
    }
  }

  /** Placeholder: logs and returns `{ status: 'completed' }`. */
  private async performInitialAssessment(agentName: string) {
    console.log('Performing initial assessment...')
    // Implement initial assessment
    return { status: 'completed' }
  }

  /** Placeholder: logs and returns an empty issue list wrapped in `{ issues }`. */
  private async identifyIssues(agentName: string) {
    console.log('Identifying issues...')
    // Implement issue identification
    return { issues: [] }
  }

  /** Placeholder: logs and returns an empty list wrapped in `{ rootCauses }`. */
  private async analyzeRootCauses(issues: any[]) {
    console.log('Analyzing root causes...')
    // Implement root cause analysis
    return { rootCauses: [] }
  }

  /** Placeholder: logs and returns an empty list wrapped in `{ solutions }`. */
  private async implementSolutions(rootCauses: any[]) {
    console.log('Implementing solutions...')
    // Implement solution implementation
    return { solutions: [] }
  }

  /** Placeholder: logs and returns `{ verified: true }`. */
  private async verifySolutions(agentName: string) {
    console.log('Verifying solutions...')
    // Implement solution verification
    return { verified: true }
  }
}

Troubleshooting

Common Debugging Issues

Issue: Agent not responding

python
def debug_agent_not_responding(agent_name: str = "agent_name"):
    """Print a quick status, trace-count, and metrics snapshot for an agent.

    Args:
        agent_name: Agent to inspect. The default is the placeholder name
            that used to be hard-coded, so calls without arguments behave
            as before.
    """
    # Check agent status
    status = ants.agents.get_status(agent_name)
    print(f"Agent status: {status}")

    # Check recent traces
    traces = ants.traces.query({
        "agent": agent_name,
        "period": "last_1h"
    })
    print(f"Recent traces: {len(traces)}")

    # Check performance
    performance = ants.metrics.get_agent_metrics(agent_name)
    print(f"Performance: {performance}")

Issue: High error rates

typescript
/**
 * Prints the number of error traces from the last 24 hours and the same
 * traces grouped by error type.
 *
 * @param agentName Agent to inspect. Defaults to the placeholder name that
 *   used to be hard-coded, so calls without arguments behave as before.
 */
async function debugHighErrorRates(agentName: string = 'agent_name') {
  const errors = await ants.traces.query({
    agent: agentName,
    status: 'error',
    period: 'last_24h'
  })

  console.log(`Total errors: ${errors.length}`)

  // Group by error type. Done inline because `groupErrorsByType` exists only
  // as a private class method in this guide, not as a free function.
  const errorGroups: Record<string, any[]> = {}
  for (const error of errors) {
    const type = error.errorType || 'unknown'
    if (!errorGroups[type]) {
      errorGroups[type] = []
    }
    errorGroups[type].push(error)
  }
  console.log('Error groups:', errorGroups)
}

Next Steps

Example Projects


Congratulations! 🎉 You now have comprehensive debugging tools and techniques to effectively troubleshoot and resolve AI agent issues.

© 2026 ANTS Platform, Inc. · Docs v1.0 · Last updated June 2026