Docs/Guides/Production

Production Best Practices

Deploy monitoring to production safely and effectively with enterprise-grade reliability and performance.

Overview

Deploying AI agents to production requires careful planning, monitoring, and operational excellence. This guide covers:

- Pre-Production Checklist - Ensure readiness for production
- Production Configuration - Optimize for production environments
- Monitoring in Production - Real-time production monitoring
- Incident Response - Handle production incidents effectively
- Scaling Strategies - Scale your AI operations safely

Pre-Production Checklist

1. Infrastructure Readiness

python

class ProductionReadinessCheck:
    """Run pre-deployment infrastructure probes through the AgenticAnts client.

    Each probe is a single client call. A probe passes when the call returns
    without raising and fails, carrying the exception text, otherwise.
    """

    def __init__(self):
        # The API key is read from the environment so it never lives in code.
        self.ants = AgenticAnts(api_key=os.getenv('AGENTICANTS_API_KEY'))

    def check_infrastructure_readiness(self):
        """Verify infrastructure is ready for production.

        Returns:
            A list of ``(check_name, status)`` tuples, always in the order
            API connectivity, monitoring, alerting, backup. ``status`` is
            ``"✅ PASS"`` or ``"❌ FAIL: <exception text>"``.
        """
        # Lambdas defer the attribute lookups so that a missing client
        # attribute is reported as a failed check instead of escaping.
        probes = (
            ("API Connectivity", lambda: self.ants.health.check()),
            ("Monitoring Setup", lambda: self.ants.metrics.get_system_metrics()),
            ("Alerting Configuration", lambda: self.ants.alerts.list()),
            ("Backup Systems", lambda: self.ants.backup.check_status()),
        )

        checks = []
        for label, probe in probes:
            try:
                probe()  # only success/failure matters; the result is unused
            except Exception as e:  # deliberately broad: any error means "not ready"
                checks.append((label, f"❌ FAIL: {e}"))
            else:
                checks.append((label, "✅ PASS"))
        return checks

    def print_readiness_report(self):
        """Print every check result followed by an overall verdict."""
        print("🚀 Production Readiness Report")
        print("=" * 50)

        checks = self.check_infrastructure_readiness()
        for check_name, status in checks:
            print(f"{check_name}: {status}")

        # Overall status: a check failed when its status carries the ❌ marker.
        failed_checks = [check for check in checks if "❌" in check[1]]
        if failed_checks:
            print(f"\n❌ {len(failed_checks)} checks failed. Fix before deploying to production.")
        else:
            print("\n✅ All checks passed. Ready for production deployment!")

2. Security and Compliance

typescript

/** Outcome of a single security probe. `error` is set only on failure. */
type SecurityCheckResult = {
  check: string
  status: 'PASS' | 'FAIL'
  error?: string
}

/**
 * Runs security and compliance probes through an AgenticAnts client before a
 * production rollout. A probe passes when its client call resolves and fails
 * when it rejects.
 */
class ProductionSecurityCheck {
  private readonly ants: AgenticAnts

  /**
   * @param ants Configured AgenticAnts client. It is injected because the
   *   field previously had no initialiser, leaving `this.ants` undefined.
   */
  constructor(ants: AgenticAnts) {
    this.ants = ants
  }

  /**
   * Run every probe in a fixed order and collect the outcomes.
   *
   * @returns One entry per probe; never rejects because each probe's failure
   *   is captured in its entry.
   */
  async checkSecurityReadiness(): Promise<SecurityCheckResult[]> {
    // Arrow functions defer the client access so lookup errors are captured too.
    const probes: Array<[string, () => Promise<unknown>]> = [
      ['API Key Security', () => this.ants.security.validateApiKey()],
      ['Data Encryption', () => this.ants.security.checkEncryption()],
      ['Access Controls', () => this.ants.security.checkAccessControls()],
      ['Compliance Requirements', () => this.ants.compliance.checkStatus()]
    ]

    const securityChecks: SecurityCheckResult[] = []
    for (const [check, probe] of probes) {
      try {
        await probe()
        securityChecks.push({ check, status: 'PASS' })
      } catch (error) {
        // A thrown value is not guaranteed to be an Error instance.
        const message = error instanceof Error ? error.message : String(error)
        securityChecks.push({ check, status: 'FAIL', error: message })
      }
    }
    return securityChecks
  }

  /** Print each probe outcome, with the error text under failed probes. */
  async printSecurityReport(): Promise<void> {
    console.log('🔒 Production Security Report')
    console.log('='.repeat(50))

    const checks = await this.checkSecurityReadiness()
    for (const check of checks) {
      const status = check.status === 'PASS' ? '✅' : '❌'
      console.log(`${check.check}: ${status} ${check.status}`)
      if (check.error) {
        console.log(` Error: ${check.error}`)
      }
    }
  }
}

3. Performance Baseline

python

class ProductionPerformanceBaseline:
    """Measure a latency / throughput / error-rate baseline before go-live."""

    def __init__(self):
        self.ants = AgenticAnts(api_key=os.getenv('AGENTICANTS_API_KEY'))

    def establish_performance_baseline(self, sample_count=100):
        """Establish performance baseline for production.

        Args:
            sample_count: Number of timed calls to ``simulate_agent_call``.
                Defaults to 100, the value that used to be hard-coded.

        Returns:
            A dict with ``avg_response_time``, ``p95_response_time`` and
            ``p99_response_time`` (seconds), plus ``max_throughput`` and
            ``baseline_error_rate`` from the corresponding test methods.

        Raises:
            ValueError: If ``sample_count`` is less than 1.
        """
        if sample_count < 1:
            raise ValueError("sample_count must be at least 1")

        response_times = []
        for _ in range(sample_count):
            # perf_counter is monotonic, so clock adjustments cannot produce
            # negative or skewed durations the way time.time() can.
            started = time.perf_counter()
            self.simulate_agent_call()
            response_times.append(time.perf_counter() - started)

        ordered = sorted(response_times)  # sort once, reuse for both percentiles
        baseline_metrics = {
            "avg_response_time": sum(response_times) / len(response_times),
            "p95_response_time": self._percentile(ordered, 95),
            "p99_response_time": self._percentile(ordered, 99),
        }
        baseline_metrics["max_throughput"] = self.test_throughput()
        baseline_metrics["baseline_error_rate"] = self.test_error_rate()
        return baseline_metrics

    @staticmethod
    def _percentile(ordered, pct):
        """Return the nearest-rank ``pct``-th percentile of a sorted sequence.

        The rank is ``ceil(pct / 100 * n)`` (1-based), so for 100 samples the
        95th percentile is the 95th smallest value, i.e. index 94. The
        previous ``sorted(...)[95]`` / ``[99]`` were one position too high,
        making "p99" simply the maximum.
        """
        count = len(ordered)
        rank = int(-(-pct * count // 100))  # ceiling division
        return ordered[min(max(rank, 1), count) - 1]

    def simulate_agent_call(self):
        """Placeholder for one representative agent invocation.

        Replace or override with a real call; the baseline loop times it.
        """
        return None

    def test_throughput(self):
        """Test maximum throughput"""
        # Implement throughput testing
        return 100  # requests per second

    def test_error_rate(self):
        """Test baseline error rate"""
        # Implement error rate testing
        return 0.01  # 1% error rate

Production Configuration

1. Environment Configuration

python

class ProductionConfiguration:
    """Push production settings and alert rules to an AgenticAnts client."""

    def __init__(self):
        self.ants = AgenticAnts(api_key=os.getenv('AGENTICANTS_API_KEY'))

    def configure_production_environment(self):
        """Configure AgenticAnts for production"""
        client = self.ants
        retention_policy = {
            "traces": "90_days",
            "metrics": "1_year",
            "logs": "7_years"
        }

        client.config.set_production_mode(True)
        client.config.set_sampling_rate(1.0)  # 100% sampling in production
        client.config.set_data_retention(retention_policy)

        # NOTE(review): this calls the client's alerts.configure_production_alerts(),
        # not the configure_production_alerts() method defined below — confirm
        # which one is intended.
        client.alerts.configure_production_alerts()
        client.backup.configure_production_backup()

        print("✅ Production environment configured successfully")

    def configure_production_alerts(self):
        """Configure production-specific alerts"""
        alert_rules = (
            # High error rate alert
            {
                "name": "High Error Rate",
                "condition": "error_rate > 5%",
                "window": "5m",
                "channels": ["email", "slack", "pagerduty"],
                "severity": "critical"
            },
            # High latency alert
            {
                "name": "High Latency",
                "condition": "p95_latency > 5000ms",
                "window": "10m",
                "channels": ["email", "slack"],
                "severity": "warning"
            },
            # Cost spike alert
            {
                "name": "Cost Spike",
                "condition": "daily_cost > 1000",
                "window": "1d",
                "channels": ["email", "slack"],
                "severity": "warning"
            },
        )
        for rule in alert_rules:
            self.ants.alerts.create(rule)

        print("✅ Production alerts configured successfully")

2. Load Balancing and Scaling

typescript

/**
 * Applies load-balancing, auto-scaling and circuit-breaker settings through
 * an AgenticAnts client.
 */
class ProductionScaling {
  private readonly ants: AgenticAnts

  /**
   * @param ants Configured AgenticAnts client. It is injected because the
   *   field previously had no initialiser, leaving `this.ants` undefined.
   */
  constructor(ants: AgenticAnts) {
    this.ants = ants
  }

  /** Configure load balancing for multiple instances. */
  async configureLoadBalancing(): Promise<void> {
    await this.ants.config.setLoadBalancing({
      strategy: 'round_robin',
      healthCheckInterval: 30,
      maxRetries: 3,
      timeout: 5000
    })

    console.log('✅ Load balancing configured')
  }

  /** Configure auto-scaling based on metrics. */
  async configureAutoScaling(): Promise<void> {
    await this.ants.scaling.configureAutoScaling({
      minInstances: 2,
      maxInstances: 10,
      scaleUpThreshold: 80, // CPU usage
      scaleDownThreshold: 30,
      scaleUpCooldown: 300, // 5 minutes
      scaleDownCooldown: 600 // 10 minutes
    })

    console.log('✅ Auto-scaling configured')
  }

  /** Configure circuit breakers for resilience. */
  async configureCircuitBreakers(): Promise<void> {
    await this.ants.resilience.configureCircuitBreakers({
      failureThreshold: 5,
      recoveryTimeout: 30000,
      halfOpenMaxCalls: 3
    })

    console.log('✅ Circuit breakers configured')
  }
}

3. Data Management

python

class ProductionDataManagement:
    """Send encryption, retention, backup and export settings to the client."""

    def __init__(self):
        self.ants = AgenticAnts(api_key=os.getenv('AGENTICANTS_API_KEY'))

    def configure_data_management(self):
        """Configure data management for production"""
        client = self.ants

        encryption_settings = {
            "at_rest": "AES-256",
            "in_transit": "TLS-1.3",
            "key_rotation": "90_days"
        }
        retention_settings = {
            "traces": "90_days",
            "metrics": "1_year",
            "logs": "7_years",
            "backups": "1_year"
        }
        backup_settings = {
            "frequency": "daily",
            "retention": "30_days",
            "encryption": True
        }
        export_settings = {
            "format": "json",
            "compression": True,
            "encryption": True
        }

        # Applied in this order: encryption, retention, backup, export.
        client.security.configure_encryption(encryption_settings)
        client.data.configure_retention(retention_settings)
        client.backup.configure_backup(backup_settings)
        client.data.configure_export(export_settings)

        print("✅ Data management configured successfully")

Monitoring in Production

1. Real-Time Monitoring

python

class ProductionMonitoring:
    """Set up dashboards, health checks, metrics and cost monitoring."""

    def __init__(self):
        self.ants = AgenticAnts(api_key=os.getenv('AGENTICANTS_API_KEY'))

    def setup_production_monitoring(self):
        """Setup comprehensive production monitoring"""
        client = self.ants

        # NOTE(review): this calls the client's dashboards.create_production_dashboard(),
        # not the create_production_dashboard() method defined below — confirm
        # which one is intended.
        client.dashboards.create_production_dashboard()   # real-time dashboards
        client.health.setup_production_health_checks()    # health checks
        client.metrics.setup_production_metrics()         # performance monitoring
        client.finops.setup_production_cost_monitoring()  # cost monitoring

        print("✅ Production monitoring setup complete")

    def create_production_dashboard(self):
        """Create production monitoring dashboard"""

        def timeseries(title, metric, time_range):
            # Builds one time-series widget definition.
            return {
                "type": "timeseries",
                "title": title,
                "metric": metric,
                "time_range": time_range
            }

        error_gauge = {
            "type": "gauge",
            "title": "Error Rate",
            "metric": "error_rate",
            "thresholds": [5, 10]
        }
        spec = {
            "name": "Production Overview",
            "widgets": [
                timeseries("Request Rate", "requests_per_second", "1h"),
                timeseries("Response Time", "p95_latency", "1h"),
                error_gauge,
                timeseries("Cost", "daily_cost", "7d"),
            ]
        }
        return self.ants.dashboards.create(spec)

2. Health Checks

typescript

/**
 * Registers endpoint health checks through an AgenticAnts client and prints
 * the current per-component status.
 */
class ProductionHealthChecks {
  private readonly ants: AgenticAnts

  /**
   * @param ants Configured AgenticAnts client. It is injected because the
   *   field previously had no initialiser, leaving `this.ants` undefined.
   */
  constructor(ants: AgenticAnts) {
    this.ants = ants
  }

  /** Configure health checks for the API, database and vector-DB endpoints. */
  async setupHealthChecks(): Promise<void> {
    // NOTE(review): interval/timeout look like seconds — confirm against the SDK.
    await this.ants.health.configureHealthChecks({
      endpoints: [
        {
          name: 'api-health',
          url: '/api/health',
          interval: 30,
          timeout: 5,
          expectedStatus: 200
        },
        {
          name: 'database-health',
          url: '/api/db/health',
          interval: 60,
          timeout: 10,
          expectedStatus: 200
        },
        {
          name: 'vector-db-health',
          url: '/api/vector/health',
          interval: 60,
          timeout: 10,
          expectedStatus: 200
        }
      ]
    })

    console.log('✅ Health checks configured')
  }

  /**
   * Fetch the health status, print one line per component (plus the error for
   * unhealthy ones) and return the raw status object.
   */
  async getHealthStatus() {
    const healthStatus = await this.ants.health.getStatus()

    console.log('🏥 Production Health Status')
    console.log('='.repeat(50))

    for (const [component, status] of Object.entries(healthStatus)) {
      const icon = status.healthy ? '✅' : '❌'
      console.log(`${component}: ${icon} ${status.status}`)

      if (!status.healthy) {
        console.log(` Error: ${status.error}`)
      }
    }

    return healthStatus
  }
}

3. Performance Monitoring

python

class ProductionPerformanceMonitoring:
    """Print current production metrics and recent performance trends."""

    def __init__(self):
        self.ants = AgenticAnts(api_key=os.getenv('AGENTICANTS_API_KEY'))

    def monitor_production_performance(self):
        """Monitor production performance metrics"""
        snapshot = self.ants.metrics.get_production_metrics()

        print("📊 Production Performance Metrics")
        print("=" * 50)
        print(f"Request Rate: {snapshot['request_rate']} req/s")
        print(f"Response Time (P95): {snapshot['p95_latency']}ms")
        print(f"Error Rate: {snapshot['error_rate']}%")
        print(f"Success Rate: {snapshot['success_rate']}%")
        print(f"Active Connections: {snapshot['active_connections']}")

        # (metric key, limit, warning) — a warning prints when the value
        # is strictly above its limit.
        warning_rules = (
            ("error_rate", 5, "⚠️ High error rate detected!"),
            ("p95_latency", 5000, "⚠️ High latency detected!"),
            ("active_connections", 1000, "⚠️ High connection count detected!"),
        )
        for key, limit, warning in warning_rules:
            if snapshot[key] > limit:
                print(warning)

        return snapshot

    def get_performance_trends(self):
        """Get performance trends over time"""
        history = self.ants.metrics.get_performance_trends(
            period="last_24h",
            granularity="1h"
        )

        print("📈 Performance Trends (Last 24h)")
        print("=" * 50)
        for point in history:
            print(f"{point['timestamp']}: {point['request_rate']} req/s, {point['p95_latency']}ms")

        return history

Incident Response

1. Incident Detection

python

class ProductionIncidentResponse:
    """Register incident-detection rules and run incident response workflows."""

    def __init__(self):
        self.ants = AgenticAnts(api_key=os.getenv('AGENTICANTS_API_KEY'))

    def setup_incident_detection(self):
        """Setup incident detection and response"""
        incidents = self.ants.incidents

        detection_rules = {
            "high_error_rate": {
                "condition": "error_rate > 10%",
                "window": "5m",
                "severity": "critical"
            },
            "high_latency": {
                "condition": "p95_latency > 10000ms",
                "window": "10m",
                "severity": "high"
            },
            "cost_spike": {
                "condition": "hourly_cost > 100",
                "window": "1h",
                "severity": "medium"
            }
        }

        # Rules are registered first, then the response workflows.
        incidents.configure_detection_rules(detection_rules)
        incidents.configure_response_workflows()

        print("✅ Incident detection and response configured")

    def handle_incident(self, incident_id: str):
        """Handle production incident"""
        details = self.ants.incidents.get(incident_id)

        summary = (
            f"🚨 Incident {incident_id} detected",
            f"Severity: {details['severity']}",
            f"Description: {details['description']}",
            f"Affected systems: {details['affected_systems']}",
        )
        for line in summary:
            print(line)

        # Execute incident response workflow
        outcome = self.ants.incidents.execute_response_workflow(incident_id)
        print(f"Response actions: {outcome['actions']}")

        return outcome

2. Incident Response Workflows

typescript

/**
 * Registers per-severity incident response workflows through an AgenticAnts
 * client and executes the workflow matching an incident's severity.
 */
class IncidentResponseWorkflows {
  private readonly ants: AgenticAnts

  /**
   * @param ants Configured AgenticAnts client. It is injected because the
   *   field previously had no initialiser, leaving `this.ants` undefined.
   */
  constructor(ants: AgenticAnts) {
    this.ants = ants
  }

  /** Configure automated response workflows for critical, high and medium severities. */
  async configureResponseWorkflows(): Promise<void> {
    await this.ants.incidents.configureWorkflows({
      critical: {
        steps: [
          { action: 'notify', channels: ['pagerduty', 'slack'] },
          { action: 'scale', instances: 2 },
          { action: 'enable', feature: 'circuit_breaker' },
          { action: 'notify', channels: ['slack'], message: 'Mitigation applied' }
        ]
      },
      high: {
        steps: [
          { action: 'notify', channels: ['slack'] },
          { action: 'investigate', automated: true },
          { action: 'notify', channels: ['slack'], message: 'Investigation started' }
        ]
      },
      medium: {
        steps: [
          { action: 'notify', channels: ['email'] },
          { action: 'log', level: 'warning' }
        ]
      }
    })

    console.log('✅ Incident response workflows configured')
  }

  /**
   * Look up the incident, fetch the workflow for its severity and run the
   * steps strictly in order. A rejected step aborts the remaining steps.
   *
   * @param incidentId Identifier of the incident to respond to.
   */
  async executeIncidentResponse(incidentId: string): Promise<void> {
    const incident = await this.ants.incidents.get(incidentId)

    console.log(`🚨 Executing incident response for ${incidentId}`)
    console.log(`Severity: ${incident.severity}`)

    // Execute appropriate workflow based on severity
    const workflow = await this.ants.incidents.getWorkflow(incident.severity)

    for (const step of workflow.steps) {
      console.log(`Executing: ${step.action}`)
      await this.ants.incidents.executeStep(step)
    }

    console.log('✅ Incident response workflow completed')
  }
}

3. Post-Incident Analysis

python

class PostIncidentAnalysis:
    """Print an incident's timeline, root cause and recommendations."""

    def __init__(self):
        self.ants = AgenticAnts(api_key=os.getenv('AGENTICANTS_API_KEY'))

    def conduct_post_incident_analysis(self, incident_id: str):
        """Conduct post-incident analysis"""
        incidents = self.ants.incidents

        # The fetched incident is not used below; the call is kept so the
        # sequence of client calls stays the same.
        incidents.get(incident_id)

        print(f"📋 Post-Incident Analysis for {incident_id}")
        print("=" * 50)

        # Analyze incident timeline
        events = incidents.get_timeline(incident_id)
        print("Timeline:")
        for entry in events:
            print(f" {entry['timestamp']}: {entry['description']}")

        # Analyze root cause
        cause = incidents.analyze_root_cause(incident_id)
        print(f"\nRoot Cause: {cause['description']}")

        # Generate recommendations
        suggestions = incidents.generate_recommendations(incident_id)
        print("\nRecommendations:")
        for suggestion in suggestions:
            print(f" - {suggestion['description']}")

        # Generate post-incident report
        return incidents.generate_post_incident_report(incident_id)

Scaling Strategies

1. Horizontal Scaling

python

class ProductionScaling:
    """Configure horizontal scaling and report the current scaling metrics."""

    def __init__(self):
        self.ants = AgenticAnts(api_key=os.getenv('AGENTICANTS_API_KEY'))

    def setup_horizontal_scaling(self):
        """Setup horizontal scaling for production"""
        scaling = self.ants.scaling

        auto_scaling_policy = {
            "min_instances": 2,
            "max_instances": 10,
            "scale_up_threshold": 80,  # CPU usage
            "scale_down_threshold": 30,
            "scale_up_cooldown": 300,  # 5 minutes
            "scale_down_cooldown": 600  # 10 minutes
        }
        load_balancing_policy = {
            "strategy": "round_robin",
            "health_check_interval": 30,
            "max_retries": 3,
            "timeout": 5000
        }

        # Auto-scaling is configured before load balancing.
        scaling.configure_auto_scaling(auto_scaling_policy)
        scaling.configure_load_balancing(load_balancing_policy)

        print("✅ Horizontal scaling configured")

    def monitor_scaling_metrics(self):
        """Monitor scaling metrics"""
        stats = self.ants.scaling.get_scaling_metrics()

        report = (
            "📊 Scaling Metrics",
            "=" * 50,
            f"Current Instances: {stats['current_instances']}",
            f"CPU Usage: {stats['cpu_usage']}%",
            f"Memory Usage: {stats['memory_usage']}%",
            f"Request Rate: {stats['request_rate']} req/s",
        )
        for line in report:
            print(line)

        # Check scaling triggers
        if stats['cpu_usage'] > 80:
            print("⚠️ High CPU usage - scaling up may be triggered")
        if stats['cpu_usage'] < 30:
            print("ℹ️ Low CPU usage - scaling down may be triggered")

        return stats

2. Vertical Scaling

typescript

/**
 * Configures vertical scaling through an AgenticAnts client and prints the
 * current resource usage.
 */
class VerticalScaling {
  private readonly ants: AgenticAnts

  /**
   * @param ants Configured AgenticAnts client. It is injected because the
   *   field previously had no initialiser, leaving `this.ants` undefined.
   */
  constructor(ants: AgenticAnts) {
    this.ants = ants
  }

  /** Configure vertical scaling based on resource usage. */
  async setupVerticalScaling(): Promise<void> {
    await this.ants.scaling.configureVerticalScaling({
      cpuThreshold: 80,
      memoryThreshold: 85,
      scaleUpFactor: 1.5,
      scaleDownFactor: 0.8,
      cooldownPeriod: 300
    })

    console.log('✅ Vertical scaling configured')
  }

  /**
   * Fetch resource metrics, print them, warn when CPU or memory usage is
   * above 90, and return the raw metrics object.
   */
  async monitorResourceUsage() {
    const resourceMetrics = await this.ants.scaling.getResourceMetrics()

    console.log('📊 Resource Usage Metrics')
    console.log('='.repeat(50))
    console.log(`CPU Usage: ${resourceMetrics.cpuUsage}%`)
    console.log(`Memory Usage: ${resourceMetrics.memoryUsage}%`)
    console.log(`Disk Usage: ${resourceMetrics.diskUsage}%`)
    console.log(`Network Usage: ${resourceMetrics.networkUsage}%`)

    // Check for resource constraints. These warning levels (90) are higher
    // than the configured scaling thresholds (80 / 85) above.
    if (resourceMetrics.cpuUsage > 90) {
      console.log('⚠️ High CPU usage detected')
    }

    if (resourceMetrics.memoryUsage > 90) {
      console.log('⚠️ High memory usage detected')
    }

    return resourceMetrics
  }
}

Best Practices

1. Production-Ready Code

python

class ProductionReadyAgent:
    """Agent request handler that traces every request and maps failures to HTTP errors."""

    # Upper bound on len(request["input"]) accepted by validate_request.
    MAX_INPUT_LENGTH = 10000

    def __init__(self):
        self.ants = AgenticAnts(api_key=os.getenv('AGENTICANTS_API_KEY'))

    def process_request(self, request: dict):
        """Process request with production-ready error handling.

        Args:
            request: Incoming payload; must carry a non-empty ``"input"``.

        Returns:
            Whatever ``process_core_logic`` returns for the request.

        Raises:
            HTTPException: 400 when validation fails, 500 for processing or
                unexpected errors. The original exception is attached as the
                cause so it stays available to logs and debuggers while the
                client only sees the generic detail.
        """
        # Create trace for monitoring
        trace = self.ants.trace.create(
            name="production-agent",
            input=request,
            metadata={
                "environment": "production",
                "version": "1.0.0"
            }
        )

        try:
            self.validate_request(request)
            result = self.process_core_logic(request)

            # NOTE(review): trace.duration is read before complete() is
            # called — confirm the SDK populates it at this point.
            trace.complete(
                output=result,
                metadata={
                    "success": True,
                    "processing_time": trace.duration
                }
            )
            return result

        except ValidationError as e:
            trace.error(error=str(e), metadata={"error_type": "validation"})
            raise HTTPException(status_code=400, detail=str(e)) from e

        except ProcessingError as e:
            trace.error(error=str(e), metadata={"error_type": "processing"})
            raise HTTPException(status_code=500, detail="Internal processing error") from e

        except Exception as e:
            # Boundary handler: record the failure, hide internals from the client.
            trace.error(error=str(e), metadata={"error_type": "unexpected"})
            raise HTTPException(status_code=500, detail="Internal server error") from e

    def validate_request(self, request: dict):
        """Validate incoming request.

        Raises:
            ValidationError: If ``"input"`` is missing or empty, or longer
                than ``MAX_INPUT_LENGTH``.
        """
        if not request.get("input"):
            raise ValidationError("Input is required")

        if len(request["input"]) > self.MAX_INPUT_LENGTH:
            raise ValidationError("Input too long")

    def process_core_logic(self, request: dict):
        """Core business logic"""
        # Implement your agent logic here
        return {"result": "processed"}

2. Monitoring and Alerting

typescript

/**
 * Enables monitoring and alerting, and creates the production dashboards,
 * through an AgenticAnts client.
 */
class ProductionMonitoring {
  private readonly ants: AgenticAnts

  /**
   * @param ants Configured AgenticAnts client. It is injected because the
   *   field previously had no initialiser, leaving `this.ants` undefined.
   */
  constructor(ants: AgenticAnts) {
    this.ants = ants
  }

  /** Configure comprehensive monitoring: metrics, alerting and dashboards. */
  async setupProductionMonitoring(): Promise<void> {
    await this.ants.monitoring.configure({
      metrics: {
        performance: true,
        errors: true,
        costs: true,
        security: true
      },
      alerting: {
        enabled: true,
        channels: ['email', 'slack', 'pagerduty']
      },
      dashboards: {
        realTime: true,
        historical: true
      }
    })

    console.log('✅ Production monitoring configured')
  }

  /**
   * Create the "Production Overview" and "Cost Monitoring" dashboards, one
   * after the other. A rejected create call aborts the remaining ones.
   */
  async createProductionDashboards(): Promise<void> {
    const dashboards = [
      {
        name: 'Production Overview',
        widgets: [
          { type: 'timeseries', metric: 'request_rate' },
          { type: 'gauge', metric: 'error_rate' },
          { type: 'timeseries', metric: 'response_time' }
        ]
      },
      {
        name: 'Cost Monitoring',
        widgets: [
          { type: 'timeseries', metric: 'daily_cost' },
          { type: 'counter', metric: 'total_cost' }
        ]
      }
    ]

    for (const dashboard of dashboards) {
      await this.ants.dashboards.create(dashboard)
    }

    console.log('✅ Production dashboards created')
  }
}

Troubleshooting

Common Production Issues

Issue: High error rates in production

python

def debug_production_errors(client=None, period="last_24h"):
    """Print a summary of every errored production trace and return the records.

    Args:
        client: AgenticAnts client to query. When omitted, the module-level
            ``ants`` client is used, as the snippet originally did.
        period: Value sent as the query's ``"period"`` field.

    Returns:
        The records returned by ``client.traces.query``.
    """
    if client is None:
        client = ants  # module-level client; must be defined by the caller's script

    # Analyze production errors
    errors = client.traces.query({
        "status": "error",
        "environment": "production",
        "period": period
    })

    print("Production Error Analysis:")
    for record in errors:
        # A trace may carry no metadata; treat that the same as "no agent".
        metadata = record.metadata or {}
        print(f"Error: {record.error}")
        print(f"Time: {record.timestamp}")
        print(f"Agent: {metadata.get('agent')}")
        print(f"Request: {record.input}")
        print("---")

    return errors

Issue: Performance degradation

typescript

/**
 * Query the module-level `ants` client for performance issues over the last
 * 24 hours (threshold 5000) and print the agent, latency and time of each.
 */
async function debugPerformanceDegradation() {
  const query = {
    period: 'last_24h',
    threshold: 5000 // 5 seconds
  }
  const performanceIssues = await ants.metrics.getPerformanceIssues(query)

  console.log('Performance Issues:')
  for (const issue of performanceIssues) {
    console.log(`Agent: ${issue.agent}`)
    console.log(`Latency: ${issue.latency}ms`)
    console.log(`Time: ${issue.timestamp}`)
  }
}

Next Steps

- Cost Optimization - Reduce costs with our optimization guide
- Debugging - Troubleshoot issues with our debugging guide
- Security - Learn about security best practices

Example Projects

Production Customer Support Bot - GitHub Repository
Scalable Content Generation - GitHub Repository
Enterprise AI Platform - GitHub Repository

Congratulations! 🎉 You now have a production-ready AI system with comprehensive monitoring, alerting, and incident response capabilities.