# Configuration for the `devops-bot` agent: identity, personality, LLM
# connection, tools, Matrix rooms, inter-agent delegation, SSH inventory,
# security roles, schedules, observability, resilience and storage.
#
# NOTE(review): this file was recovered from a whitespace-collapsed copy.
# Nesting was reconstructed from the section headers and key names; the
# placements marked NOTE(review) below could not be determined with
# certainty and should be confirmed against the consumer's schema.

# ============================================
# IDENTITY
# ============================================
agent:
  id: devops-bot
  name: "DevOps Agent"
  version: "1.0.0"
  enabled: true
  description: "Gestiona deployments, monitoreo y salud de infraestructura"
  tags: [devops, infrastructure, deployment]

# ============================================
# PERSONALITY AND BEHAVIOR
# ============================================
personality:
  tone: direct  # direct | friendly | formal | casual | technical
  verbosity: concise  # minimal | concise | detailed | verbose
  language: es
  languages_supported: [es, en]
  emoji_style: moderate  # none | minimal | moderate | heavy
  prefix: "🔧"
  error_style: helpful  # terse | helpful | detailed
  # User-facing message templates; `{{.Error}}` / `{{.Summary}}` are
  # placeholders filled in by the consumer.
  templates:
    greeting: "Listo para operar. ¿Qué necesitas?"
    unknown_command: "No reconozco eso. Usa `!help` para ver comandos."
    permission_denied: "No tienes permisos para eso."
    error: "Algo falló: {{.Error}}"
    success: "Hecho. {{.Summary}}"
    busy: "Estoy ejecutando otra tarea ahora. Espera o usa `!queue`."
  # NOTE(review): `behavior` is placed under `personality` because both share
  # one section header; confirm it is not a top-level key.
  behavior:
    proactive: false
    ask_confirmation: true
    show_reasoning: false
    thread_replies: true
    typing_indicator: true
    acknowledge_receipt: true

# ============================================
# LLM — CONNECTION AND REASONING
# ============================================
llm:
  primary:
    provider: anthropic
    model: claude-sonnet-4-20250514
    api_key_env: ANTHROPIC_API_KEY
    base_url: ""
    max_tokens: 4096
    temperature: 0.3
  fallback:
    provider: ollama
    model: llama3
    base_url: "http://localhost:11434/v1"
    max_tokens: 2048
    temperature: 0.5
  reasoning:
    system_prompt_file: "prompts/devops-system.md"
    context_window: 8192
    memory_messages: 20
  # NOTE(review): `tool_use` may belong under `reasoning` instead; confirm.
  tool_use:
    enabled: true
    max_iterations: 5
    parallel_calls: false
  rate_limit:
    requests_per_minute: 30
    tokens_per_minute: 100000
    concurrent_requests: 3

# ============================================
# TOOLS — AVAILABLE CAPABILITIES
# ============================================
tools:
  ssh:
    enabled: true
    # Names match the keys under the top-level `ssh.targets` inventory.
    allowed_targets: [production, staging, monitoring]
    forbidden_commands:
      - "rm -rf /"
      - "dd if="
      - "mkfs"
    timeout: 30s
    max_concurrent: 3
    # NOTE(review): the `healthcheck` and `backup-check` schedules run
    # unattended against `production`; confirm scheduled jobs are exempt
    # from this confirmation step.
    require_confirmation:
      - production
  http:
    enabled: true
    allowed_domains:
      - "api.github.com"
      - "api.gitea.internal"
      - "grafana.internal"
    timeout: 15s
    max_retries: 2
  scripts:
    enabled: true
    scripts_dir: "./scripts/"
    allowed:
      - "deploy.sh"
      - "healthcheck.sh"
      - "rollback.sh"
      # Added: the `daily-report` schedule runs this script, so the
      # allow-list is kept in sync with it (assumes schedules honor this
      # list — confirm).
      - "daily-report.sh"
    timeout: 120s
    # NOTE(review): scripts run unsandboxed; confirm this is intentional.
    sandbox: false
  file_ops:
    enabled: false
    allowed_paths: ["/var/log/", "/tmp/reports/"]
    read_only: true
  # NOTE(review): `mcp` has no section header of its own, so it is kept
  # under `tools`; confirm it is not a top-level key.
  mcp:
    enabled: true
    servers:
      - name: github
        url: "stdio://mcp-github"
        tools: ["create_issue", "list_prs", "merge_pr"]
      - name: filesystem
        url: "stdio://mcp-filesystem"
        tools: ["read_file", "list_dir"]
    expose:
      port: 9100
      tools: ["deploy", "status", "rollback"]

# ============================================
# MATRIX — CONNECTION AND ROOMS
# ============================================
matrix:
  homeserver: "${MATRIX_HOMESERVER}"
  user_id: "@devops-bot:${MATRIX_SERVER_NAME}"
  access_token_env: MATRIX_TOKEN_DEVOPS
  device_id: "DEVOPSBOT01"
  encryption:
    enabled: false  # enable once E2EE is configured
    store_path: "./data/crypto/"
    trust_mode: tofu
  rooms:
    listen:
      - "${MATRIX_ROOM_DEVOPS}"
      - "${MATRIX_ROOM_ALERTS}"
    respond:
      - "${MATRIX_ROOM_DEVOPS}"
      - "${MATRIX_ROOM_LOGS}"
    admin:
      - "${MATRIX_ROOM_ADMIN}"
  filters:
    command_prefix: "!"
    mention_respond: true
    dm_respond: true
    ignore_bots: true
    ignore_users: []
    min_power_level: 0

# ============================================
# INTER-AGENT COMMUNICATION
# ============================================
agents:
  peers:
    - id: monitor-bot
      capabilities: [alerts, metrics, healthcheck]
      room: "${MATRIX_ROOM_AGENTS_INTERNAL}"
    - id: assistant-bot
      capabilities: [search, summarize, translate]
      room: "${MATRIX_ROOM_AGENTS_INTERNAL}"
  delegation:
    enabled: true
    # Both lists reference peer ids declared under `peers` above.
    can_delegate_to: [monitor-bot, assistant-bot]
    can_receive_from: [assistant-bot]
    max_delegation_depth: 2
    timeout: 60s
  protocol:
    format: json
    channel: matrix
    heartbeat_interval: 30s

# ============================================
# SSH — SERVER INVENTORY
# ============================================
ssh:
  defaults:
    user: deploy
    port: 22
    key_file_env: SSH_PRIVATE_KEY_PATH
    known_hosts: "./data/known_hosts"
    keepalive_interval: 15s
    timeout: 10s
  targets:
    production:
      hosts: ["${PROD_HOST_1}", "${PROD_HOST_2}"]
      user: deploy
      jump_host: "${BASTION_HOST}"
    staging:
      hosts: ["${STAGING_HOST}"]
      user: deploy
    monitoring:
      hosts: ["${MONITORING_HOST}"]
      user: monitor
      key_file_env: SSH_MONITOR_KEY_PATH

# ============================================
# PERMISSIONS AND SECURITY
# ============================================
security:
  roles:
    admin:
      users: ["@admin:${MATRIX_SERVER_NAME}"]
      actions: ["*"]
    developer:
      users: ["@dev1:${MATRIX_SERVER_NAME}", "@dev2:${MATRIX_SERVER_NAME}"]
      actions: ["deploy:staging", "status:*", "logs:*"]
    viewer:
      users: ["*"]
      actions: ["status:*", "help"]
  audit:
    enabled: true
    log_file: "./data/audit.log"
    log_to_room: "${MATRIX_ROOM_AUDIT}"
    include: [ssh, deploy, config_change]
  secrets:
    provider: env

# ============================================
# SCHEDULING AND AUTOMATED TASKS
# ============================================
schedules:
  - name: healthcheck
    cron: "*/5 * * * *"
    action:
      kind: ssh
      target: production
      command: "/opt/scripts/healthcheck.sh"
    on_failure:
      notify_room: "${MATRIX_ROOM_ALERTS}"
      escalate_to: "@admin:${MATRIX_SERVER_NAME}"
  - name: daily-report
    cron: "0 9 * * *"
    action:
      kind: script
      script: "daily-report.sh"
    # NOTE(review): `output_room` may belong inside `action`; confirm.
    output_room: "${MATRIX_ROOM_DEVOPS}"
  - name: backup-check
    cron: "0 */6 * * *"
    action:
      kind: ssh
      target: production
      command: "/opt/scripts/check-backups.sh"
    on_failure:
      notify_room: "${MATRIX_ROOM_ALERTS}"
      escalate_to: "@admin:${MATRIX_SERVER_NAME}"

# ============================================
# OBSERVABILITY
# ============================================
observability:
  logging:
    level: info
    format: json
    output: stdout
    file: "./data/agent.log"
  metrics:
    enabled: true
    port: 9090
    path: /metrics
    export: prometheus
  health:
    enabled: true
    port: 8080
    path: /healthz
  tracing:
    enabled: false
    provider: jaeger
    endpoint: "http://jaeger:14268/api/traces"

# ============================================
# RESILIENCE
# ============================================
resilience:
  circuit_breaker:
    failure_threshold: 5
    timeout: 30s
    half_open_max: 2
  retry:
    max_attempts: 3
    backoff: exponential
    initial_delay: 1s
    max_delay: 30s
  shutdown:
    timeout: 15s
    drain_messages: true
    save_state: true
    state_file: "./data/state.json"
  queue:
    enabled: true
    max_size: 50
    priority_users: ["@admin:${MATRIX_SERVER_NAME}"]

# ============================================
# STORAGE AND STATE
# ============================================
storage:
  state:
    backend: sqlite
    path: "./data/agent.db"
  cache:
    enabled: true
    backend: memory
    ttl: 10m
    max_entries: 500
  history:
    backend: sqlite
    path: "./data/history.db"
    retention: 720h  # 30 days