FIXES:
1. Truncation was too aggressive (2KB threshold, truncate to 2000 chars)
2. Important data was being lost in medium-sized outputs (2-10KB)

Changes:
- Raise pass-through threshold from 2KB to 5KB
- Medium outputs (5-20KB) now use hierarchical extraction instead of truncation
- _extract_key_findings already handles chunking automatically
- Better preservation of important data like service lists

Benefits:
- Full service lists will now be properly analyzed
- No more missing services due to truncation
- Macha can see the complete picture before responding
#!/usr/bin/env python3
"""
AI Agent - Analyzes system state and proposes solutions using local LLMs
"""

import json
import requests
import subprocess
from typing import Dict, List, Any, Optional
from pathlib import Path
from datetime import datetime

from tools import SysadminTools


class MachaAgent:
    """AI agent that analyzes system issues and proposes fixes"""

    # Load system prompt from file
    @staticmethod
    def _load_system_prompt() -> str:
        """Load the system prompt from file"""
        prompt_file = Path(__file__).parent / "system_prompt.txt"
        try:
            return prompt_file.read_text()
        except Exception as e:
            print(f"Warning: Could not load system prompt from {prompt_file}: {e}")
            return "You are Macha, an autonomous AI system maintenance agent."

    # staticmethod objects are not directly callable during class-body execution
    # on Python < 3.10, so call the wrapped function via __func__.
    SYSTEM_PROMPT = _load_system_prompt.__func__()

    def __init__(
        self,
        ollama_host: str = "http://localhost:11434",
        model: str = "gpt-oss:latest",
        state_dir: Path = Path("/var/lib/macha"),
        context_db=None,
        config_repo: str = "git+https://git.coven.systems/lily/nixos-servers",
        config_branch: str = "main",
        enable_tools: bool = True,
        use_queue: bool = True,
        priority: str = "INTERACTIVE"
    ):
        self.ollama_host = ollama_host
        self.model = model
        self.state_dir = state_dir
        self.state_dir.mkdir(parents=True, exist_ok=True)
        self.decision_log = self.state_dir / "decisions.jsonl"
        self.context_db = context_db
        self.config_repo = config_repo
        self.config_branch = config_branch
        self.enable_tools = enable_tools

        # Queue settings
        self.use_queue = use_queue
        self.priority = priority
        self.ollama_queue = None

        if use_queue:
            try:
                from ollama_queue import OllamaQueue, Priority
                self.ollama_queue = OllamaQueue()
                self.priority_level = getattr(Priority, priority, Priority.INTERACTIVE)
            except (PermissionError, OSError):
                # Silently fall back to direct API calls when queue is not accessible
                # (e.g., regular users don't have access to /var/lib/macha/queues)
                self.use_queue = False
            except Exception as e:
                # Log unexpected errors but still fall back gracefully
                import sys
                print(f"Note: Ollama queue unavailable ({type(e).__name__}), using direct API", file=sys.stderr)
                self.use_queue = False

        # Initialize tools system
        self.tools = SysadminTools(safe_mode=False) if enable_tools else None

        # Tool output cache for hierarchical processing
        self.tool_output_cache = {}
        self.cache_dir = self.state_dir / "tool_cache"

        # Only create cache dir if we have write access (running as macha user)
        try:
            self.cache_dir.mkdir(parents=True, exist_ok=True)
        except (PermissionError, OSError):
            # Running as unprivileged user (macha-chat), use temp dir instead
            import tempfile
            self.cache_dir = Path(tempfile.mkdtemp(prefix="macha_cache_"))

    def _query_relevant_knowledge(self, query: str, limit: int = 3) -> str:
        """
        Query knowledge base for relevant information

        Returns formatted string of relevant knowledge to include in prompts
        """
        if not self.context_db:
            return ""

        try:
            knowledge_items = self.context_db.query_knowledge(query, limit=limit)
            if not knowledge_items:
                return ""

            knowledge_text = "\n\nRELEVANT KNOWLEDGE FROM EXPERIENCE:\n"
            for item in knowledge_items:
                knowledge_text += f"\n• {item['topic']} ({item['category']}):\n"
                knowledge_text += f"  {item['knowledge']}\n"
                knowledge_text += f"  [Confidence: {item['confidence']}, Referenced: {item['times_referenced']} times]\n"

            return knowledge_text
        except Exception as e:
            print(f"Error querying knowledge: {e}")
            return ""

    def store_learning(
        self,
        topic: str,
        knowledge: str,
        category: str = "experience",
        confidence: str = "medium",
        tags: Optional[list] = None
    ) -> bool:
        """
        Store a learned insight into the knowledge base

        Args:
            topic: What this is about
            knowledge: The insight/pattern/learning
            category: Type of knowledge
            confidence: How confident we are
            tags: Optional tags

        Returns:
            True if stored successfully
        """
        if not self.context_db:
            return False

        try:
            kid = self.context_db.store_knowledge(
                topic=topic,
                knowledge=knowledge,
                category=category,
                source="experience",
                confidence=confidence,
                tags=tags
            )
            if kid:
                print(f"📚 Learned: {topic}")
                return True
            return False
        except Exception as e:
            print(f"Error storing learning: {e}")
            return False

    def reflect_and_learn(
        self,
        situation: str,
        action_taken: str,
        outcome: str,
        success: bool
    ) -> None:
        """
        Reflect on an operation and extract learnings to store

        Args:
            situation: What was the problem/situation
            action_taken: What action was taken
            outcome: What was the result
            success: Whether it succeeded
        """
        if not self.context_db:
            return

        # Only learn from successful operations for now
        if not success:
            return

        # Build reflection prompt
        prompt = f"""Based on this successful operation, extract key learnings to remember for the future.

SITUATION:
{situation}

ACTION TAKEN:
{action_taken}

OUTCOME:
{outcome}

Extract 1-2 specific, actionable learnings. For each learning provide:
1. topic: A concise topic name (e.g., "systemd service restart", "disk cleanup procedure")
2. knowledge: The specific insight or pattern (what worked, why, important details)
3. category: One of: command, pattern, troubleshooting, performance

Respond ONLY with valid JSON:
[
  {{
    "topic": "...",
    "knowledge": "...",
    "category": "...",
    "confidence": "medium"
  }}
]
"""

        try:
            response = self._query_ollama(prompt, temperature=0.3, timeout=30)
            learnings = json.loads(response)

            if isinstance(learnings, list):
                for learning in learnings:
                    if all(k in learning for k in ['topic', 'knowledge', 'category']):
                        self.store_learning(
                            topic=learning['topic'],
                            knowledge=learning['knowledge'],
                            category=learning.get('category', 'experience'),
                            confidence=learning.get('confidence', 'medium')
                        )
        except Exception as e:
            # Reflection is optional - don't fail if it doesn't work
            print(f"Note: Could not extract learnings: {e}")

    def analyze_system_state(self, monitoring_data: Dict[str, Any], system_hostname: Optional[str] = None, git_context=None) -> Dict[str, Any]:
        """Analyze system monitoring data and determine if action is needed"""

        # Build context for the AI
        context = self._build_analysis_context(monitoring_data)

        # Get system infrastructure context if available
        system_context = ""
        if self.context_db and system_hostname:
            system_context = self.context_db.get_system_context(system_hostname, git_context)

        # Ask the AI to analyze
        prompt = self._create_analysis_prompt(context, system_context)

        response = self._query_ollama(prompt)

        # Parse the AI's response
        analysis = self._parse_analysis_response(response)

        # Log the decision
        self._log_decision(monitoring_data, analysis)

        return analysis

    def propose_fix(self, issue_description: str, context: Dict[str, Any]) -> Dict[str, Any]:
        """Propose a fix for a specific issue"""

        # Query relevant config files if we have context_db
        relevant_configs = []
        if self.context_db:
            try:
                # Query for config files relevant to the issue
                configs = self.context_db.query_config_files(
                    query=issue_description,
                    n_results=3
                )
                relevant_configs = configs
            except Exception as e:
                print(f"Warning: Could not query config files: {e}")

        # Build config context section
        config_context = ""
        if relevant_configs:
            config_context = "\n\nRELEVANT CONFIGURATION FILES:\n"
            for cfg in relevant_configs:
                config_context += f"\n--- {cfg['path']} (relevance: {cfg['relevance']:.2%}) ---\n"
                config_context += cfg['content'][:1000]  # First 1000 chars to avoid token limits
                if len(cfg['content']) > 1000:
                    config_context += "\n... (truncated)"
                config_context += "\n"

        # Query relevant knowledge from experience
        knowledge_context = self._query_relevant_knowledge(issue_description, limit=3)

        # Build previous investigations context
        previous_inv_context = ""
        if context.get('previous_investigations'):
            previous_inv_context = "\n\nPREVIOUS INVESTIGATIONS (DO NOT REPEAT THESE):\n"
            for i, inv in enumerate(context['previous_investigations'][:3], 1):  # Show up to 3
                previous_inv_context += f"\nInvestigation #{i} ({inv['timestamp']}):\n"
                previous_inv_context += f"Commands: {', '.join(inv['commands'])}\n"
                previous_inv_context += f"Results:\n{inv['output'][:500]}...\n"  # First 500 chars
            previous_inv_context += "\n⚠️ You have already run these investigations. Do NOT propose them again."
            previous_inv_context += "\n⚠️ Based on the investigation results above, propose an ACTUAL FIX, not more investigation.\n"

        prompt = f"""{self.SYSTEM_PROMPT}

TASK: PROPOSE FIX
================================================================================

ISSUE TO ADDRESS:
{issue_description}

SYSTEM CONTEXT:
{json.dumps(context, indent=2)}{config_context}{knowledge_context}{previous_inv_context}

REPOSITORY INFO:
- Git Repository: {self.config_repo}
- Branch: {self.config_branch}

YOUR RESPONSE MUST BE VALID JSON:
{{
  "diagnosis": "brief description of what you think is wrong",
  "proposed_action": "specific action to take",
  "action_type": "one of: systemd_restart, nix_rebuild, config_change, cleanup, investigation",
  "risk_level": "one of: low, medium, high",
  "commands": ["list", "of", "shell", "commands"],
  "config_changes": {{
    "file": "path/to/config.nix in the repository",
    "change": "description of change needed"
  }},
  "reasoning": "why this fix should work",
  "rollback_plan": "how to undo if it doesn't work"
}}

RESPOND WITH ONLY THE JSON, NO OTHER TEXT.
"""

        response = self._query_ollama(prompt)

        try:
            # Try to extract JSON from response
            # LLMs sometimes add extra text, so we need to find the JSON part
            import re
            json_match = re.search(r'\{.*\}', response, re.DOTALL)
            if json_match:
                return json.loads(json_match.group())
            else:
                return {
                    "diagnosis": "Failed to parse AI response",
                    "proposed_action": "manual investigation required",
                    "action_type": "investigation",
                    "risk_level": "high",
                    "reasoning": "AI response was not in expected format"
                }
        except json.JSONDecodeError:
            return {
                "diagnosis": "Failed to parse AI response",
                "proposed_action": "manual investigation required",
                "action_type": "investigation",
                "risk_level": "high",
                "reasoning": f"Raw response: {response[:500]}"
            }

    def _build_analysis_context(self, data: Dict[str, Any]) -> str:
        """Build a concise context string for the AI"""
        lines = []

        # System resources
        res = data.get("resources", {})
        lines.append(f"CPU: {res.get('cpu_percent', 0):.1f}%, Memory: {res.get('memory_percent', 0):.1f}%, Load: {res.get('load_average', {}).get('1min', 0):.2f}")

        # Disk usage
        disk = data.get("disk", {})
        for part in disk.get("partitions", []):
            if part.get("percent_used", 0) > 80:  # Only mention if >80% full
                lines.append(f"⚠️ Disk {part['mountpoint']}: {part['percent_used']:.1f}% full")

        # Failed services
        systemd = data.get("systemd", {})
        if systemd.get("failed_count", 0) > 0:
            lines.append(f"\n⚠️ {systemd['failed_count']} failed systemd services:")
            for svc in systemd.get("failed_services", [])[:10]:
                lines.append(f"  - {svc.get('unit', 'unknown')}: {svc.get('sub', 'unknown')}")

        # Recent errors
        logs = data.get("logs", {})
        error_count = logs.get("error_count_1h", 0)
        if error_count > 0:
            lines.append(f"\n{error_count} errors in last hour")
            # Group errors by service
            errors_by_service = {}
            for err in logs.get("recent_errors", [])[:20]:
                svc = err.get("SYSLOG_IDENTIFIER", "unknown")
                errors_by_service[svc] = errors_by_service.get(svc, 0) + 1
            for svc, count in sorted(errors_by_service.items(), key=lambda x: x[1], reverse=True)[:5]:
                lines.append(f"  - {svc}: {count} errors")

        # Network
        net = data.get("network", {})
        if not net.get("internet_reachable", True):
            lines.append("\n⚠️ No internet connectivity")

        return "\n".join(lines)

    def _create_analysis_prompt(self, context: str, system_context: str = "") -> str:
        """Create the analysis prompt for the AI"""

        prompt = f"""{self.SYSTEM_PROMPT}

TASK: ANALYZE SYSTEM HEALTH
================================================================================

OBJECTIVE:
Analyze the current system state and determine if any action is needed.
Be thorough but not alarmist. Only recommend action if truly necessary.
"""

        if system_context:
            prompt += f"\n\nSYSTEM INFRASTRUCTURE:\n{system_context}"

        prompt += f"""

CURRENT SYSTEM STATE:
{context}

YOUR RESPONSE MUST BE VALID JSON:
{{
  "status": "one of: healthy, attention_needed, intervention_required",
  "issues": [
    {{
      "severity": "one of: info, warning, critical",
      "category": "one of: resources, services, disk, network, logs",
      "description": "brief description of the issue",
      "requires_action": true/false
    }}
  ],
  "overall_assessment": "brief summary of system health",
  "recommended_actions": ["list of recommended actions, if any"]
}}

RESPOND WITH ONLY THE JSON, NO OTHER TEXT.
"""

        return prompt

    def _auto_diagnose_ollama(self) -> str:
        """Automatically diagnose Ollama issues"""
        diagnostics = []

        diagnostics.append("=== OLLAMA SELF-DIAGNOSTIC ===")

        # Check if Ollama service is running
        try:
            result = subprocess.run(
                ['systemctl', 'is-active', 'ollama.service'],
                capture_output=True,
                text=True,
                timeout=5
            )
            if result.returncode == 0:
                diagnostics.append("✅ Ollama service is active")
            else:
                diagnostics.append(f"❌ Ollama service is NOT active: {result.stdout.strip()}")
        except Exception as e:
            diagnostics.append(f"⚠️ Could not check service status: {e}")

        # Check memory usage
        try:
            result = subprocess.run(['free', '-h'], capture_output=True, text=True, timeout=5)
            diagnostics.append(f"\nMemory:\n{result.stdout}")
        except Exception as e:
            diagnostics.append(f"⚠️ Could not check memory: {e}")

        # Check which models are loaded
        try:
            response = requests.get(f"{self.ollama_host}/api/tags", timeout=5)
            if response.status_code == 200:
                models = response.json().get('models', [])
                diagnostics.append(f"\nLoaded models: {len(models)}")
                for model in models:
                    name = model.get('name', 'unknown')
                    size = model.get('size', 0) / (1024**3)
                    is_target = "← TARGET" if name == self.model else ""
                    diagnostics.append(f"  • {name} ({size:.1f} GB) {is_target}")

                # Check if target model is loaded
                model_names = [m.get('name') for m in models]
                if self.model not in model_names:
                    diagnostics.append(f"\n❌ TARGET MODEL NOT LOADED: {self.model}")
                    diagnostics.append(f"   Available: {', '.join(model_names)}")
            else:
                diagnostics.append(f"❌ Ollama API returned {response.status_code}")
        except Exception as e:
            diagnostics.append(f"⚠️ Could not query Ollama API: {e}")

        # Check recent Ollama logs
        try:
            result = subprocess.run(
                ['journalctl', '-u', 'ollama.service', '-n', '20', '--no-pager'],
                capture_output=True,
                text=True,
                timeout=5
            )
            if result.stdout:
                diagnostics.append(f"\nRecent logs:\n{result.stdout}")
        except Exception as e:
            diagnostics.append(f"⚠️ Could not check logs: {e}")

        return "\n".join(diagnostics)

    def _query_ollama(self, prompt: str, temperature: float = 0.3, timeout: int = 120) -> str:
        """Query Ollama API (with optional queue). timeout is the per-request timeout in seconds."""
        # If queue is enabled, submit to queue and wait
        if self.use_queue and self.ollama_queue:
            try:
                payload = {
                    "model": self.model,
                    "prompt": prompt,
                    "stream": False,
                    "temperature": temperature,
                    "timeout": timeout
                }

                request_id = self.ollama_queue.submit(
                    request_type="generate",
                    payload=payload,
                    priority=self.priority_level
                )

                result = self.ollama_queue.wait_for_result(request_id, timeout=300)
                return result.get("response", "")

            except Exception as e:
                print(f"Warning: Queue request failed, falling back to direct: {e}")
                # Fall through to direct query

        # Direct query (no queue or queue failed)
        try:
            response = requests.post(
                f"{self.ollama_host}/api/generate",
                json={
                    "model": self.model,
                    "prompt": prompt,
                    "stream": False,
                    "temperature": temperature,
                },
                timeout=timeout  # default 2 minute timeout for large models
            )
            response.raise_for_status()
            return response.json().get("response", "")
        except requests.exceptions.HTTPError as e:
            error_detail = ""
            try:
                error_detail = f" - {response.text}"
            except Exception:
                pass
            print(f"ERROR: Ollama HTTP error {response.status_code}{error_detail}")
            print(f"Model requested: {self.model}")
            print(f"Ollama host: {self.ollama_host}")
            # Run diagnostics
            diagnostics = self._auto_diagnose_ollama()
            print(diagnostics)
            return json.dumps({
                "error": f"Ollama HTTP {response.status_code}: {str(e)}{error_detail}",
                "diagnosis": f"Ollama API error - check if model '{self.model}' is available",
                "action_type": "investigation",
                "risk_level": "high"
            })
        except Exception as e:
            print(f"ERROR: Failed to query Ollama: {str(e)}")
            print(f"Model requested: {self.model}")
            print(f"Ollama host: {self.ollama_host}")
            # Run diagnostics
            diagnostics = self._auto_diagnose_ollama()
            print(diagnostics)
            return json.dumps({
                "error": f"Failed to query Ollama: {str(e)}",
                "diagnosis": "Ollama API unavailable",
                "action_type": "investigation",
                "risk_level": "high"
            })

    def _estimate_tokens(self, text: str) -> int:
        """Rough token estimation: ~4 chars per token"""
        return len(text) // 4
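
    # Worked example of the heuristic above (illustrative, not exact tokenization):
    # an 8,000-char tool output estimates to ~2,000 tokens, and the 80,000-token
    # budget used by _prune_messages corresponds to roughly 320,000 chars.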

    def _extract_key_findings(self, tool_name: str, raw_output: str, progress_callback=None) -> str:
        """
        Extract key findings from large tool output using chunked map-reduce.
        Processes large outputs in smaller chunks to prevent Ollama overload.
        """
        output_size = len(raw_output)
        chunk_size = 8000  # ~2000 tokens per chunk, safe size

        # Store full output in cache for potential deep dive
        cache_id = f"{tool_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        try:
            cache_file = self.cache_dir / f"{cache_id}.txt"
            cache_file.write_text(raw_output)
        except (PermissionError, OSError):
            # Fallback to temp directory if cache dir not writable
            import tempfile
            cache_file = Path(tempfile.gettempdir()) / f"macha_{cache_id}.txt"
            cache_file.write_text(raw_output)

        # If output is small enough, process in one go
        if output_size <= chunk_size:
            try:
                extraction_prompt = f"""Analyze this output from '{tool_name}'.

Extract: key findings, errors/warnings, metrics, actionable insights.

Output:
{raw_output}

Provide concise summary (max 600 chars)."""

                summary = self._query_ollama(extraction_prompt, temperature=0.1)
                return f"[Summary of {tool_name}]:\n{summary}\n\n[Full output: {output_size:,} chars cached as {cache_id}]"
            except Exception as e:
                print(f"Warning: Failed to extract findings: {e}")
                return self._simple_truncate(raw_output, 2000)

        # Large output: chunk and process with map-reduce
        try:
            chunks = []
            num_chunks = (output_size + chunk_size - 1) // chunk_size

            for i in range(0, output_size, chunk_size):
                chunk = raw_output[i:i+chunk_size]
                chunks.append(chunk)

            # Phase 1: Map - Summarize each chunk
            chunk_summaries = []
            for idx, chunk in enumerate(chunks):
                chunk_num = idx + 1

                # Progress feedback
                if progress_callback:
                    progress_callback(f"  Processing chunk {chunk_num}/{num_chunks}...")
                else:
                    print(f"  → Processing chunk {chunk_num}/{num_chunks}...", flush=True)

                extraction_prompt = f"""Analyze chunk {chunk_num}/{num_chunks} from '{tool_name}'.

Extract: key findings, errors/warnings, metrics, insights.

Chunk:
{chunk}

Concise summary (max 400 chars)."""

                chunk_summary = self._query_ollama(extraction_prompt, temperature=0.1)
                chunk_summaries.append(f"[Chunk {chunk_num}]: {chunk_summary}")

            # Phase 2: Reduce - Combine chunk summaries if many chunks
            if len(chunk_summaries) > 5:
                if progress_callback:
                    progress_callback(f"  Synthesizing {len(chunk_summaries)} chunk summaries...")
                else:
                    print(f"  → Synthesizing {len(chunk_summaries)} chunk summaries...", flush=True)

                combined = "\n".join(chunk_summaries)
                reduce_prompt = f"""Synthesize these chunk summaries from '{tool_name}':

{combined}

Provide unified summary (max 800 chars) covering all key points."""

                final_summary = self._query_ollama(reduce_prompt, temperature=0.1)
                return f"""[Chunked analysis of {tool_name}]:
{final_summary}

[Processed {num_chunks} chunks, {output_size:,} chars total, cached as {cache_id}]"""

            else:
                # Few chunks: just concatenate summaries
                combined_summary = "\n".join(chunk_summaries)
                return f"""[Chunked analysis of {tool_name}]:
{combined_summary}

[Processed {num_chunks} chunks, {output_size:,} chars total, cached as {cache_id}]"""

        except Exception as e:
            print(f"Warning: Chunked extraction failed for {tool_name}: {e}")
            return self._simple_truncate(raw_output, 2000)
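
    # Map-reduce flow in brief (illustrative sizes): a 60,000-char output becomes
    # 8 chunks of up to 8,000 chars; each chunk is summarized ("map"), and since
    # there are more than 5 summaries they are synthesized into one ("reduce").
    # The full output stays on disk as <cache_dir>/<tool>_YYYYMMDD_HHMMSS.txt
    # so a deeper look remains possible later.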

    def _simple_truncate(self, text: str, max_chars: int) -> str:
        """Simple head+tail truncation"""
        if len(text) <= max_chars:
            return text

        half = max_chars // 2
        return (
            text[:half] +
            f"\n... [TRUNCATED: {len(text) - max_chars} chars omitted] ...\n" +
            text[-half:]
        )
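
    # Example (illustrative): for a 10,000-char string and max_chars=2000, the
    # result keeps the first 1,000 and last 1,000 chars with a
    # "... [TRUNCATED: 8000 chars omitted] ..." marker in between.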

    def _process_tool_result_hierarchical(self, tool_name: str, result: Any) -> str:
        """
        Intelligently process tool results based on size:
        - Small (< 5KB): Pass through directly
        - Medium (5-20KB): Hierarchical extraction with single-pass summarization
        - Large (> 20KB): Hierarchical extraction with chunked processing
        """
        result_str = json.dumps(result) if not isinstance(result, str) else result
        size = len(result_str)

        # Small outputs: pass through directly
        if size < 5000:
            print(f"  [Tool result: {size} chars, passing through]")
            return result_str

        # Medium and large outputs: hierarchical extraction with chunking
        else:
            print(f"  [Tool result: {size} chars, extracting key findings...]")
            # _extract_key_findings automatically chunks large outputs
            return self._extract_key_findings(tool_name, result_str)
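
    # Size-routing examples (illustrative): a 3 KB status result passes through
    # untouched; a 12 KB service list is summarized by _extract_key_findings in
    # one or two chunks; a 60 KB journal dump is chunked and map-reduced. This
    # matches the commit note above: outputs between 5 KB and 20 KB are no longer
    # blindly truncated.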

    def _prune_messages(self, messages: List[Dict], max_context_tokens: int = 80000) -> List[Dict]:
        """
        Prune message history to stay within context limits.
        Keeps: system prompt + recent conversation window
        """
        if not messages:
            return messages

        # Separate system message from conversation
        system_msg = None
        conversation = []

        for msg in messages:
            if msg["role"] == "system":
                system_msg = msg
            else:
                conversation.append(msg)

        # Calculate current token count
        total_tokens = 0
        if system_msg:
            total_tokens += self._estimate_tokens(system_msg["content"])

        for msg in conversation:
            content = msg.get("content", "")
            total_tokens += self._estimate_tokens(str(content))

        # If under limit, return as-is
        if total_tokens <= max_context_tokens:
            result = []
            if system_msg:
                result.append(system_msg)
            result.extend(conversation)
            print(f"[Context: {total_tokens:,} tokens, {len(conversation)} messages]")
            return result

        # Need to prune - keep sliding window of recent messages
        # Strategy: Keep last 20 messages (10 exchanges) which should be ~40K tokens max
        print(f"[Context pruning: {total_tokens:,} tokens → keeping last 20 messages]")

        pruned_conversation = conversation[-20:]

        result = []
        if system_msg:
            result.append(system_msg)
        result.extend(pruned_conversation)

        # Calculate new token count
        new_tokens = self._estimate_tokens(system_msg["content"]) if system_msg else 0
        for msg in pruned_conversation:
            new_tokens += self._estimate_tokens(str(msg.get("content", "")))

        print(f"[Context after pruning: {new_tokens:,} tokens, {len(pruned_conversation)} messages]")

        return result
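
    # Pruning example (illustrative): a history estimated at 120,000 tokens across
    # 60 messages is cut to the system prompt plus the 20 most recent messages;
    # older tool outputs simply drop out of the window rather than being re-sent.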

    def _query_ollama_with_tools(
        self,
        messages: List[Dict[str, str]],
        temperature: float = 0.3,
        max_iterations: int = 30
    ) -> str:
        """
        Query Ollama using chat API with tool support.
        Handles tool calls and returns final response.

        Args:
            messages: List of chat messages [{"role": "user", "content": "..."}]
            temperature: Generation temperature
            max_iterations: Maximum number of tool-calling iterations (default 30 for complex system operations)

        Returns:
            Final text response from the model
        """
        if not self.enable_tools or not self.tools:
            # Fallback to regular query
            user_content = " ".join([m["content"] for m in messages if m["role"] == "user"])
            return self._query_ollama(user_content, temperature)

        # Add system message if not present
        if not any(m["role"] == "system" for m in messages):
            messages = [{"role": "system", "content": self.SYSTEM_PROMPT}] + messages

        tool_definitions = self.tools.get_tool_definitions()

        for iteration in range(max_iterations):
            try:
                # Prune messages before sending to avoid context overflow
                pruned_messages = self._prune_messages(messages, max_context_tokens=80000)

                # Use queue if enabled
                if self.use_queue and self.ollama_queue:
                    try:
                        payload = {
                            "model": self.model,
                            "messages": pruned_messages,
                            "stream": False,
                            "temperature": temperature,
                            "tools": tool_definitions,
                            "timeout": 120
                        }

                        request_id = self.ollama_queue.submit(
                            request_type="chat_with_tools",
                            payload=payload,
                            priority=self.priority_level
                        )

                        resp_data = self.ollama_queue.wait_for_result(request_id, timeout=300)

                    except Exception as e:
                        print(f"Warning: Queue request failed, falling back to direct: {e}")
                        # Fall through to direct query
                        response = requests.post(
                            f"{self.ollama_host}/api/chat",
                            json={
                                "model": self.model,
                                "messages": pruned_messages,
                                "stream": False,
                                "temperature": temperature,
                                "tools": tool_definitions
                            },
                            timeout=120
                        )
                        response.raise_for_status()
                        resp_data = response.json()
                else:
                    # Direct query (no queue)
                    response = requests.post(
                        f"{self.ollama_host}/api/chat",
                        json={
                            "model": self.model,
                            "messages": pruned_messages,
                            "stream": False,
                            "temperature": temperature,
                            "tools": tool_definitions
                        },
                        timeout=120
                    )
                    response.raise_for_status()
                    resp_data = response.json()

                message = resp_data.get("message", {})

                # Check if model wants to call tools
                tool_calls = message.get("tool_calls", [])

                if not tool_calls:
                    # No tools to call, return the text response
                    return message.get("content", "")

                # Add assistant's message to history
                messages.append(message)

                # Execute each tool call
                for tool_call in tool_calls:
                    function_name = tool_call["function"]["name"]
                    arguments = tool_call["function"]["arguments"]

                    print(f"  → Tool call: {function_name}({arguments})")

                    # Execute the tool
                    tool_result = self.tools.execute_tool(function_name, arguments)

                    # Process result hierarchically based on size
                    processed_result = self._process_tool_result_hierarchical(function_name, tool_result)

                    # Add processed result to messages
                    messages.append({
                        "role": "tool",
                        "content": processed_result
                    })

                # Continue loop to let model process tool results

            except requests.exceptions.HTTPError as e:
                error_body = ""
                try:
                    error_body = response.text
                except Exception:
                    pass

                # Check if this is a context length error
                if "context length" in error_body.lower() or "too long" in error_body.lower():
                    print("ERROR: Context length exceeded. Attempting recovery...")
                    # Emergency pruning - keep only system + last user message
                    system_msg = next((m for m in messages if m["role"] == "system"), None)
                    last_user_msg = next((m for m in reversed(messages) if m["role"] == "user"), None)

                    if system_msg and last_user_msg:
                        messages = [system_msg, last_user_msg]
                        print("[Emergency context reset: keeping only system + last user message]")
                        continue  # Retry with minimal context

                print(f"ERROR: Ollama chat API error: {e}")
                diagnostics = self._auto_diagnose_ollama()
                print(diagnostics)
                return json.dumps({
                    "error": f"Ollama chat API error: {str(e)}",
                    "diagnosis": "Failed to communicate with Ollama",
                    "action_type": "investigation",
                    "risk_level": "high"
                })
            except Exception as e:
                print(f"ERROR: Tool calling failed: {e}")
                return json.dumps({
                    "error": f"Tool calling error: {str(e)}",
                    "diagnosis": "Failed during tool execution",
                    "action_type": "investigation",
                    "risk_level": "high"
                })

        # If we hit max iterations, return what we have
        return "Maximum tool calling iterations reached. Unable to complete request."

    def _parse_analysis_response(self, response: str) -> Dict[str, Any]:
        """Parse the AI's analysis response"""
        import re

        # Log the raw response for debugging
        self._log(f"AI raw response (first 1000 chars): {response[:1000]}")

        try:
            json_match = re.search(r'\{.*\}', response, re.DOTALL)
            if json_match:
                parsed = json.loads(json_match.group())
                self._log(f"Successfully parsed AI response: {parsed.get('status', 'unknown')}")
                return parsed
            else:
                self._log("ERROR: No JSON found in AI response")
        except Exception as e:
            self._log(f"ERROR parsing AI response: {e}")

        # Fallback
        self._log("Falling back to default response")
        return {
            "status": "healthy",
            "issues": [],
            "overall_assessment": "Unable to parse AI response",
            "recommended_actions": []
        }

    def _log(self, message: str):
        """Log a message to the orchestrator log"""
        # This will go to the orchestrator log via print
        print(f"[AGENT] {message}")

    def _log_decision(self, monitoring_data: Dict[str, Any], analysis: Dict[str, Any]):
        """Log AI decisions for auditing"""
        log_entry = {
            "timestamp": datetime.now().isoformat(),
            "monitoring_summary": {
                "cpu": monitoring_data.get("resources", {}).get("cpu_percent"),
                "memory": monitoring_data.get("resources", {}).get("memory_percent"),
                "failed_services": monitoring_data.get("systemd", {}).get("failed_count"),
                "error_count": monitoring_data.get("logs", {}).get("error_count_1h"),
            },
            "analysis": analysis,
        }

        with open(self.decision_log, 'a') as f:
            f.write(json.dumps(log_entry) + '\n')
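
    # Each appended line is one JSON object, e.g. (illustrative values):
    #   {"timestamp": "2025-01-01T12:00:00", "monitoring_summary": {"cpu": 12.3,
    #    "memory": 41.0, "failed_services": 0, "error_count": 2}, "analysis": {...}}
    # JSONL keeps the audit log append-only and easy to tail or grep.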

    def get_recent_decisions(self, count: int = 10) -> List[Dict[str, Any]]:
        """Get recent decision history"""
        if not self.decision_log.exists():
            return []

        decisions = []
        with open(self.decision_log, 'r') as f:
            for line in f:
                if line.strip():
                    try:
                        decisions.append(json.loads(line))
                    except json.JSONDecodeError:
                        # Skip malformed lines rather than failing the whole read
                        pass

        return decisions[-count:]


if __name__ == "__main__":
    import sys

    # Test the agent
    agent = MachaAgent()

    if len(sys.argv) > 1 and sys.argv[1] == "test":
        # Test with sample data
        test_data = {
            "systemd": {"failed_count": 2, "failed_services": [
                {"unit": "test-service.service", "sub": "failed"}
            ]},
            "resources": {"cpu_percent": 25.0, "memory_percent": 45.0, "load_average": {"1min": 1.5}},
            "logs": {"error_count_1h": 10},
            "network": {"internet_reachable": True}
        }

        print("Testing agent analysis...")
        analysis = agent.analyze_system_state(test_data)
        print(json.dumps(analysis, indent=2))

        if analysis.get("issues"):
            print("\nTesting fix proposal...")
            fix = agent.propose_fix(
                analysis["issues"][0]["description"],
                test_data
            )
            print(json.dumps(fix, indent=2))
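
# Manual smoke test (assumes a local Ollama instance is reachable at the default
# host and that the configured model has been pulled):
#   python3 agent.py test
# The filename "agent.py" is an assumption; substitute this module's actual filename.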