Initial commit: Split Macha autonomous system into separate flake

Macha is now a standalone NixOS flake that can be imported into other systems.

This provides:
- Independent versioning
- Easier reusability
- Cleaner separation of concerns
- Better development workflow

Includes:
- Complete autonomous system code
- NixOS module with full configuration options
- Queue-based architecture with priority system
- Chunked map-reduce for large outputs
- ChromaDB knowledge base
- Tool calling system
- Multi-host SSH management
- Gotify notification integration

All capabilities from DESIGN.md are preserved.
2025-10-06 14:32:37 -06:00
commit 22ba493d9e
30 changed files with 10306 additions and 0 deletions
--- a/executor.py
+++ b/executor.py
@@ -0,0 +1,537 @@
+#!/usr/bin/env python3
+"""
+Action Executor - Safely executes proposed fixes with rollback capability
+"""
+
+import json
+import subprocess
+import shutil
+from typing import Dict, List, Any, Optional
+from pathlib import Path
+from datetime import datetime
+import time
+
+
+class SafeExecutor:
+    """Executes system maintenance actions with safety checks.
+
+    Each proposed action is gated by ``autonomy_level``; actions that are not
+    auto-executed in "suggest" mode are written to a JSON approval queue in
+    ``state_dir`` and can later be approved or rejected by index.
+    """
+    
+    # Actions that are considered safe to auto-execute
+    # (consulted by _should_execute when autonomy_level is "auto-safe")
+    SAFE_ACTIONS = {
+        "systemd_restart",  # Restart failed services
+        "cleanup",  # Disk cleanup, log rotation
+        "investigation",  # Read-only diagnostics
+    }
+    
+    # Services that should NEVER be stopped/disabled
+    # (_restart_services refuses any unit whose name contains one of these)
+    PROTECTED_SERVICES = {
+        "sshd",
+        "systemd-networkd",
+        "NetworkManager",
+        "systemd-resolved",
+        "dbus",
+    }
+    
+    def __init__(
+        self,
+        state_dir: Path = Path("/var/lib/macha"),
+        autonomy_level: str = "suggest",  # observe, suggest, auto-safe, auto-full
+        dry_run: bool = False,
+        agent = None  # Optional agent for learning from actions
+    ):
+        """Configure the executor and make sure its state directory exists.
+
+        Args:
+            state_dir: Directory holding the action log and approval queue;
+                created (with parents) if it is missing.
+            autonomy_level: One of "observe", "suggest", "auto-safe" or
+                "auto-full"; decides which actions run without approval.
+            dry_run: When true, actions that pass the autonomy gate are only
+                simulated.
+            agent: Optional object whose ``reflect_and_learn`` method is
+                called after a successful action.
+        """
+        # On-disk state lives under a single directory.
+        self.state_dir = state_dir
+        state_dir.mkdir(parents=True, exist_ok=True)
+
+        # Behavioural switches.
+        self.autonomy_level = autonomy_level
+        self.dry_run = dry_run
+        self.agent = agent
+
+        # Files derived from the state directory.
+        self.approval_queue = state_dir / "approval_queue.json"
+        self.action_log = state_dir / "actions.jsonl"
+        
+    def execute_action(self, action: Dict[str, Any], monitoring_context: Dict[str, Any]) -> Dict[str, Any]:
+        """Pass a proposed action through the autonomy gate and act on the verdict.
+
+        Args:
+            action: Proposed action; ``action_type`` defaults to "unknown" and
+                ``risk_level`` to "high" when absent.
+            monitoring_context: Context stored with the action if it is queued,
+                or handed to the executor if it runs.
+
+        Returns:
+            The execution result, a dry-run report, or a dict with
+            ``executed=False`` and a ``status`` of "queued_for_approval"
+            (only in "suggest" mode) or "blocked" (every other mode).
+        """
+        allowed, why = self._should_execute(
+            action.get("action_type", "unknown"),
+            action.get("risk_level", "high"),
+        )
+
+        if allowed:
+            # Dry-run mode short-circuits the real execution path.
+            if self.dry_run:
+                return self._dry_run_action(action)
+            return self._execute_action_impl(action, monitoring_context)
+
+        # Refused actions are only queued in "suggest" mode.
+        if self.autonomy_level != "suggest":
+            return {
+                "executed": False,
+                "status": "blocked",
+                "reason": why
+            }
+
+        self._queue_for_approval(action, monitoring_context)
+        return {
+            "executed": False,
+            "status": "queued_for_approval",
+            "reason": why,
+            "queue_file": str(self.approval_queue)
+        }
+    
+    def _should_execute(self, action_type: str, risk_level: str) -> tuple[bool, str]:
+        """Decide whether an action may run without approval.
+
+        Returns:
+            ``(allowed, reason)`` where ``reason`` is a human-readable
+            explanation of the verdict.
+        """
+        level = self.autonomy_level
+
+        # Observe-only mode refuses everything, including investigations.
+        if level == "observe":
+            return False, "Autonomy level set to observe-only"
+
+        # Low-risk investigations are allowed at every remaining level.
+        if action_type == "investigation" and risk_level == "low":
+            return True, "Auto-approved: Low-risk information gathering"
+
+        if level == "suggest":
+            return False, "Autonomy level requires manual approval"
+
+        if level == "auto-safe":
+            is_safe = action_type in self.SAFE_ACTIONS and risk_level == "low"
+            if not is_safe:
+                return False, "Action requires higher autonomy level"
+            return True, "Auto-executing safe action"
+
+        if level == "auto-full":
+            if risk_level != "high":
+                return True, "Auto-executing approved action"
+            return False, "High risk actions always require approval"
+
+        return False, "Unknown autonomy level"
+    
+    def _execute_action_impl(self, action: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
+        """Run the handler for ``action["action_type"]`` and record the outcome.
+
+        The handler's return dict is merged into a result record that is
+        appended to the action log. Any exception raised by the handler is
+        captured in ``error`` rather than propagated. After a successful run
+        the optional agent is asked to learn from it; a failure there is only
+        printed.
+
+        Returns:
+            Dict with ``executed``, ``timestamp``, ``action``, ``success``,
+            ``output`` and ``error`` (plus anything the handler added).
+        """
+        kind = action.get("action_type")
+        outcome = {
+            "executed": True,
+            "timestamp": datetime.now().isoformat(),
+            "action": action,
+            "success": False,
+            "output": "",
+            "error": None
+        }
+
+        # Ordered (type, handler) pairs; matched with == like an if/elif chain.
+        handlers = (
+            ("systemd_restart", self._restart_services),
+            ("cleanup", self._perform_cleanup),
+            ("nix_rebuild", self._nix_rebuild),
+            ("config_change", self._apply_config_change),
+            ("investigation", self._run_investigation),
+        )
+        handler = next((fn for name, fn in handlers if kind == name), None)
+
+        try:
+            if handler is None:
+                outcome["error"] = f"Unknown action type: {kind}"
+            else:
+                outcome.update(handler(action))
+        except Exception as e:
+            outcome["error"] = str(e)
+            outcome["success"] = False
+
+        # Every attempt is logged, successful or not.
+        self._log_action(outcome)
+
+        # Only successful operations are fed back to the agent.
+        if outcome.get("success") and self.agent:
+            try:
+                self.agent.reflect_and_learn(
+                    situation=action.get("diagnosis", "Unknown situation"),
+                    action_taken=action.get("proposed_action", "Unknown action"),
+                    outcome=outcome.get("output", ""),
+                    success=True
+                )
+            except Exception as e:
+                # A learning failure must not turn a successful action into an error.
+                print(f"Note: Could not record learning: {e}")
+
+        return outcome
+    
+    def _restart_services(self, action: Dict[str, Any]) -> Dict[str, Any]:
+        """Restart the systemd units named by ``systemctl restart ...`` commands.
+
+        Only entries of ``action["commands"]`` that start with
+        ``"systemctl restart "`` are considered; everything else is ignored.
+        Every unit listed after ``restart`` is handled (tokens starting with
+        ``-`` are treated as options and are not forwarded). Units whose name
+        contains an entry of ``PROTECTED_SERVICES`` are refused.
+
+        Returns:
+            ``{"success": bool, "output": str}``. ``success`` is True only when
+            at least one unit was actually restarted and no attempted restart
+            failed or timed out. Blocked units are reported in ``output`` but
+            never count as a successful restart.
+        """
+        output_lines = []
+        restarted = 0
+        failed = 0
+
+        for cmd in action.get("commands", []):
+            if not cmd.startswith("systemctl restart "):
+                continue
+
+            # Everything after "systemctl restart" that is not an option flag.
+            # An empty list (e.g. a trailing-space-only command) is a no-op.
+            units = [tok for tok in cmd.split()[2:] if not tok.startswith("-")]
+
+            for service in units:
+                # Safety check: substring match, so "sshd.service" is covered too.
+                if any(protected in service for protected in self.PROTECTED_SERVICES):
+                    output_lines.append(f"BLOCKED: {service} is protected")
+                    continue
+
+                try:
+                    result = subprocess.run(
+                        ["systemctl", "restart", service],
+                        capture_output=True,
+                        text=True,
+                        timeout=30
+                    )
+                except subprocess.TimeoutExpired:
+                    failed += 1
+                    output_lines.append(f"✗ Timeout restarting {service}")
+                    continue
+
+                if result.returncode == 0:
+                    restarted += 1
+                    output_lines.append(f"✓ Restarted {service}")
+                else:
+                    failed += 1
+                    output_lines.append(f"✗ Failed to restart {service}: {result.stderr}")
+
+        return {
+            # Merely producing output (a failure or a block) is not success.
+            "success": restarted > 0 and failed == 0,
+            "output": "\n".join(output_lines)
+        }
+    
+    def _perform_cleanup(self, action: Dict[str, Any]) -> Dict[str, Any]:
+        """Perform system cleanup tasks and report whether they worked.
+
+        The journal is always vacuumed to the last 7 days. The Nix store is
+        garbage-collected first when the action's ``proposed_action`` text
+        mentions "nix" (case-insensitive; a missing or ``None`` value is
+        treated as empty).
+
+        Returns:
+            ``{"success": bool, "output": str}``. ``success`` is True only if
+            every attempted step started and exited with status 0; a step that
+            cannot be launched, times out, or exits non-zero is reported in
+            ``output`` and marks the cleanup as failed.
+        """
+        output_lines = []
+        all_ok = True
+
+        # (label, argv, timeout in seconds) for each step to attempt, in order.
+        steps = []
+        if "nix" in (action.get("proposed_action") or "").lower():
+            steps.append(("Nix cleanup", ["nix-collect-garbage", "--delete-old"], 300))
+        steps.append(("Journal cleanup", ["journalctl", "--vacuum-time=7d"], 60))
+
+        for label, argv, limit in steps:
+            try:
+                result = subprocess.run(
+                    argv,
+                    capture_output=True,
+                    text=True,
+                    timeout=limit
+                )
+            except (OSError, subprocess.SubprocessError) as e:
+                # Missing binary, permission problem or timeout: keep going
+                # with the remaining steps but remember the failure.
+                all_ok = False
+                output_lines.append(f"{label} failed: {e}")
+                continue
+
+            if result.returncode == 0:
+                output_lines.append(f"{label}: {result.stdout}")
+            else:
+                all_ok = False
+                output_lines.append(
+                    f"{label} failed (exit {result.returncode}): {result.stderr}"
+                )
+
+        return {
+            "success": all_ok,
+            "output": "\n".join(output_lines)
+        }
+    
+    def _nix_rebuild(self, action: Dict[str, Any]) -> Dict[str, Any]:
+        """Rebuild the NixOS configuration: dry build first, then switch.
+
+        HIGH RISK. Both commands run against the ``.#macha`` flake from a
+        fixed checkout directory. The switch is only attempted when the dry
+        build exits with status 0.
+
+        Returns:
+            ``{"success": bool, "output": str}``; once the switch has been
+            attempted the dict also carries ``error`` (stderr on a non-zero
+            exit, the exception text if the command could not run, else None).
+        """
+        flake_args = ["--flake", ".#macha"]
+        checkout = "/home/lily/Documents/nixos-servers"
+        log = []
+
+        # Stage 1: dry build, so an obviously broken config never gets switched to.
+        try:
+            probe = subprocess.run(
+                ["nixos-rebuild", "dry-build", *flake_args],
+                capture_output=True,
+                text=True,
+                timeout=600,
+                cwd=checkout
+            )
+        except Exception as e:
+            return {
+                "success": False,
+                "output": f"Dry build error: {e}"
+            }
+
+        if probe.returncode != 0:
+            return {
+                "success": False,
+                "output": f"Dry build failed:\n{probe.stderr}"
+            }
+
+        log.append("✓ Dry build successful")
+
+        # Stage 2: the real switch.
+        try:
+            switch = subprocess.run(
+                ["nixos-rebuild", "switch", *flake_args],
+                capture_output=True,
+                text=True,
+                timeout=1200,
+                cwd=checkout
+            )
+        except Exception as e:
+            return {
+                "success": False,
+                "output": "\n".join(log),
+                "error": str(e)
+            }
+
+        log.append(switch.stdout)
+        switched = switch.returncode == 0
+
+        return {
+            "success": switched,
+            "output": "\n".join(log),
+            "error": None if switched else switch.stderr
+        }
+    
+    def _apply_config_change(self, action: Dict[str, Any]) -> Dict[str, Any]:
+        """Record a proposed configuration change as a suggestion file.
+
+        Nothing is modified in place: the change description and reasoning are
+        written to ``suggested_patch_<unix-seconds>.txt`` in the state
+        directory for manual review.
+
+        Returns:
+            ``{"success": bool, "output": str}``; ``success`` is False when
+            ``config_changes`` names no target file.
+        """
+        changes = action.get("config_changes", {})
+        target = changes.get("file")
+
+        if not target:
+            return {
+                "success": False,
+                "output": "No file specified in config_changes"
+            }
+
+        # Auto-editing configs is considered too risky, so only a note is saved.
+        stamp = int(time.time())
+        patch_file = self.state_dir / f"suggested_patch_{stamp}.txt"
+
+        with open(patch_file, 'w') as f:
+            f.writelines((
+                f"Suggested change to {target}:\n\n",
+                changes.get("change", "No change description"),
+                f"\n\nReasoning: {action.get('reasoning', 'No reasoning provided')}",
+            ))
+
+        message = f"Config change suggestion saved to {patch_file}\nThis requires manual review and application."
+        return {
+            "success": True,
+            "output": message
+        }
+    
+    def _run_investigation(self, action: Dict[str, Any]) -> Dict[str, Any]:
+        """Run allowlisted diagnostic commands without involving a shell.
+
+        Each entry of ``action["commands"]`` is tokenised with ``shlex`` and
+        executed directly (no shell), so command chaining, pipes, redirection
+        and substitution are never interpreted. A command runs only when its
+        program name is exactly one of journalctl, df, free, ps, netstat or
+        ss, or when it is ``systemctl status ...``. Anything else, including
+        commands that cannot be tokenised or that contain a standalone shell
+        operator token, is reported as blocked.
+
+        Returns:
+            ``{"success": True, "output": str}``; ``output`` holds the echoed
+            commands with their stdout plus any blocked/error notes.
+        """
+        import shlex  # local import: only this method needs it
+
+        # Exact program names, not string prefixes: "ss" must not admit "ssh".
+        safe_programs = {"journalctl", "df", "free", "ps", "netstat", "ss"}
+        # Standalone operators signal an intent that cannot be honoured without a shell.
+        shell_operators = {"|", "||", "&", "&&", ";", ">", ">>", "<"}
+        # NOTE(review): journalctl also has mutating flags (e.g. --vacuum-*,
+        # --rotate); they are not filtered here - confirm whether they should be.
+
+        output_lines = []
+
+        for cmd in action.get("commands", []):
+            try:
+                argv = shlex.split(cmd) if isinstance(cmd, str) else []
+            except ValueError:
+                # Unbalanced quotes and similar: refuse rather than guess.
+                argv = []
+
+            allowed = bool(argv) and (
+                argv[0] in safe_programs
+                or argv[:2] == ["systemctl", "status"]
+            )
+            if not allowed or any(tok in shell_operators for tok in argv):
+                output_lines.append(f"BLOCKED unsafe command: {cmd}")
+                continue
+
+            try:
+                result = subprocess.run(
+                    argv,
+                    capture_output=True,
+                    text=True,
+                    timeout=30
+                )
+                output_lines.append(f"$ {cmd}")
+                output_lines.append(result.stdout)
+            except (OSError, subprocess.SubprocessError) as e:
+                output_lines.append(f"Error running {cmd}: {e}")
+
+        return {
+            "success": True,
+            "output": "\n".join(output_lines)
+        }
+    
+    def _dry_run_action(self, action: Dict[str, Any]) -> Dict[str, Any]:
+        """Report what would have been executed without touching the system.
+
+        Returns:
+            Dict with ``executed=False``, ``status="dry_run"``, the untouched
+            ``action`` and a fixed explanatory ``output`` string.
+        """
+        report: Dict[str, Any] = {"executed": False, "status": "dry_run"}
+        report["action"] = action
+        report["output"] = "Dry run mode - no actual changes made"
+        return report
+    
+    def _queue_for_approval(self, action: Dict[str, Any], context: Dict[str, Any]):
+        """Append an action to the approval queue unless a similar one is pending.
+
+        A pending entry (``approved`` is None) counts as a duplicate when its
+        diagnosis or its proposed action scores above 0.7 against the new
+        action's in ``_similarity_check``. Duplicates are skipped with a
+        printed note and the queue file is left untouched.
+        """
+        if self.approval_queue.exists():
+            with open(self.approval_queue, 'r') as f:
+                entries = json.load(f)
+        else:
+            entries = []
+
+        new_proposal = action.get("proposed_action", "")
+        new_diagnosis = action.get("diagnosis", "")
+
+        def too_similar(fresh: str, queued: str) -> bool:
+            # Empty text on either side never counts as a match.
+            return bool(fresh and queued and self._similarity_check(fresh, queued) > 0.7)
+
+        for entry in entries:
+            # Entries that already carry a decision are not considered.
+            if entry.get("approved") is not None:
+                continue
+
+            queued_action = entry.get("action", {})
+
+            if too_similar(new_diagnosis, queued_action.get("diagnosis", "")):
+                print("Skipping duplicate action - similar diagnosis already queued")
+                return
+
+            if too_similar(new_proposal, queued_action.get("proposed_action", "")):
+                print("Skipping duplicate action - similar proposal already queued")
+                return
+
+        entries.append({
+            "timestamp": datetime.now().isoformat(),
+            "action": action,
+            "context": context,
+            "approved": None
+        })
+
+        with open(self.approval_queue, 'w') as f:
+            json.dump(entries, f, indent=2)
+    
+    def _similarity_check(self, str1: str, str2: str) -> float:
+        """Score how alike two strings are, from 0.0 to 1.0.
+
+        Strings that are equal after lower-casing and stripping score 1.0.
+        Otherwise the score is the Jaccard index of their whitespace-separated
+        word sets with common filler words removed; it is 0.0 when either set
+        ends up empty.
+        """
+        left = str1.lower().strip()
+        right = str2.lower().strip()
+
+        if left == right:
+            return 1.0
+
+        # Filler words that say nothing about whether two texts are related.
+        filler = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had'}
+        left_words = set(left.split()).difference(filler)
+        right_words = set(right.split()).difference(filler)
+
+        if not (left_words and right_words):
+            return 0.0
+
+        # Jaccard index; the union is non-empty because both sets are.
+        shared = left_words.intersection(right_words)
+        combined = left_words.union(right_words)
+        return len(shared) / len(combined)
+    
+    def _log_action(self, result: Dict[str, Any]):
+        """Append one execution result to the action log as a JSON line."""
+        with open(self.action_log, 'a') as log:
+            # print supplies the trailing newline that terminates the record.
+            print(json.dumps(result), file=log)
+    
+    def get_approval_queue(self) -> List[Dict[str, Any]]:
+        """Load the approval queue from disk.
+
+        Returns:
+            The parsed contents of the queue file, or an empty list when the
+            file does not exist.
+        """
+        queue_path = self.approval_queue
+
+        if queue_path.exists():
+            with open(queue_path, 'r') as f:
+                return json.load(f)
+
+        return []
+    
+    def approve_action(self, index: int) -> bool:
+        """Execute the queued action at ``index`` and drop it from the queue.
+
+        The action is executed directly, archived together with its result,
+        and removed from the queue whether or not it succeeded.
+
+        Returns:
+            The execution's ``success`` flag, or False when ``index`` is out
+            of range.
+        """
+        pending = self.get_approval_queue()
+
+        if not 0 <= index < len(pending):
+            return False
+
+        chosen = pending[index]
+
+        # Run it, then keep a permanent record of what happened.
+        outcome = self._execute_action_impl(chosen["action"], chosen["context"])
+        self._archive_action(chosen, outcome)
+
+        # The entry leaves the queue regardless of the outcome.
+        del pending[index]
+        with open(self.approval_queue, 'w') as f:
+            json.dump(pending, f, indent=2)
+
+        return outcome.get("success", False)
+    
+    def _archive_action(self, action_item: Dict[str, Any], result: Dict[str, Any]):
+        """Append an approved queue entry and its execution result to the archive.
+
+        One JSON line is written to ``approved_actions.jsonl`` in the state
+        directory, stamped with the current time and carrying the entry's
+        original timestamp, action and context.
+        """
+        record = {"timestamp": datetime.now().isoformat()}
+        record["original_timestamp"] = action_item.get("timestamp")
+        record["action"] = action_item.get("action")
+        record["context"] = action_item.get("context")
+        record["result"] = result
+
+        destination = self.state_dir / "approved_actions.jsonl"
+        with open(destination, 'a') as f:
+            f.write(f"{json.dumps(record)}\n")
+    
+    def reject_action(self, index: int) -> bool:
+        """Reject a queued action by removing it from the approval queue.
+
+        Args:
+            index: Zero-based position in the queue as returned by
+                ``get_approval_queue``; negative values are out of range.
+
+        Returns:
+            True when an entry was removed and the queue file rewritten,
+            False when ``index`` is out of range (the file is left untouched).
+        """
+        queue = self.get_approval_queue()
+        if not 0 <= index < len(queue):
+            return False
+
+        # The rejected entry is discarded; nothing is archived for rejections.
+        del queue[index]
+
+        with open(self.approval_queue, 'w') as f:
+            json.dump(queue, f, indent=2)
+
+        return True
+
+
+def _print_queue(queue):
+    """Pretty-print every entry of the approval queue to stdout."""
+    rule = "=" * 70
+
+    print("\n" + rule)
+    print(f"PENDING ACTIONS: {len(queue)}")
+    print(rule)
+
+    for i, item in enumerate(queue):
+        action = item.get("action", {})
+        timestamp = item.get("timestamp", "unknown")
+        approved = item.get("approved")
+
+        # None means "no decision yet"; any other falsy value means rejected.
+        if approved:
+            status = "✓ APPROVED"
+        elif approved is None:
+            status = "⏳ PENDING"
+        else:
+            status = "✗ REJECTED"
+
+        print(f"\n[{i}] {status} - {timestamp}")
+        print("-" * 70)
+        print(f"DIAGNOSIS: {action.get('diagnosis', 'N/A')}")
+        print(f"\nPROPOSED ACTION: {action.get('proposed_action', 'N/A')}")
+        print(f"TYPE: {action.get('action_type', 'N/A')}")
+        print(f"RISK: {action.get('risk_level', 'N/A')}")
+
+        if action.get('commands'):
+            print("\nCOMMANDS:")
+            for cmd in action['commands']:
+                print(f"  - {cmd}")
+
+        if action.get('config_changes'):
+            print("\nCONFIG CHANGES:")
+            for key, value in action['config_changes'].items():
+                print(f"  {key}: {value}")
+
+        print(f"\nREASONING: {action.get('reasoning', 'N/A')}")
+
+    print("\n" + rule + "\n")
+
+
+def _main(argv):
+    """Command-line entry point: ``queue``, ``approve <index>``, ``reject <index>``.
+
+    Args:
+        argv: Full argument vector, program name first.
+
+    Returns:
+        Process exit status: 0 when a command was carried out (even if the
+        approved action itself failed), 2 when the arguments are unusable.
+    """
+    import sys
+
+    usage = "Usage: executor.py queue | approve <index> | reject <index>"
+    command = argv[1] if len(argv) > 1 else None
+
+    if command == "queue":
+        queue = SafeExecutor().get_approval_queue()
+        if queue:
+            _print_queue(queue)
+        else:
+            print("No pending actions")
+        return 0
+
+    if command in ("approve", "reject") and len(argv) > 2:
+        # Validate the index before the executor is constructed.
+        try:
+            index = int(argv[2])
+        except ValueError:
+            print(f"Invalid index: {argv[2]!r}", file=sys.stderr)
+            print(usage, file=sys.stderr)
+            return 2
+
+        executor = SafeExecutor()
+        if command == "approve":
+            success = executor.approve_action(index)
+            print(f"Approval {'succeeded' if success else 'failed'}")
+        else:
+            success = executor.reject_action(index)
+            print(f"Action {'rejected and removed from queue' if success else 'rejection failed'}")
+        return 0
+
+    # No command, an unknown command, or a missing index.
+    print(usage, file=sys.stderr)
+    return 2
+
+
+if __name__ == "__main__":
+    import sys
+
+    sys.exit(_main(sys.argv))