Initial commit: Split Macha autonomous system into separate flake

Macha is now a standalone NixOS flake that can be imported into other
systems. This provides:

- Independent versioning
- Easier reusability
- Cleaner separation of concerns
- Better development workflow

Includes:
- Complete autonomous system code
- NixOS module with full configuration options
- Queue-based architecture with priority system
- Chunked map-reduce for large outputs
- ChromaDB knowledge base
- Tool calling system
- Multi-host SSH management
- Gotify notification integration

All capabilities from DESIGN.md are preserved.
This commit is contained in:
Lily Miller
2025-10-06 14:32:37 -06:00
commit 22ba493d9e
30 changed files with 10306 additions and 0 deletions

358
journal_monitor.py Normal file
View File

@@ -0,0 +1,358 @@
#!/usr/bin/env python3
"""
Journal Monitor - Monitor remote systems via centralized journald
"""
import json
import subprocess
from typing import Dict, List, Any, Optional, Set
from datetime import datetime, timedelta
from pathlib import Path
from collections import defaultdict
class JournalMonitor:
    """Monitor systems via centralized journald logs.

    All data is pulled from the local journal (which aggregates remote
    hosts) by shelling out to ``journalctl`` with JSON output and
    filtering on the ``_HOSTNAME`` field.
    """

    def __init__(self, domain: str = "coven.systems"):
        """
        Initialize journal monitor.

        Args:
            domain: Domain suffix appended to bare hostnames to build FQDNs
        """
        self.domain = domain
        # Populated by discover_hosts(); cached set of FQDNs seen in the journal.
        self.known_hosts: Set[str] = set()

    def _run_journalctl(self, args: List[str], timeout: int = 30) -> tuple[bool, str, str]:
        """
        Run a journalctl command.

        Args:
            args: Arguments to journalctl
            timeout: Timeout in seconds

        Returns:
            (success, stdout, stderr) — success is True iff exit code was 0.
            A timeout or spawn failure is reported as (False, "", message)
            rather than raised, so callers can treat it as "no data".
        """
        try:
            cmd = ["journalctl"] + args
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout
            )
            return (
                result.returncode == 0,
                result.stdout.strip(),
                result.stderr.strip()
            )
        except subprocess.TimeoutExpired:
            return False, "", f"Command timed out after {timeout}s"
        except Exception as e:
            return False, "", str(e)

    @staticmethod
    def _parse_json_lines(output: str):
        """Yield parsed entries from ``journalctl --output=json`` text.

        Blank lines and lines that fail to parse as JSON are silently
        skipped — journalctl occasionally emits non-JSON noise and a
        single bad line should not abort a whole collection pass.
        """
        for line in output.split('\n'):
            if not line.strip():
                continue
            try:
                yield json.loads(line)
            except json.JSONDecodeError:
                continue

    def discover_hosts(self) -> List[str]:
        """
        Discover hosts reporting to the centralized journal.

        Scans up to the last 10000 entries from the past day and collects
        distinct ``_HOSTNAME`` values, qualifying bare names with
        ``self.domain``. Also refreshes ``self.known_hosts``.

        Returns:
            Sorted list of discovered FQDNs (empty on journalctl failure)
        """
        success, output, _ = self._run_journalctl([
            "--output=json",
            "--since=1 day ago",
            "-n", "10000"
        ])
        if not success:
            return []

        hosts = set()
        for entry in self._parse_json_lines(output):
            hostname = entry.get('_HOSTNAME', '')
            # Ensure FQDN format: only qualify names with no dot at all;
            # names already carrying some other domain are kept as-is.
            if hostname and not hostname.endswith(f'.{self.domain}'):
                if '.' not in hostname:
                    hostname = f"{hostname}.{self.domain}"
            if hostname:
                hosts.add(hostname)

        self.known_hosts = hosts
        return sorted(hosts)

    def collect_resources(self, hostname: str, since: str = "5 minutes ago") -> Dict[str, Any]:
        """
        Collect resource usage from journal entries.

        This extracts CPU/memory info from systemd service messages.
        """
        # For now, return empty - we'll primarily use this for service/log monitoring
        # Resource metrics could be added if systems log them
        return {
            "cpu_percent": 0,
            "memory_percent": 0,
            "load_average": {"1min": 0, "5min": 0, "15min": 0}
        }

    def collect_systemd_status(self, hostname: str, since: str = "5 minutes ago") -> Dict[str, Any]:
        """
        Collect systemd service status from journal.

        Args:
            hostname: FQDN of the system
            since: Time range to check

        Returns:
            Dictionary with ``failed_count`` and ``failed_services`` (one
            entry per unit, keeping only the first error message seen)
        """
        # Query for err-priority messages attributed to systemd services.
        success, output, _ = self._run_journalctl([
            f"_HOSTNAME={hostname}",
            "--priority=err",
            "--unit=*.service",
            f"--since={since}",
            "--output=json"
        ])
        if not success:
            return {"failed_count": 0, "failed_services": []}

        failed_services = {}
        for entry in self._parse_json_lines(output):
            unit = entry.get('_SYSTEMD_UNIT', '')
            if unit and unit.endswith('.service'):
                service_name = unit.replace('.service', '')
                # Deduplicate per service: first error wins.
                if service_name not in failed_services:
                    failed_services[service_name] = {
                        "unit": unit,
                        "message": entry.get('MESSAGE', ''),
                        "timestamp": entry.get('__REALTIME_TIMESTAMP', '')
                    }

        return {
            "failed_count": len(failed_services),
            "failed_services": list(failed_services.values())
        }

    def collect_log_errors(self, hostname: str, since: str = "1 hour ago") -> Dict[str, Any]:
        """
        Collect error logs from journal.

        Args:
            hostname: FQDN of the system
            since: Time range to check

        Returns:
            Dictionary with the total error count and up to 10 sample errors
        """
        success, output, _ = self._run_journalctl([
            f"_HOSTNAME={hostname}",
            "--priority=err",
            f"--since={since}",
            "--output=json"
        ])
        if not success:
            return {"error_count_1h": 0, "recent_errors": []}

        errors = []
        error_count = 0
        for entry in self._parse_json_lines(output):
            error_count += 1
            if len(errors) < 10:  # Keep last 10 errors
                errors.append({
                    "message": entry.get('MESSAGE', ''),
                    "unit": entry.get('_SYSTEMD_UNIT', 'unknown'),
                    "priority": entry.get('PRIORITY', ''),
                    "timestamp": entry.get('__REALTIME_TIMESTAMP', '')
                })

        return {
            "error_count_1h": error_count,
            "recent_errors": errors
        }

    def collect_disk_usage(self, hostname: str) -> Dict[str, Any]:
        """
        Collect disk usage - Note: This would require systems to log disk metrics.

        For now, returns empty. Could be enhanced if systems periodically log disk usage.
        """
        return {"partitions": []}

    def collect_network_status(self, hostname: str, since: str = "5 minutes ago") -> Dict[str, Any]:
        """
        Check network connectivity based on recent journal activity.

        If we see recent logs from a host, it's reachable. ``last_seen``
        is the local observation time, not the entry's own timestamp.
        """
        success, output, _ = self._run_journalctl([
            f"_HOSTNAME={hostname}",
            f"--since={since}",
            "-n", "1",
            "--output=json"
        ])
        # If we got recent logs, network is working
        internet_reachable = bool(success and output.strip())
        return {
            "internet_reachable": internet_reachable,
            "last_seen": datetime.now().isoformat() if internet_reachable else None
        }

    def collect_all(self, hostname: str) -> Dict[str, Any]:
        """
        Collect all monitoring data for a host from journal.

        Args:
            hostname: FQDN of the system to monitor

        Returns:
            Complete monitoring data, or a minimal "unreachable" record
            when no recent journal entries exist for the host
        """
        # First check if we have recent logs from this host
        net_status = self.collect_network_status(hostname)
        if not net_status.get("internet_reachable"):
            return {
                "hostname": hostname,
                "reachable": False,
                "error": "No recent journal entries from this host"
            }

        return {
            "hostname": hostname,
            "reachable": True,
            "source": "journal",
            "resources": self.collect_resources(hostname),
            "systemd": self.collect_systemd_status(hostname),
            "disk": self.collect_disk_usage(hostname),
            "network": net_status,
            "logs": self.collect_log_errors(hostname),
        }

    def get_summary(self, data: Dict[str, Any]) -> str:
        """Generate human-readable summary from journal data."""
        hostname = data.get("hostname", "unknown")
        if not data.get("reachable", False):
            return f"{hostname}: {data.get('error', 'Unreachable')}"

        lines = [f"System: {hostname} (via journal)"]

        # Services
        systemd = data.get("systemd", {})
        failed_count = systemd.get("failed_count", 0)
        if failed_count > 0:
            lines.append(f"Services: {failed_count} failed")
            for svc in systemd.get("failed_services", [])[:3]:
                lines.append(f" - {svc.get('unit', 'unknown')}")
        else:
            lines.append("Services: No recent failures")

        # Network
        net = data.get("network", {})
        last_seen = net.get("last_seen")
        if last_seen:
            lines.append(f"Last seen: {last_seen}")

        # Logs
        logs = data.get("logs", {})
        error_count = logs.get("error_count_1h", 0)
        if error_count > 0:
            lines.append(f"Recent logs: {error_count} errors in last hour")

        return "\n".join(lines)

    def get_active_services(self, hostname: str, since: str = "1 hour ago") -> List[str]:
        """
        Get list of active services on a host by looking at journal entries.

        This helps with auto-discovery of what's running on each system.
        """
        success, output, _ = self._run_journalctl([
            f"_HOSTNAME={hostname}",
            f"--since={since}",
            "--output=json",
            "-n", "1000"
        ])
        if not success:
            return []

        services = set()
        for entry in self._parse_json_lines(output):
            unit = entry.get('_SYSTEMD_UNIT', '')
            if unit and unit.endswith('.service'):
                # Extract service name
                service = unit.replace('.service', '')
                # Filter out common system services, focus on application services
                if service not in ['systemd-journald', 'systemd-logind', 'sshd', 'dbus']:
                    services.add(service)

        return sorted(services)
if __name__ == "__main__":
    # Ad-hoc smoke test: discover hosts from the local journal and dump
    # a monitoring summary for the first one found.
    # (Removed unused `import sys` — nothing below reads argv or exits.)
    monitor = JournalMonitor()

    # Discover hosts
    print("Discovering hosts from journal...")
    hosts = monitor.discover_hosts()
    print(f"Found {len(hosts)} hosts:")
    for host in hosts:
        print(f" - {host}")

    # Monitor first host if available
    if hosts:
        hostname = hosts[0]
        print(f"\nMonitoring {hostname}...")
        data = monitor.collect_all(hostname)
        print("\n" + "="*60)
        print(monitor.get_summary(data))
        print("="*60)

        # Discover services
        print(f"\nActive services on {hostname}:")
        services = monitor.get_active_services(hostname)
        for svc in services[:10]:
            print(f" - {svc}")