controller

package
v1.6.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 28, 2025 License: Apache-2.0 Imports: 19 Imported by: 0

Documentation

Overview

Package controller provides the Node Doctor Controller for multi-node aggregation, correlation, and coordinated remediation.

Index

Constants

View Source
// Correlation types. These are the string values listed for Correlation.Type.
const (
	CorrelationTypeInfrastructure = "infrastructure" // Same problem across many nodes
	CorrelationTypeCommonCause    = "common-cause"   // Different problems with shared root cause
	CorrelationTypeCascade        = "cascade"        // Sequential problems triggering each other
)

Correlation types

View Source
// Correlation statuses. These are the string values listed for
// Correlation.Status.
const (
	CorrelationStatusActive        = "active"
	CorrelationStatusInvestigating = "investigating"
	CorrelationStatusResolved      = "resolved"
)

Correlation statuses

View Source
// Event reasons for different scenarios, grouped by the kind of event they
// label. Each constant's value is the reason string itself.
const (
	// Cluster health events
	EventReasonClusterDegraded   = "ClusterDegraded"
	EventReasonClusterCritical   = "ClusterCritical"
	EventReasonClusterRecovered  = "ClusterRecovered"
	EventReasonNodeHealthChanged = "NodeHealthChanged"

	// Problem events
	EventReasonProblemDetected    = "ProblemDetected"
	EventReasonProblemResolved    = "ProblemResolved"
	EventReasonClusterWideProblem = "ClusterWideProblem"

	// Correlation events
	EventReasonCorrelationDetected = "CorrelationDetected"
	EventReasonCorrelationResolved = "CorrelationResolved"

	// Remediation events
	//
	// Note that the three lease reasons carry a "Remediation" prefix in their
	// string values that the identifiers do not have (EventReasonLeaseGranted
	// is "RemediationLeaseGranted", and so on).
	EventReasonLeaseGranted         = "RemediationLeaseGranted"
	EventReasonLeaseDenied          = "RemediationLeaseDenied"
	EventReasonLeaseExpired         = "RemediationLeaseExpired"
	EventReasonRemediationStarted   = "RemediationStarted"
	EventReasonRemediationCompleted = "RemediationCompleted"
	EventReasonRemediationFailed    = "RemediationFailed"
)

Event reasons for different scenarios

View Source
// Event types. Each constant takes its value from the corev1 constant of the
// same name.
// NOTE(review): corev1 is presumably k8s.io/api/core/v1 — confirm against the
// file's imports.
const (
	EventTypeNormal  = corev1.EventTypeNormal
	EventTypeWarning = corev1.EventTypeWarning
)

Event types

View Source
// API version.
const (
	// APIVersion is the API version string.
	// NOTE(review): where it is consumed (URL prefix, payload field, ...) is
	// not visible here.
	APIVersion = "v1"
)

API Version

Variables

This section is empty.

Functions

This section is empty.

Types

type APIError

// APIError represents an API error. It is the element type of
// APIResponse.Error.
type APIError struct {
	Code    string `json:"code"`
	Message string `json:"message"`
	// Details is left out of the JSON encoding when it is the empty string.
	Details string `json:"details,omitempty"`
}

APIError represents an API error

type APIResponse

// APIResponse is a generic wrapper for API responses.
//
// Data and Error are both tagged omitempty, so each is left out of the JSON
// encoding when nil; Success and Timestamp are always encoded.
type APIResponse struct {
	Success   bool        `json:"success"`
	Data      interface{} `json:"data,omitempty"`
	Error     *APIError   `json:"error,omitempty"`
	Timestamp time.Time   `json:"timestamp"`
}

APIResponse is a generic wrapper for API responses

type ClusterProblem

// ClusterProblem represents a problem affecting the cluster.
type ClusterProblem struct {
	ID            string    `json:"id"`
	Type          string    `json:"type"`
	Severity      string    `json:"severity"`
	AffectedNodes []string  `json:"affectedNodes"`
	Message       string    `json:"message"`
	DetectedAt    time.Time `json:"detectedAt"`
	IsCorrelated  bool      `json:"isCorrelated"`
	// CorrelationID is left out of the JSON encoding when empty.
	// NOTE(review): presumably the ID of the matching Correlation when
	// IsCorrelated is true — confirm against the code that populates it.
	CorrelationID string    `json:"correlationId,omitempty"`
}

ClusterProblem represents a problem affecting the cluster

type ClusterStatus

// ClusterStatus represents the overall cluster health status.
//
// The node counters are split by the four HealthStatus values
// (healthy, degraded, critical, unknown).
type ClusterStatus struct {
	Timestamp      time.Time        `json:"timestamp"`
	OverallHealth  HealthStatus     `json:"overallHealth"`
	TotalNodes     int              `json:"totalNodes"`
	HealthyNodes   int              `json:"healthyNodes"`
	DegradedNodes  int              `json:"degradedNodes"`
	CriticalNodes  int              `json:"criticalNodes"`
	UnknownNodes   int              `json:"unknownNodes"`
	ActiveProblems int              `json:"activeProblems"`
	// Correlations is encoded under the JSON key "activeCorrelations", which
	// differs from the Go field name.
	Correlations   int              `json:"activeCorrelations"`
	// NodeSummaries and RecentProblems are left out of the JSON encoding
	// when empty.
	NodeSummaries  []NodeSummary    `json:"nodeSummaries,omitempty"`
	RecentProblems []ClusterProblem `json:"recentProblems,omitempty"`
}

ClusterStatus represents the overall cluster health status

type ControllerConfig

// ControllerConfig holds the controller configuration.
//
// Every section carries matching json and yaml tags, so the same lower-case
// key names apply to both encodings. DefaultControllerConfig returns an
// instance populated with defaults.
type ControllerConfig struct {
	// Server settings
	Server ServerConfig `json:"server" yaml:"server"`

	// Storage settings
	Storage StorageConfig `json:"storage" yaml:"storage"`

	// Correlation settings
	Correlation CorrelationConfig `json:"correlation" yaml:"correlation"`

	// Coordination settings
	Coordination CoordinationConfig `json:"coordination" yaml:"coordination"`

	// Prometheus settings
	Prometheus PrometheusConfig `json:"prometheus" yaml:"prometheus"`

	// Kubernetes settings
	Kubernetes KubernetesConfig `json:"kubernetes" yaml:"kubernetes"`
}

ControllerConfig holds the controller configuration

func DefaultControllerConfig

func DefaultControllerConfig() *ControllerConfig

DefaultControllerConfig returns a configuration with sensible defaults

type ControllerMetrics

// ControllerMetrics contains all Prometheus metrics for the controller.
//
// NewControllerMetrics creates and registers the metrics. The type's
// Record* and Update* methods are the documented way to modify them.
// NOTE(review): the label names of the *Vec metrics are not visible here;
// check NewControllerMetrics before using the vectors directly.
type ControllerMetrics struct {
	// Cluster-level metrics
	NodesTotal     prometheus.Gauge
	NodesHealthy   prometheus.Gauge
	NodesDegraded  prometheus.Gauge
	NodesCritical  prometheus.Gauge
	NodesUnknown   prometheus.Gauge
	ActiveProblems prometheus.Gauge

	// Problem aggregation
	ProblemNodes  *prometheus.GaugeVec
	ProblemActive *prometheus.GaugeVec

	// Correlation metrics
	CorrelationActive   prometheus.Gauge
	CorrelationDetected *prometheus.CounterVec

	// Remediation metrics
	LeasesActive  prometheus.Gauge
	LeasesGranted *prometheus.CounterVec
	LeasesDenied  *prometheus.CounterVec

	// Report ingestion metrics
	ReportsReceived prometheus.Counter
	ReportErrors    *prometheus.CounterVec

	// Storage metrics
	StorageOperations *prometheus.CounterVec
	StorageErrors     *prometheus.CounterVec

	// Server metrics
	RequestDuration *prometheus.HistogramVec
	RequestsTotal   *prometheus.CounterVec
	// contains filtered or unexported fields
}

ControllerMetrics contains all Prometheus metrics for the controller

func NewControllerMetrics

func NewControllerMetrics() *ControllerMetrics

NewControllerMetrics creates and registers all controller metrics

func (*ControllerMetrics) Handler

func (m *ControllerMetrics) Handler() http.Handler

Handler returns an http.Handler for the /metrics endpoint

func (*ControllerMetrics) RecordCorrelationDetected

func (m *ControllerMetrics) RecordCorrelationDetected(correlationType string)

RecordCorrelationDetected increments the correlation detected counter

func (*ControllerMetrics) RecordLeaseDenied

func (m *ControllerMetrics) RecordLeaseDenied(reason string)

RecordLeaseDenied increments the lease denied counter

func (*ControllerMetrics) RecordLeaseGranted

func (m *ControllerMetrics) RecordLeaseGranted(remediationType string)

RecordLeaseGranted increments the lease granted counter

func (*ControllerMetrics) RecordReportError

func (m *ControllerMetrics) RecordReportError(errorType string)

RecordReportError increments the report error counter

func (*ControllerMetrics) RecordReportReceived

func (m *ControllerMetrics) RecordReportReceived()

RecordReportReceived increments the report received counter

func (*ControllerMetrics) RecordRequest

func (m *ControllerMetrics) RecordRequest(method, path, status string, duration float64)

RecordRequest records an HTTP request

func (*ControllerMetrics) RecordStorageError

func (m *ControllerMetrics) RecordStorageError(operation string)

RecordStorageError increments the storage error counter

func (*ControllerMetrics) RecordStorageOperation

func (m *ControllerMetrics) RecordStorageOperation(operation string)

RecordStorageOperation increments the storage operation counter

func (*ControllerMetrics) UpdateClusterMetrics

func (m *ControllerMetrics) UpdateClusterMetrics(status *ClusterStatus)

UpdateClusterMetrics updates all cluster-level metrics from current state

func (*ControllerMetrics) UpdateCorrelationMetrics

func (m *ControllerMetrics) UpdateCorrelationMetrics(activeCount int)

UpdateCorrelationMetrics updates correlation-related metrics

func (*ControllerMetrics) UpdateLeaseMetrics

func (m *ControllerMetrics) UpdateLeaseMetrics(activeCount int)

UpdateLeaseMetrics updates lease-related metrics

func (*ControllerMetrics) UpdateProblemMetrics

func (m *ControllerMetrics) UpdateProblemMetrics(problemCounts map[string]map[string]int)

UpdateProblemMetrics updates problem-related metrics

type CoordinationConfig

// CoordinationConfig contains remediation coordination settings.
//
// NOTE(review): DefaultLeaseDuration and CooldownPeriod are time.Duration.
// With plain encoding/json a Duration is an integer number of nanoseconds, so
// a string such as "5m" does not decode unless the config loader handles it —
// confirm how this struct is parsed.
type CoordinationConfig struct {
	Enabled                   bool          `json:"enabled" yaml:"enabled"`
	MaxConcurrentRemediations int           `json:"maxConcurrentRemediations" yaml:"maxConcurrentRemediations"`
	DefaultLeaseDuration      time.Duration `json:"defaultLeaseDuration" yaml:"defaultLeaseDuration"`
	CooldownPeriod            time.Duration `json:"cooldownPeriod" yaml:"cooldownPeriod"`
}

CoordinationConfig contains remediation coordination settings

type Correlation

// Correlation represents a detected pattern across multiple nodes.
type Correlation struct {
	ID            string                 `json:"id"`
	// Type is one of the CorrelationType* constants.
	Type          string                 `json:"type"` // infrastructure, common-cause, cascade
	Severity      string                 `json:"severity"`
	AffectedNodes []string               `json:"affectedNodes"`
	ProblemTypes  []string               `json:"problemTypes"`
	Message       string                 `json:"message"`
	DetectedAt    time.Time              `json:"detectedAt"`
	UpdatedAt     time.Time              `json:"updatedAt"`
	// Status is one of the CorrelationStatus* constants.
	Status        string                 `json:"status"` // active, resolved, investigating
	// NOTE(review): the value range of Confidence (0-1 vs. percent) is not
	// visible here.
	Confidence    float64                `json:"confidence"`
	// Metadata is left out of the JSON encoding when empty.
	Metadata      map[string]interface{} `json:"metadata,omitempty"`
}

Correlation represents a detected pattern across multiple nodes

type CorrelationConfig

// CorrelationConfig contains correlation engine settings. It is the
// configuration type accepted by NewCorrelator.
//
// NOTE(review): whether ClusterWideThreshold is a fraction (0-1) or a
// percentage is not visible here. EvaluationInterval is a time.Duration; with
// plain encoding/json that is an integer number of nanoseconds, not a string
// like "30s" — confirm how the config is parsed.
type CorrelationConfig struct {
	Enabled                bool          `json:"enabled" yaml:"enabled"`
	ClusterWideThreshold   float64       `json:"clusterWideThreshold" yaml:"clusterWideThreshold"`
	EvaluationInterval     time.Duration `json:"evaluationInterval" yaml:"evaluationInterval"`
	MinNodesForCorrelation int           `json:"minNodesForCorrelation" yaml:"minNodesForCorrelation"`
}

CorrelationConfig contains correlation engine settings

type Correlator

type Correlator struct {
	// contains filtered or unexported fields
}

Correlator detects patterns across node reports and identifies cluster-wide issues.

func NewCorrelator

func NewCorrelator(config *CorrelationConfig, storage Storage, metrics *ControllerMetrics, events *EventRecorder) *Correlator

NewCorrelator creates a new Correlator instance.

func (*Correlator) EvaluateNow

func (c *Correlator) EvaluateNow(ctx context.Context)

EvaluateNow triggers an immediate correlation evaluation.

func (*Correlator) ForceResolve

func (c *Correlator) ForceResolve(ctx context.Context, correlationID string) error

ForceResolve forces a correlation to be resolved (for manual intervention).

func (*Correlator) GetActiveCorrelations

func (c *Correlator) GetActiveCorrelations() []*Correlation

GetActiveCorrelations returns all currently active correlations.

func (*Correlator) GetCorrelation

func (c *Correlator) GetCorrelation(id string) (*Correlation, error)

GetCorrelation returns a specific correlation by ID.

func (*Correlator) GetStats

func (c *Correlator) GetStats() CorrelatorStats

GetStats returns correlator statistics.

func (*Correlator) InjectProblemPattern

func (c *Correlator) InjectProblemPattern(patternType string, problems []string, name, description string)

InjectProblemPattern allows adding custom problem patterns for detection. This is useful for extending the correlator with domain-specific patterns.

func (*Correlator) RemoveNode

func (c *Correlator) RemoveNode(nodeName string)

RemoveNode removes a node from tracking.

func (*Correlator) Start

func (c *Correlator) Start(ctx context.Context) error

Start begins the background correlation evaluation loop.

func (*Correlator) Stop

func (c *Correlator) Stop() error

Stop stops the correlator.

func (*Correlator) UpdateNodeReport

func (c *Correlator) UpdateNodeReport(report *NodeReport)

UpdateNodeReport updates the cached report for a node.

type CorrelatorStats

// CorrelatorStats contains correlator statistics. It is the value returned by
// Correlator.GetStats.
type CorrelatorStats struct {
	ActiveCorrelations int       `json:"activeCorrelations"`
	TrackedNodes       int       `json:"trackedNodes"`
	LastEvalTime       time.Time `json:"lastEvalTime"`
	Started            bool      `json:"started"`
}

CorrelatorStats contains correlator statistics.

type EventRecorder

type EventRecorder struct {
	// contains filtered or unexported fields
}

EventRecorder creates Kubernetes Events for cluster-level issues

func NewEventRecorder

func NewEventRecorder(config *EventRecorderConfig) (*EventRecorder, error)

NewEventRecorder creates a new Kubernetes event recorder

func (*EventRecorder) IsEnabled

func (r *EventRecorder) IsEnabled() bool

IsEnabled returns whether event recording is enabled

func (*EventRecorder) RecordClusterHealthChange

func (r *EventRecorder) RecordClusterHealthChange(ctx context.Context, status *ClusterStatus, previousHealth HealthStatus)

RecordClusterHealthChange records an event when cluster health status changes

func (*EventRecorder) RecordClusterWideProblem

func (r *EventRecorder) RecordClusterWideProblem(ctx context.Context, problem *ClusterProblem)

RecordClusterWideProblem records an event when a cluster-wide problem is detected

func (*EventRecorder) RecordCorrelation

func (r *EventRecorder) RecordCorrelation(ctx context.Context, correlation *Correlation)

RecordCorrelation records an event when a correlation is detected

func (*EventRecorder) RecordCorrelationResolved

func (r *EventRecorder) RecordCorrelationResolved(ctx context.Context, correlation *Correlation)

RecordCorrelationResolved records an event when a correlation is resolved

func (*EventRecorder) RecordLeaseDenied

func (r *EventRecorder) RecordLeaseDenied(ctx context.Context, nodeName, remediationType, reason string)

RecordLeaseDenied records an event when a remediation lease is denied

func (*EventRecorder) RecordLeaseExpired

func (r *EventRecorder) RecordLeaseExpired(ctx context.Context, lease *Lease)

RecordLeaseExpired records an event when a remediation lease expires

func (*EventRecorder) RecordLeaseGranted

func (r *EventRecorder) RecordLeaseGranted(ctx context.Context, lease *Lease)

RecordLeaseGranted records an event when a remediation lease is granted

func (*EventRecorder) RecordNodeHealthChange

func (r *EventRecorder) RecordNodeHealthChange(ctx context.Context, nodeName string, previousHealth, currentHealth HealthStatus)

RecordNodeHealthChange records an event when a node's health changes significantly

func (*EventRecorder) RecordProblemDetected

func (r *EventRecorder) RecordProblemDetected(ctx context.Context, nodeName string, problem *ProblemSummary)

RecordProblemDetected records an event when a significant problem is detected on a node

func (*EventRecorder) RecordProblemResolved

func (r *EventRecorder) RecordProblemResolved(ctx context.Context, nodeName string, problemType string)

RecordProblemResolved records an event when a problem is resolved

type EventRecorderConfig

// EventRecorderConfig holds configuration for the EventRecorder and is the
// argument to NewEventRecorder.
//
// The fields carry no json/yaml tags, unlike the *Config types nested in
// ControllerConfig.
// NOTE(review): Kubeconfig, InCluster and Namespace share their names with
// KubernetesConfig fields and are presumably copied from there — confirm at
// the call site.
type EventRecorderConfig struct {
	Kubeconfig      string
	InCluster       bool
	Namespace       string
	Enabled         bool
	RateLimitPeriod time.Duration
}

EventRecorderConfig holds configuration for the EventRecorder

type HealthStatus

// HealthStatus represents the overall health state. It is a string type; the
// defined values are the HealthStatus* constants.
type HealthStatus string

HealthStatus represents the overall health state

// Defined HealthStatus values.
const (
	HealthStatusHealthy  HealthStatus = "healthy"
	HealthStatusDegraded HealthStatus = "degraded"
	HealthStatusCritical HealthStatus = "critical"
	HealthStatusUnknown  HealthStatus = "unknown"
)

type KubernetesConfig

// KubernetesConfig contains Kubernetes integration settings.
//
// NOTE(review): how Kubeconfig and InCluster interact (which one wins when
// both are set) is not visible here.
type KubernetesConfig struct {
	Enabled      bool   `json:"enabled" yaml:"enabled"`
	Kubeconfig   string `json:"kubeconfig" yaml:"kubeconfig"`
	InCluster    bool   `json:"inCluster" yaml:"inCluster"`
	Namespace    string `json:"namespace" yaml:"namespace"`
	CreateEvents bool   `json:"createEvents" yaml:"createEvents"`
}

KubernetesConfig contains Kubernetes integration settings

type Lease

// Lease represents a remediation lease for the node named by NodeName.
//
// CompletedAt carries both omitempty and omitzero. encoding/json's omitempty
// never treats a struct value such as time.Time as empty, so on its own it
// still emits "0001-01-01T00:00:00Z" for a lease that has not completed.
// omitzero (honored by Go 1.24+, silently ignored by older toolchains) drops
// the field when CompletedAt.IsZero() reports true.
type Lease struct {
	ID              string    `json:"id"`
	NodeName        string    `json:"nodeName"`
	RemediationType string    `json:"remediationType"`
	GrantedAt       time.Time `json:"grantedAt"`
	ExpiresAt       time.Time `json:"expiresAt"`
	CompletedAt     time.Time `json:"completedAt,omitempty,omitzero"`
	Status          string    `json:"status"` // active, completed, expired, cancelled
	// Reason is left out of the JSON encoding when empty.
	Reason          string    `json:"reason,omitempty"`
}

Lease represents an active remediation lease

type LeaseRequest

// LeaseRequest represents a request for a remediation lease.
//
// The JSON keys for NodeName and RemediationType are "node" and
// "remediation", which differ from both the Go field names and the keys used
// by Lease ("nodeName", "remediationType").
type LeaseRequest struct {
	NodeName          string `json:"node"`
	RemediationType   string `json:"remediation"`
	// RequestedDuration is a string, not a time.Duration.
	// NOTE(review): presumably parsed with time.ParseDuration — confirm.
	RequestedDuration string `json:"requestedDuration,omitempty"` // e.g., "5m"
	Reason            string `json:"reason,omitempty"`
	Priority          int    `json:"priority,omitempty"` // higher = more urgent
}

LeaseRequest represents a request for a remediation lease

type LeaseResponse

// LeaseResponse represents the response to a lease request.
//
// ExpiresAt and RetryAt carry both omitempty and omitzero. encoding/json's
// omitempty never treats a time.Time as empty, so by itself it would emit
// "0001-01-01T00:00:00Z" for whichever of the two does not apply to this
// response. omitzero (honored by Go 1.24+, silently ignored by older
// toolchains) drops a field whose IsZero method reports true.
type LeaseResponse struct {
	LeaseID   string    `json:"leaseId,omitempty"`
	Approved  bool      `json:"approved"`
	ExpiresAt time.Time `json:"expiresAt,omitempty,omitzero"`
	Message   string    `json:"message,omitempty"`
	RetryAt   time.Time `json:"retryAt,omitempty,omitzero"` // when to retry if denied
	Position  int       `json:"position,omitempty"`         // queue position if waiting
}

LeaseResponse represents the response to a lease request

type MonitorStatus

// MonitorStatus represents the status of a single monitor. It is the element
// type of NodeReport.MonitorStatuses.
type MonitorStatus struct {
	Name       string       `json:"name"`
	Type       string       `json:"type"`
	Status     HealthStatus `json:"status"`
	LastRun    time.Time    `json:"lastRun"`
	// Message and ErrorCount are left out of the JSON encoding when they are
	// "" and 0 respectively.
	Message    string       `json:"message,omitempty"`
	ErrorCount int          `json:"errorCount,omitempty"`
}

MonitorStatus represents the status of a single monitor

type NodeCondition

// NodeCondition represents a Kubernetes-style node condition.
//
// LastTransitionTime and LastHeartbeatTime carry both omitempty and omitzero.
// encoding/json's omitempty never treats a time.Time as empty, so by itself
// it would emit "0001-01-01T00:00:00Z" for an unset timestamp. omitzero
// (honored by Go 1.24+, silently ignored by older toolchains) drops a field
// whose IsZero method reports true.
type NodeCondition struct {
	Type               string    `json:"type"`
	Status             string    `json:"status"` // True, False, Unknown
	Reason             string    `json:"reason,omitempty"`
	Message            string    `json:"message,omitempty"`
	LastTransitionTime time.Time `json:"lastTransitionTime,omitempty,omitzero"`
	LastHeartbeatTime  time.Time `json:"lastHeartbeatTime,omitempty,omitzero"`
}

NodeCondition represents a Kubernetes-style node condition

type NodeDetail

// NodeDetail provides detailed information about a specific node.
type NodeDetail struct {
	NodeName       string           `json:"nodeName"`
	NodeUID        string           `json:"nodeUID,omitempty"`
	Health         HealthStatus     `json:"health"`
	LastReportAt   time.Time        `json:"lastReportAt"`
	FirstSeenAt    time.Time        `json:"firstSeenAt"`
	ReportCount    int64            `json:"reportCount"`
	// LatestReport is left out of the JSON encoding when nil.
	LatestReport   *NodeReport      `json:"latestReport,omitempty"`
	// ActiveProblems and Conditions have no omitempty, so a nil slice is
	// encoded as JSON null rather than []; populate them with empty slices
	// if clients expect an array.
	ActiveProblems []ProblemSummary `json:"activeProblems"`
	Conditions     []NodeCondition  `json:"conditions"`
	RecentHistory  []ReportSummary  `json:"recentHistory,omitempty"`
}

NodeDetail provides detailed information about a specific node

type NodeReport

// NodeReport represents a health report from a node-doctor DaemonSet pod.
// This is the primary data structure sent from nodes to the controller.
//
// Only NodeName, Timestamp and OverallHealth are always present in the JSON
// encoding; every other field is tagged omitempty.
type NodeReport struct {
	// Node identification
	NodeName string `json:"nodeName"`
	NodeUID  string `json:"nodeUID,omitempty"`

	// Report metadata
	Timestamp  time.Time `json:"timestamp"`
	ReportID   string    `json:"reportId,omitempty"`
	Version    string    `json:"version,omitempty"`    // node-doctor version
	Uptime     string    `json:"uptime,omitempty"`     // node-doctor uptime
	ReportType string    `json:"reportType,omitempty"` // "periodic", "on-demand", "startup"

	// Health summary
	OverallHealth   HealthStatus     `json:"overallHealth"`
	MonitorStatuses []MonitorStatus  `json:"monitorStatuses,omitempty"`
	ActiveProblems  []ProblemSummary `json:"activeProblems,omitempty"`
	Conditions      []NodeCondition  `json:"conditions,omitempty"`

	// Statistics
	Stats *NodeStats `json:"stats,omitempty"`
}

NodeReport represents a health report from a node-doctor DaemonSet pod. This is the primary data structure sent from nodes to the controller.

type NodeStats

// NodeStats contains statistics about the node-doctor instance. It is the
// element type of NodeReport.Stats.
type NodeStats struct {
	StatusesProcessed int64  `json:"statusesProcessed"`
	ProblemsDetected  int64  `json:"problemsDetected"`
	RemediationsRun   int64  `json:"remediationsRun"`
	MemoryUsageBytes  int64  `json:"memoryUsageBytes,omitempty"`
	GoroutineCount    int    `json:"goroutineCount,omitempty"`
	// CPUUsagePercent is a string, unlike the numeric fields above.
	// NOTE(review): its exact format is not visible here.
	CPUUsagePercent   string `json:"cpuUsagePercent,omitempty"`
}

NodeStats contains statistics about the node-doctor instance

type NodeSummary

// NodeSummary provides a brief overview of a node's status. It is the element
// type of ClusterStatus.NodeSummaries.
type NodeSummary struct {
	NodeName       string       `json:"nodeName"`
	Health         HealthStatus `json:"health"`
	LastReportAt   time.Time    `json:"lastReportAt"`
	ProblemCount   int          `json:"problemCount"`
	ConditionCount int          `json:"conditionCount"`
}

NodeSummary provides a brief overview of a node's status

type PaginatedResponse

// PaginatedResponse wraps paginated list responses.
//
// Items is untyped (interface{}), so the element type is decided by whoever
// builds the response.
// NOTE(review): whether Page is zero- or one-based is not visible here.
type PaginatedResponse struct {
	Items      interface{} `json:"items"`
	TotalCount int         `json:"totalCount"`
	Page       int         `json:"page"`
	PageSize   int         `json:"pageSize"`
	HasMore    bool        `json:"hasMore"`
}

PaginatedResponse wraps paginated list responses

type ProblemSummary

// ProblemSummary represents an active problem on the node. It is the element
// type of NodeReport.ActiveProblems and NodeDetail.ActiveProblems.
type ProblemSummary struct {
	Type        string    `json:"type"`
	Severity    string    `json:"severity"`
	Message     string    `json:"message"`
	Source      string    `json:"source"`
	DetectedAt  time.Time `json:"detectedAt"`
	LastSeenAt  time.Time `json:"lastSeenAt"`
	// Occurrences is left out of the JSON encoding when zero.
	Occurrences int       `json:"occurrences,omitempty"`
}

ProblemSummary represents an active problem on the node

type PrometheusConfig

// PrometheusConfig contains Prometheus metrics configuration.
//
// NOTE(review): Port is separate from ServerConfig.Port; whether metrics are
// served on their own listener is not visible here.
type PrometheusConfig struct {
	Enabled bool   `json:"enabled" yaml:"enabled"`
	Port    int    `json:"port" yaml:"port"`
	Path    string `json:"path" yaml:"path"`
}

PrometheusConfig contains Prometheus metrics configuration

type ReportSummary

// ReportSummary is a condensed view of a historical report. It is the element
// type of NodeDetail.RecentHistory.
type ReportSummary struct {
	ReportID      string       `json:"reportId"`
	Timestamp     time.Time    `json:"timestamp"`
	OverallHealth HealthStatus `json:"overallHealth"`
	ProblemCount  int          `json:"problemCount"`
}

ReportSummary is a condensed view of a historical report

type SQLiteStorage

type SQLiteStorage struct {
	// contains filtered or unexported fields
}

SQLiteStorage implements Storage using SQLite

func NewSQLiteStorage

func NewSQLiteStorage(config *StorageConfig) (*SQLiteStorage, error)

NewSQLiteStorage creates a new SQLite storage instance

func (*SQLiteStorage) Close

func (s *SQLiteStorage) Close() error

Close closes the database connection

func (*SQLiteStorage) DeleteOldReports

func (s *SQLiteStorage) DeleteOldReports(ctx context.Context, before time.Time) (int64, error)

DeleteOldReports removes reports older than the given time

func (*SQLiteStorage) ExpireLeases

func (s *SQLiteStorage) ExpireLeases(ctx context.Context) (int64, error)

ExpireLeases marks expired leases as expired

func (*SQLiteStorage) GetActiveCorrelations

func (s *SQLiteStorage) GetActiveCorrelations(ctx context.Context) ([]*Correlation, error)

GetActiveCorrelations returns all active correlations

func (*SQLiteStorage) GetActiveLeases

func (s *SQLiteStorage) GetActiveLeases(ctx context.Context) ([]*Lease, error)

GetActiveLeases returns all active leases

func (*SQLiteStorage) GetAllLatestReports

func (s *SQLiteStorage) GetAllLatestReports(ctx context.Context) (map[string]*NodeReport, error)

GetAllLatestReports returns the latest report for all nodes

func (*SQLiteStorage) GetCorrelation

func (s *SQLiteStorage) GetCorrelation(ctx context.Context, id string) (*Correlation, error)

GetCorrelation returns a correlation by ID

func (*SQLiteStorage) GetLastCompletedLease

func (s *SQLiteStorage) GetLastCompletedLease(ctx context.Context, nodeName string) (*Lease, error)

GetLastCompletedLease returns the most recent completed/expired lease for a node. This is used for cooldown period enforcement to prevent rapid repeated remediations.

func (*SQLiteStorage) GetLatestNodeReport

func (s *SQLiteStorage) GetLatestNodeReport(ctx context.Context, nodeName string) (*NodeReport, error)

GetLatestNodeReport returns the most recent report for a node

func (*SQLiteStorage) GetLease

func (s *SQLiteStorage) GetLease(ctx context.Context, leaseID string) (*Lease, error)

GetLease returns a lease by ID

func (*SQLiteStorage) GetLeaseStats

func (s *SQLiteStorage) GetLeaseStats(ctx context.Context) (active, total int64, err error)

GetLeaseStats returns lease statistics

func (*SQLiteStorage) GetNodeCount

func (s *SQLiteStorage) GetNodeCount(ctx context.Context) (int, error)

GetNodeCount returns the number of unique nodes

func (*SQLiteStorage) GetNodeLease

func (s *SQLiteStorage) GetNodeLease(ctx context.Context, nodeName string) (*Lease, error)

GetNodeLease returns the active lease for a node, if any

func (*SQLiteStorage) GetNodeReports

func (s *SQLiteStorage) GetNodeReports(ctx context.Context, nodeName string, limit int) ([]*NodeReport, error)

GetNodeReports returns recent reports for a node

func (*SQLiteStorage) GetReportCount

func (s *SQLiteStorage) GetReportCount(ctx context.Context) (int64, error)

GetReportCount returns the total number of reports

func (*SQLiteStorage) Initialize

func (s *SQLiteStorage) Initialize(ctx context.Context) error

Initialize opens the database and runs migrations

func (*SQLiteStorage) RunCleanup

func (s *SQLiteStorage) RunCleanup(ctx context.Context) error

RunCleanup performs periodic cleanup of old data

func (*SQLiteStorage) SaveCorrelation

func (s *SQLiteStorage) SaveCorrelation(ctx context.Context, correlation *Correlation) error

SaveCorrelation saves a correlation to the database

func (*SQLiteStorage) SaveLease

func (s *SQLiteStorage) SaveLease(ctx context.Context, lease *Lease) error

SaveLease saves a lease to the database

func (*SQLiteStorage) SaveNodeReport

func (s *SQLiteStorage) SaveNodeReport(ctx context.Context, report *NodeReport) error

SaveNodeReport saves a node report to the database

func (*SQLiteStorage) UpdateCorrelation

func (s *SQLiteStorage) UpdateCorrelation(ctx context.Context, correlation *Correlation) error

UpdateCorrelation updates an existing correlation

func (*SQLiteStorage) UpdateLeaseStatus

func (s *SQLiteStorage) UpdateLeaseStatus(ctx context.Context, leaseID, status string) error

UpdateLeaseStatus updates the status of a lease

type Server

type Server struct {
	// contains filtered or unexported fields
}

Server is the main HTTP server for the Node Doctor Controller

func NewServer

func NewServer(config *ControllerConfig) (*Server, error)

NewServer creates a new controller server

func (*Server) GetStorage

func (s *Server) GetStorage() Storage

GetStorage returns the storage backend

func (*Server) Handler

func (s *Server) Handler() http.Handler

Handler returns the HTTP handler for testing purposes. This allows tests to use httptest.NewServer(server.Handler()).

func (*Server) IsReady

func (s *Server) IsReady() bool

IsReady returns whether the server is ready to accept requests

func (*Server) SetReady

func (s *Server) SetReady(ready bool)

SetReady sets the readiness state

func (*Server) SetStorage

func (s *Server) SetStorage(storage Storage)

SetStorage sets the storage backend for the server

func (*Server) Start

func (s *Server) Start(ctx context.Context) error

Start starts the HTTP server

func (*Server) Stop

func (s *Server) Stop(ctx context.Context) error

Stop gracefully stops the HTTP server

type ServerConfig

// ServerConfig contains HTTP server configuration.
//
// NOTE(review): ReadTimeout and WriteTimeout are time.Duration. With plain
// encoding/json a Duration is an integer number of nanoseconds, so a string
// such as "30s" does not decode unless the config loader handles it —
// confirm how this struct is parsed.
type ServerConfig struct {
	BindAddress  string        `json:"bindAddress" yaml:"bindAddress"`
	Port         int           `json:"port" yaml:"port"`
	ReadTimeout  time.Duration `json:"readTimeout" yaml:"readTimeout"`
	WriteTimeout time.Duration `json:"writeTimeout" yaml:"writeTimeout"`
	EnableCORS   bool          `json:"enableCORS" yaml:"enableCORS"`
}

ServerConfig contains HTTP server configuration

type Storage

// Storage defines the interface for persistent storage. SQLiteStorage is the
// implementation provided by this package.
type Storage interface {
	// Initialize sets up the database and runs migrations
	Initialize(ctx context.Context) error

	// Close closes the database connection
	Close() error

	// Node Reports

	// SaveNodeReport saves a node report.
	SaveNodeReport(ctx context.Context, report *NodeReport) error
	// GetLatestNodeReport returns the most recent report for a node.
	GetLatestNodeReport(ctx context.Context, nodeName string) (*NodeReport, error)
	// GetNodeReports returns recent reports for a node.
	// NOTE(review): presumably at most limit reports — confirm.
	GetNodeReports(ctx context.Context, nodeName string, limit int) ([]*NodeReport, error)
	// GetAllLatestReports returns the latest report for all nodes.
	// NOTE(review): the map is presumably keyed by node name — confirm.
	GetAllLatestReports(ctx context.Context) (map[string]*NodeReport, error)
	// DeleteOldReports removes reports older than the given time.
	// NOTE(review): the int64 is presumably the number of rows removed.
	DeleteOldReports(ctx context.Context, before time.Time) (int64, error)

	// Leases

	// SaveLease saves a lease.
	SaveLease(ctx context.Context, lease *Lease) error
	// GetLease returns a lease by ID.
	GetLease(ctx context.Context, leaseID string) (*Lease, error)
	// GetActiveLeases returns all active leases.
	GetActiveLeases(ctx context.Context) ([]*Lease, error)
	// GetNodeLease returns the active lease for a node, if any.
	GetNodeLease(ctx context.Context, nodeName string) (*Lease, error)
	// GetLastCompletedLease returns the most recent completed/expired lease
	// for a node. It is used for cooldown period enforcement to prevent rapid
	// repeated remediations.
	GetLastCompletedLease(ctx context.Context, nodeName string) (*Lease, error)
	// UpdateLeaseStatus updates the status of a lease.
	UpdateLeaseStatus(ctx context.Context, leaseID, status string) error
	// ExpireLeases marks expired leases as expired.
	// NOTE(review): the int64 is presumably the number of leases updated.
	ExpireLeases(ctx context.Context) (int64, error)

	// Correlations

	// SaveCorrelation saves a correlation.
	SaveCorrelation(ctx context.Context, correlation *Correlation) error
	// GetCorrelation returns a correlation by ID.
	GetCorrelation(ctx context.Context, id string) (*Correlation, error)
	// GetActiveCorrelations returns all active correlations.
	GetActiveCorrelations(ctx context.Context) ([]*Correlation, error)
	// UpdateCorrelation updates an existing correlation.
	UpdateCorrelation(ctx context.Context, correlation *Correlation) error

	// Statistics

	// GetNodeCount returns the number of unique nodes.
	GetNodeCount(ctx context.Context) (int, error)
	// GetReportCount returns the total number of reports.
	GetReportCount(ctx context.Context) (int64, error)
	// GetLeaseStats returns lease statistics: the active and total lease
	// counts.
	GetLeaseStats(ctx context.Context) (active, total int64, err error)

	// Maintenance

	// RunCleanup performs periodic cleanup of old data.
	RunCleanup(ctx context.Context) error
}

Storage defines the interface for persistent storage

type StorageConfig

// StorageConfig contains SQLite storage configuration. It is the argument to
// NewSQLiteStorage.
//
// NOTE(review): Retention is a time.Duration; with plain encoding/json that is
// an integer number of nanoseconds, not a string like "24h" — confirm how the
// config is parsed.
type StorageConfig struct {
	Path      string        `json:"path" yaml:"path"`
	Retention time.Duration `json:"retention" yaml:"retention"` // How long to keep reports
}

StorageConfig contains SQLite storage configuration

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL