Documentation
¶
Overview ¶
Package controller provides the Node Doctor Controller for multi-node aggregation, correlation, and coordinated remediation.
Index ¶
- Constants
- type APIError
- type APIResponse
- type ClusterProblem
- type ClusterStatus
- type ControllerConfig
- func DefaultControllerConfig() *ControllerConfig
- type ControllerMetrics
- func NewControllerMetrics() *ControllerMetrics
- func (m *ControllerMetrics) Handler() http.Handler
- func (m *ControllerMetrics) RecordCorrelationDetected(correlationType string)
- func (m *ControllerMetrics) RecordLeaseDenied(reason string)
- func (m *ControllerMetrics) RecordLeaseGranted(remediationType string)
- func (m *ControllerMetrics) RecordReportError(errorType string)
- func (m *ControllerMetrics) RecordReportReceived()
- func (m *ControllerMetrics) RecordRequest(method, path, status string, duration float64)
- func (m *ControllerMetrics) RecordStorageError(operation string)
- func (m *ControllerMetrics) RecordStorageOperation(operation string)
- func (m *ControllerMetrics) UpdateClusterMetrics(status *ClusterStatus)
- func (m *ControllerMetrics) UpdateCorrelationMetrics(activeCount int)
- func (m *ControllerMetrics) UpdateLeaseMetrics(activeCount int)
- func (m *ControllerMetrics) UpdateProblemMetrics(problemCounts map[string]map[string]int)
- type CoordinationConfig
- type Correlation
- type CorrelationConfig
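- type Correlator
- func NewCorrelator(config *CorrelationConfig, storage Storage, metrics *ControllerMetrics, events *EventRecorder) *Correlator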
- func (c *Correlator) EvaluateNow(ctx context.Context)
- func (c *Correlator) ForceResolve(ctx context.Context, correlationID string) error
- func (c *Correlator) GetActiveCorrelations() []*Correlation
- func (c *Correlator) GetCorrelation(id string) (*Correlation, error)
- func (c *Correlator) GetStats() CorrelatorStats
- func (c *Correlator) InjectProblemPattern(patternType string, problems []string, name, description string)
- func (c *Correlator) RemoveNode(nodeName string)
- func (c *Correlator) Start(ctx context.Context) error
- func (c *Correlator) Stop() error
- func (c *Correlator) UpdateNodeReport(report *NodeReport)
- type CorrelatorStats
- type EventRecorder
- func NewEventRecorder(config *EventRecorderConfig) (*EventRecorder, error)
- func (r *EventRecorder) IsEnabled() bool
- func (r *EventRecorder) RecordClusterHealthChange(ctx context.Context, status *ClusterStatus, previousHealth HealthStatus)
- func (r *EventRecorder) RecordClusterWideProblem(ctx context.Context, problem *ClusterProblem)
- func (r *EventRecorder) RecordCorrelation(ctx context.Context, correlation *Correlation)
- func (r *EventRecorder) RecordCorrelationResolved(ctx context.Context, correlation *Correlation)
- func (r *EventRecorder) RecordLeaseDenied(ctx context.Context, nodeName, remediationType, reason string)
- func (r *EventRecorder) RecordLeaseExpired(ctx context.Context, lease *Lease)
- func (r *EventRecorder) RecordLeaseGranted(ctx context.Context, lease *Lease)
- func (r *EventRecorder) RecordNodeHealthChange(ctx context.Context, nodeName string, previousHealth, currentHealth HealthStatus)
- func (r *EventRecorder) RecordProblemDetected(ctx context.Context, nodeName string, problem *ProblemSummary)
- func (r *EventRecorder) RecordProblemResolved(ctx context.Context, nodeName string, problemType string)
- type EventRecorderConfig
- type HealthStatus
- type KubernetesConfig
- type Lease
- type LeaseRequest
- type LeaseResponse
- type MonitorStatus
- type NodeCondition
- type NodeDetail
- type NodeReport
- type NodeStats
- type NodeSummary
- type PaginatedResponse
- type ProblemSummary
- type PrometheusConfig
- type ReportSummary
- type SQLiteStorage
- func NewSQLiteStorage(config *StorageConfig) (*SQLiteStorage, error)
- func (s *SQLiteStorage) Close() error
- func (s *SQLiteStorage) DeleteOldReports(ctx context.Context, before time.Time) (int64, error)
- func (s *SQLiteStorage) ExpireLeases(ctx context.Context) (int64, error)
- func (s *SQLiteStorage) GetActiveCorrelations(ctx context.Context) ([]*Correlation, error)
- func (s *SQLiteStorage) GetActiveLeases(ctx context.Context) ([]*Lease, error)
- func (s *SQLiteStorage) GetAllLatestReports(ctx context.Context) (map[string]*NodeReport, error)
- func (s *SQLiteStorage) GetCorrelation(ctx context.Context, id string) (*Correlation, error)
- func (s *SQLiteStorage) GetLastCompletedLease(ctx context.Context, nodeName string) (*Lease, error)
- func (s *SQLiteStorage) GetLatestNodeReport(ctx context.Context, nodeName string) (*NodeReport, error)
- func (s *SQLiteStorage) GetLease(ctx context.Context, leaseID string) (*Lease, error)
- func (s *SQLiteStorage) GetLeaseStats(ctx context.Context) (active, total int64, err error)
- func (s *SQLiteStorage) GetNodeCount(ctx context.Context) (int, error)
- func (s *SQLiteStorage) GetNodeLease(ctx context.Context, nodeName string) (*Lease, error)
- func (s *SQLiteStorage) GetNodeReports(ctx context.Context, nodeName string, limit int) ([]*NodeReport, error)
- func (s *SQLiteStorage) GetReportCount(ctx context.Context) (int64, error)
- func (s *SQLiteStorage) Initialize(ctx context.Context) error
- func (s *SQLiteStorage) RunCleanup(ctx context.Context) error
- func (s *SQLiteStorage) SaveCorrelation(ctx context.Context, correlation *Correlation) error
- func (s *SQLiteStorage) SaveLease(ctx context.Context, lease *Lease) error
- func (s *SQLiteStorage) SaveNodeReport(ctx context.Context, report *NodeReport) error
- func (s *SQLiteStorage) UpdateCorrelation(ctx context.Context, correlation *Correlation) error
- func (s *SQLiteStorage) UpdateLeaseStatus(ctx context.Context, leaseID, status string) error
- type Server
- func NewServer(config *ControllerConfig) (*Server, error)
- type ServerConfig
- type Storage
- type StorageConfig
Constants ¶
const (
CorrelationTypeInfrastructure = "infrastructure" // Same problem across many nodes
CorrelationTypeCommonCause = "common-cause" // Different problems with shared root cause
CorrelationTypeCascade = "cascade" // Sequential problems triggering each other
)
Correlation types
const (
CorrelationStatusActive = "active"
CorrelationStatusInvestigating = "investigating"
CorrelationStatusResolved = "resolved"
)
Correlation statuses
const (
// Cluster health events
EventReasonClusterDegraded = "ClusterDegraded"
EventReasonClusterCritical = "ClusterCritical"
EventReasonClusterRecovered = "ClusterRecovered"
EventReasonNodeHealthChanged = "NodeHealthChanged"
// Problem events
EventReasonProblemDetected = "ProblemDetected"
EventReasonProblemResolved = "ProblemResolved"
EventReasonClusterWideProblem = "ClusterWideProblem"
// Correlation events
EventReasonCorrelationDetected = "CorrelationDetected"
EventReasonCorrelationResolved = "CorrelationResolved"
// Remediation events
EventReasonLeaseGranted = "RemediationLeaseGranted"
EventReasonLeaseDenied = "RemediationLeaseDenied"
EventReasonLeaseExpired = "RemediationLeaseExpired"
EventReasonRemediationStarted = "RemediationStarted"
EventReasonRemediationCompleted = "RemediationCompleted"
EventReasonRemediationFailed = "RemediationFailed"
)
Event reasons for different scenarios
const (
EventTypeNormal = corev1.EventTypeNormal
EventTypeWarning = corev1.EventTypeWarning
)
Event types
const (
APIVersion = "v1"
)
API Version
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type APIError ¶
type APIError struct {
Code string `json:"code"`
Message string `json:"message"`
Details string `json:"details,omitempty"`
}
APIError represents an API error
type APIResponse ¶
type APIResponse struct {
Success bool `json:"success"`
Data interface{} `json:"data,omitempty"`
Error *APIError `json:"error,omitempty"`
Timestamp time.Time `json:"timestamp"`
}
APIResponse is a generic wrapper for API responses
type ClusterProblem ¶
type ClusterProblem struct {
ID string `json:"id"`
Type string `json:"type"`
Severity string `json:"severity"`
AffectedNodes []string `json:"affectedNodes"`
Message string `json:"message"`
DetectedAt time.Time `json:"detectedAt"`
CorrelationID string `json:"correlationId,omitempty"`
}
ClusterProblem represents a problem affecting the cluster
type ClusterStatus ¶
type ClusterStatus struct {
Timestamp time.Time `json:"timestamp"`
OverallHealth HealthStatus `json:"overallHealth"`
TotalNodes int `json:"totalNodes"`
HealthyNodes int `json:"healthyNodes"`
DegradedNodes int `json:"degradedNodes"`
CriticalNodes int `json:"criticalNodes"`
UnknownNodes int `json:"unknownNodes"`
ActiveProblems int `json:"activeProblems"`
Correlations int `json:"activeCorrelations"`
NodeSummaries []NodeSummary `json:"nodeSummaries,omitempty"`
RecentProblems []ClusterProblem `json:"recentProblems,omitempty"`
}
ClusterStatus represents the overall cluster health status
type ControllerConfig ¶
type ControllerConfig struct {
// Server settings
Server ServerConfig `json:"server" yaml:"server"`
// Storage settings
Storage StorageConfig `json:"storage" yaml:"storage"`
// Correlation settings
Correlation CorrelationConfig `json:"correlation" yaml:"correlation"`
// Coordination settings
Coordination CoordinationConfig `json:"coordination" yaml:"coordination"`
// Prometheus settings
Prometheus PrometheusConfig `json:"prometheus" yaml:"prometheus"`
// Kubernetes settings
Kubernetes KubernetesConfig `json:"kubernetes" yaml:"kubernetes"`
}
ControllerConfig holds the controller configuration
func DefaultControllerConfig ¶
func DefaultControllerConfig() *ControllerConfig
DefaultControllerConfig returns a configuration with sensible defaults
type ControllerMetrics ¶
type ControllerMetrics struct {
// Cluster-level metrics
NodesTotal prometheus.Gauge
NodesHealthy prometheus.Gauge
NodesDegraded prometheus.Gauge
NodesCritical prometheus.Gauge
NodesUnknown prometheus.Gauge
ActiveProblems prometheus.Gauge
// Problem aggregation
ProblemNodes *prometheus.GaugeVec
ProblemActive *prometheus.GaugeVec
// Correlation metrics
CorrelationActive prometheus.Gauge
CorrelationDetected *prometheus.CounterVec
// Remediation metrics
LeasesActive prometheus.Gauge
LeasesGranted *prometheus.CounterVec
LeasesDenied *prometheus.CounterVec
// Report ingestion metrics
ReportsReceived prometheus.Counter
ReportErrors *prometheus.CounterVec
// Storage metrics
StorageOperations *prometheus.CounterVec
StorageErrors *prometheus.CounterVec
// Server metrics
RequestDuration *prometheus.HistogramVec
RequestsTotal *prometheus.CounterVec
// contains filtered or unexported fields
}
ControllerMetrics contains all Prometheus metrics for the controller
func NewControllerMetrics ¶
func NewControllerMetrics() *ControllerMetrics
NewControllerMetrics creates and registers all controller metrics
func (*ControllerMetrics) Handler ¶
func (m *ControllerMetrics) Handler() http.Handler
Handler returns an http.Handler for the /metrics endpoint
func (*ControllerMetrics) RecordCorrelationDetected ¶
func (m *ControllerMetrics) RecordCorrelationDetected(correlationType string)
RecordCorrelationDetected increments the correlation detected counter
func (*ControllerMetrics) RecordLeaseDenied ¶
func (m *ControllerMetrics) RecordLeaseDenied(reason string)
RecordLeaseDenied increments the lease denied counter
func (*ControllerMetrics) RecordLeaseGranted ¶
func (m *ControllerMetrics) RecordLeaseGranted(remediationType string)
RecordLeaseGranted increments the lease granted counter
func (*ControllerMetrics) RecordReportError ¶
func (m *ControllerMetrics) RecordReportError(errorType string)
RecordReportError increments the report error counter
func (*ControllerMetrics) RecordReportReceived ¶
func (m *ControllerMetrics) RecordReportReceived()
RecordReportReceived increments the report received counter
func (*ControllerMetrics) RecordRequest ¶
func (m *ControllerMetrics) RecordRequest(method, path, status string, duration float64)
RecordRequest records an HTTP request
func (*ControllerMetrics) RecordStorageError ¶
func (m *ControllerMetrics) RecordStorageError(operation string)
RecordStorageError increments the storage error counter
func (*ControllerMetrics) RecordStorageOperation ¶
func (m *ControllerMetrics) RecordStorageOperation(operation string)
RecordStorageOperation increments the storage operation counter
func (*ControllerMetrics) UpdateClusterMetrics ¶
func (m *ControllerMetrics) UpdateClusterMetrics(status *ClusterStatus)
UpdateClusterMetrics updates all cluster-level metrics from current state
func (*ControllerMetrics) UpdateCorrelationMetrics ¶
func (m *ControllerMetrics) UpdateCorrelationMetrics(activeCount int)
UpdateCorrelationMetrics updates correlation-related metrics
func (*ControllerMetrics) UpdateLeaseMetrics ¶
func (m *ControllerMetrics) UpdateLeaseMetrics(activeCount int)
UpdateLeaseMetrics updates lease-related metrics
func (*ControllerMetrics) UpdateProblemMetrics ¶
func (m *ControllerMetrics) UpdateProblemMetrics(problemCounts map[string]map[string]int)
UpdateProblemMetrics updates problem-related metrics
type CoordinationConfig ¶
type CoordinationConfig struct {
Enabled bool `json:"enabled" yaml:"enabled"`
MaxConcurrentRemediations int `json:"maxConcurrentRemediations" yaml:"maxConcurrentRemediations"`
DefaultLeaseDuration time.Duration `json:"defaultLeaseDuration" yaml:"defaultLeaseDuration"`
CooldownPeriod time.Duration `json:"cooldownPeriod" yaml:"cooldownPeriod"`
}
CoordinationConfig contains remediation coordination settings
type Correlation ¶
type Correlation struct {
ID string `json:"id"`
Type string `json:"type"` // infrastructure, common-cause, cascade
Severity string `json:"severity"`
AffectedNodes []string `json:"affectedNodes"`
ProblemTypes []string `json:"problemTypes"`
Message string `json:"message"`
DetectedAt time.Time `json:"detectedAt"`
UpdatedAt time.Time `json:"updatedAt"`
Status string `json:"status"` // active, resolved, investigating
Confidence float64 `json:"confidence"`
Metadata map[string]interface{} `json:"metadata,omitempty"`
}
Correlation represents a detected pattern across multiple nodes
type CorrelationConfig ¶
type CorrelationConfig struct {
Enabled bool `json:"enabled" yaml:"enabled"`
ClusterWideThreshold float64 `json:"clusterWideThreshold" yaml:"clusterWideThreshold"`
EvaluationInterval time.Duration `json:"evaluationInterval" yaml:"evaluationInterval"`
MinNodesForCorrelation int `json:"minNodesForCorrelation" yaml:"minNodesForCorrelation"`
}
CorrelationConfig contains correlation engine settings
type Correlator ¶
type Correlator struct {
// contains filtered or unexported fields
}
Correlator detects patterns across node reports and identifies cluster-wide issues.
func NewCorrelator ¶
func NewCorrelator(config *CorrelationConfig, storage Storage, metrics *ControllerMetrics, events *EventRecorder) *Correlator
NewCorrelator creates a new Correlator instance.
func (*Correlator) EvaluateNow ¶
func (c *Correlator) EvaluateNow(ctx context.Context)
EvaluateNow triggers an immediate correlation evaluation.
func (*Correlator) ForceResolve ¶
func (c *Correlator) ForceResolve(ctx context.Context, correlationID string) error
ForceResolve forces a correlation to be resolved (for manual intervention).
func (*Correlator) GetActiveCorrelations ¶
func (c *Correlator) GetActiveCorrelations() []*Correlation
GetActiveCorrelations returns all currently active correlations.
func (*Correlator) GetCorrelation ¶
func (c *Correlator) GetCorrelation(id string) (*Correlation, error)
GetCorrelation returns a specific correlation by ID.
func (*Correlator) GetStats ¶
func (c *Correlator) GetStats() CorrelatorStats
GetStats returns correlator statistics.
func (*Correlator) InjectProblemPattern ¶
func (c *Correlator) InjectProblemPattern(patternType string, problems []string, name, description string)
InjectProblemPattern allows adding custom problem patterns for detection. This is useful for extending the correlator with domain-specific patterns.
func (*Correlator) RemoveNode ¶
func (c *Correlator) RemoveNode(nodeName string)
RemoveNode removes a node from tracking.
func (*Correlator) Start ¶
func (c *Correlator) Start(ctx context.Context) error
Start begins the background correlation evaluation loop.
func (*Correlator) Stop ¶
func (c *Correlator) Stop() error
func (*Correlator) UpdateNodeReport ¶
func (c *Correlator) UpdateNodeReport(report *NodeReport)
UpdateNodeReport updates the cached report for a node.
type CorrelatorStats ¶
type CorrelatorStats struct {
ActiveCorrelations int `json:"activeCorrelations"`
TrackedNodes int `json:"trackedNodes"`
LastEvalTime time.Time `json:"lastEvalTime"`
Started bool `json:"started"`
}
CorrelatorStats contains correlator statistics.
type EventRecorder ¶
type EventRecorder struct {
// contains filtered or unexported fields
}
EventRecorder creates Kubernetes Events for cluster-level issues
func NewEventRecorder ¶
func NewEventRecorder(config *EventRecorderConfig) (*EventRecorder, error)
NewEventRecorder creates a new Kubernetes event recorder
func (*EventRecorder) IsEnabled ¶
func (r *EventRecorder) IsEnabled() bool
IsEnabled returns whether event recording is enabled
func (*EventRecorder) RecordClusterHealthChange ¶
func (r *EventRecorder) RecordClusterHealthChange(ctx context.Context, status *ClusterStatus, previousHealth HealthStatus)
RecordClusterHealthChange records an event when cluster health status changes
func (*EventRecorder) RecordClusterWideProblem ¶
func (r *EventRecorder) RecordClusterWideProblem(ctx context.Context, problem *ClusterProblem)
RecordClusterWideProblem records an event when a cluster-wide problem is detected
func (*EventRecorder) RecordCorrelation ¶
func (r *EventRecorder) RecordCorrelation(ctx context.Context, correlation *Correlation)
RecordCorrelation records an event when a correlation is detected
func (*EventRecorder) RecordCorrelationResolved ¶
func (r *EventRecorder) RecordCorrelationResolved(ctx context.Context, correlation *Correlation)
RecordCorrelationResolved records an event when a correlation is resolved
func (*EventRecorder) RecordLeaseDenied ¶
func (r *EventRecorder) RecordLeaseDenied(ctx context.Context, nodeName, remediationType, reason string)
RecordLeaseDenied records an event when a remediation lease is denied
func (*EventRecorder) RecordLeaseExpired ¶
func (r *EventRecorder) RecordLeaseExpired(ctx context.Context, lease *Lease)
RecordLeaseExpired records an event when a remediation lease expires
func (*EventRecorder) RecordLeaseGranted ¶
func (r *EventRecorder) RecordLeaseGranted(ctx context.Context, lease *Lease)
RecordLeaseGranted records an event when a remediation lease is granted
func (*EventRecorder) RecordNodeHealthChange ¶
func (r *EventRecorder) RecordNodeHealthChange(ctx context.Context, nodeName string, previousHealth, currentHealth HealthStatus)
RecordNodeHealthChange records an event when a node's health changes significantly
func (*EventRecorder) RecordProblemDetected ¶
func (r *EventRecorder) RecordProblemDetected(ctx context.Context, nodeName string, problem *ProblemSummary)
RecordProblemDetected records an event when a significant problem is detected on a node
func (*EventRecorder) RecordProblemResolved ¶
func (r *EventRecorder) RecordProblemResolved(ctx context.Context, nodeName string, problemType string)
RecordProblemResolved records an event when a problem is resolved
type EventRecorderConfig ¶
type EventRecorderConfig struct {
Kubeconfig string
InCluster bool
Namespace string
Enabled bool
RateLimitPeriod time.Duration
}
EventRecorderConfig holds configuration for the EventRecorder
type HealthStatus ¶
type HealthStatus string
HealthStatus represents the overall health state
const (
HealthStatusHealthy HealthStatus = "healthy"
HealthStatusDegraded HealthStatus = "degraded"
HealthStatusCritical HealthStatus = "critical"
HealthStatusUnknown HealthStatus = "unknown"
)
type KubernetesConfig ¶
type KubernetesConfig struct {
Enabled bool `json:"enabled" yaml:"enabled"`
Kubeconfig string `json:"kubeconfig" yaml:"kubeconfig"`
InCluster bool `json:"inCluster" yaml:"inCluster"`
Namespace string `json:"namespace" yaml:"namespace"`
CreateEvents bool `json:"createEvents" yaml:"createEvents"`
}
KubernetesConfig contains Kubernetes integration settings
type Lease ¶
type Lease struct {
ID string `json:"id"`
NodeName string `json:"nodeName"`
RemediationType string `json:"remediationType"`
GrantedAt time.Time `json:"grantedAt"`
ExpiresAt time.Time `json:"expiresAt"`
CompletedAt time.Time `json:"completedAt,omitempty"`
Status string `json:"status"` // active, completed, expired, cancelled
Reason string `json:"reason,omitempty"`
}
Lease represents an active remediation lease
type LeaseRequest ¶
type LeaseRequest struct {
NodeName string `json:"node"`
RemediationType string `json:"remediation"`
RequestedDuration string `json:"requestedDuration,omitempty"` // e.g., "5m"
Reason string `json:"reason,omitempty"`
Priority int `json:"priority,omitempty"` // higher = more urgent
}
LeaseRequest represents a request for a remediation lease
type LeaseResponse ¶
type LeaseResponse struct {
LeaseID string `json:"leaseId,omitempty"`
Approved bool `json:"approved"`
ExpiresAt time.Time `json:"expiresAt,omitempty"`
Message string `json:"message,omitempty"`
RetryAt time.Time `json:"retryAt,omitempty"` // when to retry if denied
Position int `json:"position,omitempty"` // queue position if waiting
}
LeaseResponse represents the response to a lease request
type MonitorStatus ¶
type MonitorStatus struct {
Name string `json:"name"`
Type string `json:"type"`
Status HealthStatus `json:"status"`
LastRun time.Time `json:"lastRun"`
Message string `json:"message,omitempty"`
ErrorCount int `json:"errorCount,omitempty"`
}
MonitorStatus represents the status of a single monitor
type NodeCondition ¶
type NodeCondition struct {
Type string `json:"type"`
Status string `json:"status"` // True, False, Unknown
Reason string `json:"reason,omitempty"`
Message string `json:"message,omitempty"`
LastTransitionTime time.Time `json:"lastTransitionTime,omitempty"`
LastHeartbeatTime time.Time `json:"lastHeartbeatTime,omitempty"`
}
NodeCondition represents a Kubernetes-style node condition
type NodeDetail ¶
type NodeDetail struct {
NodeName string `json:"nodeName"`
NodeUID string `json:"nodeUID,omitempty"`
Health HealthStatus `json:"health"`
LastReportAt time.Time `json:"lastReportAt"`
FirstSeenAt time.Time `json:"firstSeenAt"`
ReportCount int64 `json:"reportCount"`
LatestReport *NodeReport `json:"latestReport,omitempty"`
ActiveProblems []ProblemSummary `json:"activeProblems"`
Conditions []NodeCondition `json:"conditions"`
RecentHistory []ReportSummary `json:"recentHistory,omitempty"`
}
NodeDetail provides detailed information about a specific node
type NodeReport ¶
type NodeReport struct {
// Node identification
NodeName string `json:"nodeName"`
NodeUID string `json:"nodeUID,omitempty"`
// Report metadata
Timestamp time.Time `json:"timestamp"`
ReportID string `json:"reportId,omitempty"`
Version string `json:"version,omitempty"` // node-doctor version
Uptime string `json:"uptime,omitempty"` // node-doctor uptime
ReportType string `json:"reportType,omitempty"` // "periodic", "on-demand", "startup"
// Health summary
OverallHealth HealthStatus `json:"overallHealth"`
MonitorStatuses []MonitorStatus `json:"monitorStatuses,omitempty"`
ActiveProblems []ProblemSummary `json:"activeProblems,omitempty"`
Conditions []NodeCondition `json:"conditions,omitempty"`
// Statistics
Stats *NodeStats `json:"stats,omitempty"`
}
NodeReport represents a health report from a node-doctor DaemonSet pod. This is the primary data structure sent from nodes to the controller.
type NodeStats ¶
type NodeStats struct {
StatusesProcessed int64 `json:"statusesProcessed"`
ProblemsDetected int64 `json:"problemsDetected"`
RemediationsRun int64 `json:"remediationsRun"`
MemoryUsageBytes int64 `json:"memoryUsageBytes,omitempty"`
GoroutineCount int `json:"goroutineCount,omitempty"`
CPUUsagePercent string `json:"cpuUsagePercent,omitempty"`
}
NodeStats contains statistics about the node-doctor instance
type NodeSummary ¶
type NodeSummary struct {
NodeName string `json:"nodeName"`
Health HealthStatus `json:"health"`
LastReportAt time.Time `json:"lastReportAt"`
ProblemCount int `json:"problemCount"`
ConditionCount int `json:"conditionCount"`
}
NodeSummary provides a brief overview of a node's status
type PaginatedResponse ¶
type PaginatedResponse struct {
Items interface{} `json:"items"`
TotalCount int `json:"totalCount"`
Page int `json:"page"`
PageSize int `json:"pageSize"`
HasMore bool `json:"hasMore"`
}
PaginatedResponse wraps paginated list responses
type ProblemSummary ¶
type ProblemSummary struct {
Type string `json:"type"`
Severity string `json:"severity"`
Message string `json:"message"`
Source string `json:"source"`
DetectedAt time.Time `json:"detectedAt"`
LastSeenAt time.Time `json:"lastSeenAt"`
Occurrences int `json:"occurrences,omitempty"`
}
ProblemSummary represents an active problem on the node
type PrometheusConfig ¶
type PrometheusConfig struct {
Enabled bool `json:"enabled" yaml:"enabled"`
Port int `json:"port" yaml:"port"`
Path string `json:"path" yaml:"path"`
}
PrometheusConfig contains Prometheus metrics configuration
type ReportSummary ¶
type ReportSummary struct {
ReportID string `json:"reportId"`
Timestamp time.Time `json:"timestamp"`
OverallHealth HealthStatus `json:"overallHealth"`
ProblemCount int `json:"problemCount"`
}
ReportSummary is a condensed view of a historical report
type SQLiteStorage ¶
type SQLiteStorage struct {
// contains filtered or unexported fields
}
SQLiteStorage implements Storage using SQLite
func NewSQLiteStorage ¶
func NewSQLiteStorage(config *StorageConfig) (*SQLiteStorage, error)
NewSQLiteStorage creates a new SQLite storage instance
func (*SQLiteStorage) Close ¶
func (s *SQLiteStorage) Close() error
Close closes the database connection
func (*SQLiteStorage) DeleteOldReports ¶
func (s *SQLiteStorage) DeleteOldReports(ctx context.Context, before time.Time) (int64, error)
DeleteOldReports removes reports older than the given time
func (*SQLiteStorage) ExpireLeases ¶
func (s *SQLiteStorage) ExpireLeases(ctx context.Context) (int64, error)
ExpireLeases marks expired leases as expired
func (*SQLiteStorage) GetActiveCorrelations ¶
func (s *SQLiteStorage) GetActiveCorrelations(ctx context.Context) ([]*Correlation, error)
GetActiveCorrelations returns all active correlations
func (*SQLiteStorage) GetActiveLeases ¶
func (s *SQLiteStorage) GetActiveLeases(ctx context.Context) ([]*Lease, error)
GetActiveLeases returns all active leases
func (*SQLiteStorage) GetAllLatestReports ¶
func (s *SQLiteStorage) GetAllLatestReports(ctx context.Context) (map[string]*NodeReport, error)
GetAllLatestReports returns the latest report for all nodes
func (*SQLiteStorage) GetCorrelation ¶
func (s *SQLiteStorage) GetCorrelation(ctx context.Context, id string) (*Correlation, error)
GetCorrelation returns a correlation by ID
func (*SQLiteStorage) GetLastCompletedLease ¶
func (s *SQLiteStorage) GetLastCompletedLease(ctx context.Context, nodeName string) (*Lease, error)
GetLastCompletedLease returns the most recent completed/expired lease for a node. This is used for cooldown period enforcement to prevent rapid repeated remediations.
func (*SQLiteStorage) GetLatestNodeReport ¶
func (s *SQLiteStorage) GetLatestNodeReport(ctx context.Context, nodeName string) (*NodeReport, error)
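GetLatestNodeReport returns the most recent report for a node
func (*SQLiteStorage) GetLease ¶
func (s *SQLiteStorage) GetLease(ctx context.Context, leaseID string) (*Lease, error)
GetLease returns the lease with the given ID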
func (*SQLiteStorage) GetLeaseStats ¶
func (s *SQLiteStorage) GetLeaseStats(ctx context.Context) (active, total int64, err error)
GetLeaseStats returns lease statistics
func (*SQLiteStorage) GetNodeCount ¶
func (s *SQLiteStorage) GetNodeCount(ctx context.Context) (int, error)
GetNodeCount returns the number of unique nodes
func (*SQLiteStorage) GetNodeLease ¶
func (s *SQLiteStorage) GetNodeLease(ctx context.Context, nodeName string) (*Lease, error)
GetNodeLease returns the active lease for a node, if any
func (*SQLiteStorage) GetNodeReports ¶
func (s *SQLiteStorage) GetNodeReports(ctx context.Context, nodeName string, limit int) ([]*NodeReport, error)
GetNodeReports returns recent reports for a node
func (*SQLiteStorage) GetReportCount ¶
func (s *SQLiteStorage) GetReportCount(ctx context.Context) (int64, error)
GetReportCount returns the total number of reports
func (*SQLiteStorage) Initialize ¶
func (s *SQLiteStorage) Initialize(ctx context.Context) error
Initialize opens the database and runs migrations
func (*SQLiteStorage) RunCleanup ¶
func (s *SQLiteStorage) RunCleanup(ctx context.Context) error
RunCleanup performs periodic cleanup of old data
func (*SQLiteStorage) SaveCorrelation ¶
func (s *SQLiteStorage) SaveCorrelation(ctx context.Context, correlation *Correlation) error
SaveCorrelation saves a correlation to the database
func (*SQLiteStorage) SaveLease ¶
func (s *SQLiteStorage) SaveLease(ctx context.Context, lease *Lease) error
SaveLease saves a lease to the database
func (*SQLiteStorage) SaveNodeReport ¶
func (s *SQLiteStorage) SaveNodeReport(ctx context.Context, report *NodeReport) error
SaveNodeReport saves a node report to the database
func (*SQLiteStorage) UpdateCorrelation ¶
func (s *SQLiteStorage) UpdateCorrelation(ctx context.Context, correlation *Correlation) error
UpdateCorrelation updates an existing correlation
func (*SQLiteStorage) UpdateLeaseStatus ¶
func (s *SQLiteStorage) UpdateLeaseStatus(ctx context.Context, leaseID, status string) error
UpdateLeaseStatus updates the status of a lease
type Server ¶
type Server struct {
// contains filtered or unexported fields
}
Server is the main HTTP server for the Node Doctor Controller
func NewServer ¶
func NewServer(config *ControllerConfig) (*Server, error)
NewServer creates a new controller server
func (*Server) GetStorage ¶
GetStorage returns the storage backend
func (*Server) Handler ¶
Handler returns the HTTP handler for testing purposes. This allows tests to use httptest.NewServer(server.Handler()).
func (*Server) SetStorage ¶
SetStorage sets the storage backend for the server
type ServerConfig ¶
type ServerConfig struct {
BindAddress string `json:"bindAddress" yaml:"bindAddress"`
Port int `json:"port" yaml:"port"`
ReadTimeout time.Duration `json:"readTimeout" yaml:"readTimeout"`
WriteTimeout time.Duration `json:"writeTimeout" yaml:"writeTimeout"`
EnableCORS bool `json:"enableCORS" yaml:"enableCORS"`
}
ServerConfig contains HTTP server configuration
type Storage ¶
type Storage interface {
// Initialize sets up the database and runs migrations
Initialize(ctx context.Context) error
// Close closes the database connection
Close() error
// Node Reports
SaveNodeReport(ctx context.Context, report *NodeReport) error
GetLatestNodeReport(ctx context.Context, nodeName string) (*NodeReport, error)
GetNodeReports(ctx context.Context, nodeName string, limit int) ([]*NodeReport, error)
GetAllLatestReports(ctx context.Context) (map[string]*NodeReport, error)
DeleteOldReports(ctx context.Context, before time.Time) (int64, error)
// Leases
SaveLease(ctx context.Context, lease *Lease) error
GetLease(ctx context.Context, leaseID string) (*Lease, error)
GetActiveLeases(ctx context.Context) ([]*Lease, error)
GetNodeLease(ctx context.Context, nodeName string) (*Lease, error)
GetLastCompletedLease(ctx context.Context, nodeName string) (*Lease, error)
UpdateLeaseStatus(ctx context.Context, leaseID, status string) error
ExpireLeases(ctx context.Context) (int64, error)
// Correlations
SaveCorrelation(ctx context.Context, correlation *Correlation) error
GetCorrelation(ctx context.Context, id string) (*Correlation, error)
GetActiveCorrelations(ctx context.Context) ([]*Correlation, error)
UpdateCorrelation(ctx context.Context, correlation *Correlation) error
// Statistics
GetNodeCount(ctx context.Context) (int, error)
GetReportCount(ctx context.Context) (int64, error)
GetLeaseStats(ctx context.Context) (active, total int64, err error)
// Maintenance
RunCleanup(ctx context.Context) error
}
Storage defines the interface for persistent storage
type StorageConfig ¶
type StorageConfig struct {
Path string `json:"path" yaml:"path"`
Retention time.Duration `json:"retention" yaml:"retention"` // How long to keep reports
}
StorageConfig contains SQLite storage configuration