Documentation
¶
Overview ¶
Package types defines the configuration types and the core interfaces and types for Node Doctor. It is based on the architecture.md specification.
Index ¶
- Constants
- Variables
- type APIServerLatency
- type AnnotationConfig
- type AuthConfig
- type CircuitBreakerConfig
- type Condition
- type ConditionConfig
- type ConditionStatus
- type ConfigMetadata
- type ControllerWebhookConfig
- type DNSLatency
- type Event
- type EventConfig
- type EventSeverity
- type Exporter
- type ExporterConfigs
- type ExporterReloadResult
- type ExporterReloadSummary
- type FeatureFlags
- type GatewayLatency
- type GlobalSettings
- type HTTPExporterConfig
- type KubernetesExporterConfig
- type LatencyMetrics
- type Monitor
- type MonitorConfig
- type MonitorRegistryValidator
- type MonitorRemediationConfig
- type NodeDoctorConfig
- type PeerLatency
- type Problem
- type ProblemSeverity
- type PrometheusExporterConfig
- type ReloadConfig
- type ReloadableExporter
- type RemediationConfig
- type RemediationCoordinationConfig
- type RemediationOverride
- type Remediator
- type RetryConfig
- type Status
- func (s *Status) AddCondition(condition Condition) *Status
- func (s *Status) AddEvent(event Event) *Status
- func (s *Status) ClearConditions() *Status
- func (s *Status) ClearEvents() *Status
- func (s *Status) GetLatencyMetrics() *LatencyMetrics
- func (s *Status) SetLatencyMetrics(metrics *LatencyMetrics) *Status
- func (s *Status) String() string
- func (s *Status) Validate() error
- type WebhookEndpoint
Constants ¶
const ( DefaultLogLevel = "info" DefaultLogFormat = "json" DefaultLogOutput = "stdout" DefaultUpdateInterval = "10s" DefaultResyncInterval = "60s" DefaultHeartbeatInterval = "5m" DefaultQPS = 50 DefaultBurst = 100 DefaultHTTPPort = 8080 DefaultHTTPBindAddress = "0.0.0.0" DefaultPrometheusPort = 9100 DefaultPrometheusPath = "/metrics" DefaultMonitorInterval = "30s" DefaultMonitorTimeout = "10s" DefaultCooldownPeriod = "5m" DefaultMaxAttemptsGlobal = 3 DefaultMaxRemediationsPerHour = 10 DefaultMaxRemediationsPerMinute = 2 DefaultCircuitBreakerThreshold = 5 DefaultCircuitBreakerTimeout = "30m" DefaultHistorySize = 100 MaxRecursionDepth = 10 // Maximum nesting depth for strategies MaxQPS = 10000 MaxBurst = 100000 )
Package-level defaults.
Variables ¶
var ( // Minimum interval thresholds (conservative settings to prevent system overload) MinMonitorInterval = 1 * time.Second // Minimum time between monitor polls MinHeartbeatInterval = 5 * time.Second // Minimum heartbeat check interval MinCooldownPeriod = 10 * time.Second // Minimum cooldown between remediation attempts )
Package-level variables used for validation.
Functions ¶
This section is empty.
Types ¶
type APIServerLatency ¶ added in v1.5.0
type APIServerLatency struct {
LatencyMs float64 `json:"latency_ms"`
Reachable bool `json:"reachable"`
}
APIServerLatency represents Kubernetes API server response latency.
type AnnotationConfig ¶
type AnnotationConfig struct {
Key string `json:"key" yaml:"key"`
Value string `json:"value" yaml:"value"`
}
AnnotationConfig defines a node annotation to manage.
type AuthConfig ¶
type AuthConfig struct {
Type string `json:"type" yaml:"type"` // "none", "bearer", "basic"
Token string `json:"token,omitempty" yaml:"token,omitempty"` // Bearer token
Username string `json:"username,omitempty" yaml:"username,omitempty"` // Basic auth username
Password string `json:"password,omitempty" yaml:"password,omitempty"` // Basic auth password
}
AuthConfig defines authentication configuration for webhooks.
func (*AuthConfig) Validate ¶
func (a *AuthConfig) Validate() error
Validate validates the AuthConfig configuration.
type CircuitBreakerConfig ¶
type CircuitBreakerConfig struct {
Enabled bool `json:"enabled" yaml:"enabled"`
Threshold int `json:"threshold,omitempty" yaml:"threshold,omitempty"`
TimeoutString string `json:"timeout,omitempty" yaml:"timeout,omitempty"`
Timeout time.Duration `json:"-" yaml:"-"`
SuccessThreshold int `json:"successThreshold,omitempty" yaml:"successThreshold,omitempty"`
}
CircuitBreakerConfig configures circuit breaker behavior.
type Condition ¶
type Condition struct {
// Type is the type of condition (e.g., "KubeletReady", "DiskPressure").
Type string
// Status is the current status of the condition (True, False, Unknown).
Status ConditionStatus
// Transition is when the condition last transitioned.
Transition time.Time
// Reason is a brief machine-readable string explaining the condition.
Reason string
// Message is a human-readable explanation of the condition.
Message string
}
Condition represents the current state of a monitored resource.
func NewCondition ¶
func NewCondition(conditionType string, status ConditionStatus, reason, message string) Condition
NewCondition creates a new Condition with the specified parameters. Transition time is automatically set to the current time.
type ConditionConfig ¶
type ConditionConfig struct {
Type string `json:"type" yaml:"type"`
DefaultStatus string `json:"defaultStatus,omitempty" yaml:"defaultStatus,omitempty"`
DefaultReason string `json:"defaultReason,omitempty" yaml:"defaultReason,omitempty"`
DefaultMessage string `json:"defaultMessage,omitempty" yaml:"defaultMessage,omitempty"`
}
ConditionConfig defines a custom node condition.
type ConditionStatus ¶
type ConditionStatus string
ConditionStatus represents the status of a condition.
const ( // ConditionTrue indicates the condition is true/healthy. ConditionTrue ConditionStatus = "True" // ConditionFalse indicates the condition is false/unhealthy. ConditionFalse ConditionStatus = "False" // ConditionUnknown indicates the condition status cannot be determined. ConditionUnknown ConditionStatus = "Unknown" )
type ConfigMetadata ¶
type ConfigMetadata struct {
Name string `json:"name" yaml:"name"`
Namespace string `json:"namespace,omitempty" yaml:"namespace,omitempty"`
Labels map[string]string `json:"labels,omitempty" yaml:"labels,omitempty"`
}
ConfigMetadata contains metadata about the configuration.
type ControllerWebhookConfig ¶ added in v1.6.0
type ControllerWebhookConfig struct {
// Enabled indicates whether to send reports to the controller
Enabled bool `json:"enabled" yaml:"enabled"`
// URL is the controller's report ingestion endpoint
URL string `json:"url" yaml:"url"`
// IntervalString is the interval between reports (stored as string)
IntervalString string `json:"interval,omitempty" yaml:"interval,omitempty"`
Interval time.Duration `json:"-" yaml:"-"`
// TimeoutString is the request timeout (stored as string)
TimeoutString string `json:"timeout,omitempty" yaml:"timeout,omitempty"`
Timeout time.Duration `json:"-" yaml:"-"`
// Auth configuration for authenticating with the controller
Auth AuthConfig `json:"auth,omitempty" yaml:"auth,omitempty"`
// Headers are custom headers to include in requests
Headers map[string]string `json:"headers,omitempty" yaml:"headers,omitempty"`
// Retry configuration for failed requests
Retry *RetryConfig `json:"retry,omitempty" yaml:"retry,omitempty"`
}
ControllerWebhookConfig configures the webhook for sending reports to the node-doctor controller.
func (*ControllerWebhookConfig) ApplyDefaults ¶ added in v1.6.0
func (c *ControllerWebhookConfig) ApplyDefaults(parent *HTTPExporterConfig) error
ApplyDefaults applies default values to ControllerWebhookConfig.
func (*ControllerWebhookConfig) Validate ¶ added in v1.6.0
func (c *ControllerWebhookConfig) Validate() error
Validate validates the ControllerWebhookConfig configuration.
type DNSLatency ¶ added in v1.5.0
type DNSLatency struct {
DNSServer string `json:"dns_server"`
Domain string `json:"domain"`
RecordType string `json:"record_type"`
DomainType string `json:"domain_type"` // "cluster", "external", "custom"
LatencyMs float64 `json:"latency_ms"`
Success bool `json:"success"`
}
DNSLatency represents DNS resolution latency.
type Event ¶
type Event struct {
// Severity indicates the importance of the event (Info, Warning, Error).
Severity EventSeverity
// Timestamp when the event occurred.
Timestamp time.Time
// Reason is a short, machine-readable string that describes the event.
Reason string
// Message is a human-readable description of the event.
Message string
}
Event represents a discrete occurrence detected by a monitor.
func NewEvent ¶
func NewEvent(severity EventSeverity, reason, message string) Event
NewEvent creates a new Event with the specified parameters. Timestamp is automatically set to the current time.
type EventConfig ¶
type EventConfig struct {
MaxEventsPerMinute int `json:"maxEventsPerMinute,omitempty" yaml:"maxEventsPerMinute,omitempty"`
EventTTLString string `json:"eventTTL,omitempty" yaml:"eventTTL,omitempty"`
EventTTL time.Duration `json:"-" yaml:"-"`
DeduplicationWindowString string `json:"deduplicationWindow,omitempty" yaml:"deduplicationWindow,omitempty"`
DeduplicationWindow time.Duration `json:"-" yaml:"-"`
}
EventConfig configures Kubernetes event behavior.
type EventSeverity ¶
type EventSeverity string
EventSeverity represents the severity level of an event.
const ( // EventInfo indicates an informational event with no action required. EventInfo EventSeverity = "Info" // EventWarning indicates a warning that may require attention. EventWarning EventSeverity = "Warning" // EventError indicates an error condition that requires immediate attention. EventError EventSeverity = "Error" )
type Exporter ¶
type Exporter interface {
// ExportStatus publishes a status update.
ExportStatus(ctx context.Context, status *Status) error
// ExportProblem publishes a problem report.
ExportProblem(ctx context.Context, problem *Problem) error
}
Exporter is the interface for components that export status and problems. Exporters publish information to external systems (Prometheus, Kubernetes API, logs).
type ExporterConfigs ¶
type ExporterConfigs struct {
Kubernetes *KubernetesExporterConfig `json:"kubernetes,omitempty" yaml:"kubernetes,omitempty"`
HTTP *HTTPExporterConfig `json:"http,omitempty" yaml:"http,omitempty"`
Prometheus *PrometheusExporterConfig `json:"prometheus,omitempty" yaml:"prometheus,omitempty"`
}
ExporterConfigs contains all exporter configurations.
type ExporterReloadResult ¶
type ExporterReloadResult struct {
ExporterType string // Type of exporter (e.g., "kubernetes", "http", "prometheus")
Success bool // Whether the reload was successful
Error error // Error details if reload failed
Message string // Additional information about the reload
}
ExporterReloadResult represents the result of an exporter reload operation.
type ExporterReloadSummary ¶
type ExporterReloadSummary struct {
TotalExporters int // Total number of exporters
ReloadableCount int // Number of exporters that support reload
SuccessfulReloads int // Number of successful reloads
FailedReloads int // Number of failed reloads
Results []ExporterReloadResult // Detailed results for each exporter
}
ExporterReloadSummary provides a summary of all exporter reload operations.
func (*ExporterReloadSummary) AddResult ¶
func (s *ExporterReloadSummary) AddResult(result ExporterReloadResult)
AddResult adds a reload result to the summary.
type FeatureFlags ¶
type FeatureFlags struct {
EnableMetrics bool `json:"enableMetrics,omitempty" yaml:"enableMetrics,omitempty"`
EnableProfiling bool `json:"enableProfiling,omitempty" yaml:"enableProfiling,omitempty"`
ProfilingPort int `json:"profilingPort,omitempty" yaml:"profilingPort,omitempty"`
EnableTracing bool `json:"enableTracing,omitempty" yaml:"enableTracing,omitempty"`
TracingEndpoint string `json:"tracingEndpoint,omitempty" yaml:"tracingEndpoint,omitempty"`
}
FeatureFlags contains experimental feature flags.
func (*FeatureFlags) ApplyDefaults ¶
func (f *FeatureFlags) ApplyDefaults()
ApplyDefaults applies default values to FeatureFlags.
type GatewayLatency ¶ added in v1.5.0
type GatewayLatency struct {
GatewayIP string `json:"gateway_ip"`
LatencyMs float64 `json:"latency_ms"`
AvgLatencyMs float64 `json:"avg_latency_ms"`
MaxLatencyMs float64 `json:"max_latency_ms"`
Reachable bool `json:"reachable"`
PingCount int `json:"ping_count"`
SuccessCount int `json:"success_count"`
}
GatewayLatency represents latency to the default gateway.
type GlobalSettings ¶
type GlobalSettings struct {
// NodeName is the Kubernetes node name (usually from ${NODE_NAME})
NodeName string `json:"nodeName" yaml:"nodeName"`
// Logging configuration
LogLevel string `json:"logLevel,omitempty" yaml:"logLevel,omitempty"`
LogFormat string `json:"logFormat,omitempty" yaml:"logFormat,omitempty"`
LogOutput string `json:"logOutput,omitempty" yaml:"logOutput,omitempty"`
LogFile string `json:"logFile,omitempty" yaml:"logFile,omitempty"`
// Update intervals (stored as strings, parsed to time.Duration)
UpdateIntervalString string `json:"updateInterval,omitempty" yaml:"updateInterval,omitempty"`
ResyncIntervalString string `json:"resyncInterval,omitempty" yaml:"resyncInterval,omitempty"`
HeartbeatIntervalString string `json:"heartbeatInterval,omitempty" yaml:"heartbeatInterval,omitempty"`
// Parsed duration fields (not in JSON/YAML)
UpdateInterval time.Duration `json:"-" yaml:"-"`
ResyncInterval time.Duration `json:"-" yaml:"-"`
HeartbeatInterval time.Duration `json:"-" yaml:"-"`
// Remediation master switches
EnableRemediation bool `json:"enableRemediation,omitempty" yaml:"enableRemediation,omitempty"`
DryRunMode bool `json:"dryRunMode,omitempty" yaml:"dryRunMode,omitempty"`
// Kubernetes client configuration
Kubeconfig string `json:"kubeconfig,omitempty" yaml:"kubeconfig,omitempty"`
QPS float32 `json:"qps,omitempty" yaml:"qps,omitempty"`
Burst int `json:"burst,omitempty" yaml:"burst,omitempty"`
}
GlobalSettings contains global configuration settings.
func (*GlobalSettings) ApplyDefaults ¶
func (s *GlobalSettings) ApplyDefaults() error
ApplyDefaults applies default values to GlobalSettings.
func (*GlobalSettings) SubstituteEnvVars ¶
func (s *GlobalSettings) SubstituteEnvVars()
SubstituteEnvVars performs environment variable substitution on GlobalSettings.
func (*GlobalSettings) Validate ¶
func (s *GlobalSettings) Validate() error
Validate validates the GlobalSettings configuration.
type HTTPExporterConfig ¶
type HTTPExporterConfig struct {
Enabled bool `json:"enabled" yaml:"enabled"`
Webhooks []WebhookEndpoint `json:"webhooks,omitempty" yaml:"webhooks,omitempty"`
Workers int `json:"workers,omitempty" yaml:"workers,omitempty"`
QueueSize int `json:"queueSize,omitempty" yaml:"queueSize,omitempty"`
// Default timeout for all webhooks (can be overridden per webhook)
TimeoutString string `json:"timeout,omitempty" yaml:"timeout,omitempty"`
Timeout time.Duration `json:"-" yaml:"-"`
// Default retry configuration for all webhooks (can be overridden per webhook)
Retry RetryConfig `json:"retry,omitempty" yaml:"retry,omitempty"`
Headers map[string]string `json:"headers,omitempty" yaml:"headers,omitempty"`
// Controller webhook for sending aggregated reports to the controller
Controller *ControllerWebhookConfig `json:"controller,omitempty" yaml:"controller,omitempty"`
}
HTTPExporterConfig configures the HTTP webhook exporter.
func (*HTTPExporterConfig) ApplyDefaults ¶
func (h *HTTPExporterConfig) ApplyDefaults() error
ApplyDefaults applies default values to HTTPExporterConfig.
func (*HTTPExporterConfig) SubstituteEnvVars ¶
func (h *HTTPExporterConfig) SubstituteEnvVars()
SubstituteEnvVars performs environment variable substitution on HTTPExporterConfig.
func (*HTTPExporterConfig) Validate ¶
func (h *HTTPExporterConfig) Validate() error
Validate validates the HTTPExporterConfig configuration.
type KubernetesExporterConfig ¶
type KubernetesExporterConfig struct {
Enabled bool `json:"enabled" yaml:"enabled"`
// Update intervals
UpdateIntervalString string `json:"updateInterval,omitempty" yaml:"updateInterval,omitempty"`
ResyncIntervalString string `json:"resyncInterval,omitempty" yaml:"resyncInterval,omitempty"`
HeartbeatIntervalString string `json:"heartbeatInterval,omitempty" yaml:"heartbeatInterval,omitempty"`
UpdateInterval time.Duration `json:"-" yaml:"-"`
ResyncInterval time.Duration `json:"-" yaml:"-"`
HeartbeatInterval time.Duration `json:"-" yaml:"-"`
// Namespace for events
Namespace string `json:"namespace,omitempty" yaml:"namespace,omitempty"`
// Custom node conditions
Conditions []ConditionConfig `json:"conditions,omitempty" yaml:"conditions,omitempty"`
// Node annotations to manage
Annotations []AnnotationConfig `json:"annotations,omitempty" yaml:"annotations,omitempty"`
// Event configuration
Events EventConfig `json:"events,omitempty" yaml:"events,omitempty"`
}
KubernetesExporterConfig configures the Kubernetes exporter.
func (*KubernetesExporterConfig) ApplyDefaults ¶
func (k *KubernetesExporterConfig) ApplyDefaults() error
ApplyDefaults applies default values to KubernetesExporterConfig.
func (*KubernetesExporterConfig) SubstituteEnvVars ¶
func (k *KubernetesExporterConfig) SubstituteEnvVars()
SubstituteEnvVars performs environment variable substitution on KubernetesExporterConfig.
func (*KubernetesExporterConfig) Validate ¶
func (k *KubernetesExporterConfig) Validate() error
Validate validates the KubernetesExporterConfig configuration.
type LatencyMetrics ¶ added in v1.5.0
type LatencyMetrics struct {
// Gateway latency metrics
Gateway *GatewayLatency `json:"gateway,omitempty"`
// Peer latency metrics (CNI/cross-node connectivity)
Peers []PeerLatency `json:"peers,omitempty"`
// DNS latency metrics
DNS []DNSLatency `json:"dns,omitempty"`
// API server latency
APIServer *APIServerLatency `json:"apiserver,omitempty"`
}
LatencyMetrics contains network latency measurements for Prometheus export. Monitors should populate this in Status.Metadata["latency_metrics"].
type Monitor ¶
type Monitor interface {
// Start begins the monitoring process and returns a channel for status updates.
// The monitor runs asynchronously and sends Status updates through the channel.
Start() (<-chan *Status, error)
// Stop gracefully stops the monitor.
Stop()
}
Monitor is the interface that all monitors must implement. Monitors detect problems on the node and report them via a channel.
type MonitorConfig ¶
type MonitorConfig struct {
// Name is the unique identifier for this monitor
Name string `json:"name" yaml:"name"`
// Type is the monitor type (e.g., "system-disk-check")
Type string `json:"type" yaml:"type"`
// Enabled indicates whether this monitor is active
Enabled bool `json:"enabled" yaml:"enabled"`
// Interval and timeout (stored as strings)
IntervalString string `json:"interval,omitempty" yaml:"interval,omitempty"`
TimeoutString string `json:"timeout,omitempty" yaml:"timeout,omitempty"`
// Parsed duration fields
Interval time.Duration `json:"-" yaml:"-"`
Timeout time.Duration `json:"-" yaml:"-"`
// Config contains monitor-specific configuration as a map
// Each monitor type will parse this according to its needs
Config map[string]interface{} `json:"config,omitempty" yaml:"config,omitempty"`
// Remediation contains optional remediation configuration for this monitor
Remediation *MonitorRemediationConfig `json:"remediation,omitempty" yaml:"remediation,omitempty"`
// DependsOn specifies monitors that must complete successfully before this monitor starts
// Used for dependency ordering and circular dependency detection during validation
DependsOn []string `json:"dependsOn,omitempty" yaml:"dependsOn,omitempty"`
}
MonitorConfig represents a single monitor configuration.
func (*MonitorConfig) ApplyDefaults ¶
func (m *MonitorConfig) ApplyDefaults() error
ApplyDefaults applies default values to MonitorConfig.
func (*MonitorConfig) SubstituteEnvVars ¶
func (m *MonitorConfig) SubstituteEnvVars()
SubstituteEnvVars performs environment variable substitution on MonitorConfig.
func (*MonitorConfig) Validate ¶
func (m *MonitorConfig) Validate() error
Validate validates the MonitorConfig configuration.
type MonitorRegistryValidator ¶
type MonitorRegistryValidator interface {
// IsRegistered returns true if the given monitor type is registered
IsRegistered(monitorType string) bool
// GetRegisteredTypes returns a sorted list of all registered monitor types
GetRegisteredTypes() []string
}
MonitorRegistryValidator provides an interface for validating monitor types without creating an import cycle between the config and monitors packages. This interface is implemented by monitors.Registry.
type MonitorRemediationConfig ¶
type MonitorRemediationConfig struct {
// Enabled indicates whether remediation is enabled for this monitor
Enabled bool `json:"enabled" yaml:"enabled"`
// Strategy is the remediation strategy type
Strategy string `json:"strategy,omitempty" yaml:"strategy,omitempty"`
// Action is the specific action to take
Action string `json:"action,omitempty" yaml:"action,omitempty"`
// Service is the systemd service name (for systemd-restart strategy)
Service string `json:"service,omitempty" yaml:"service,omitempty"`
// ScriptPath is the path to remediation script (for custom-script strategy)
ScriptPath string `json:"scriptPath,omitempty" yaml:"scriptPath,omitempty"`
// Args are arguments to pass to the script
Args []string `json:"args,omitempty" yaml:"args,omitempty"`
// Cooldown period (stored as string)
CooldownString string `json:"cooldown,omitempty" yaml:"cooldown,omitempty"`
Cooldown time.Duration `json:"-" yaml:"-"`
// MaxAttempts is the maximum remediation attempts
MaxAttempts int `json:"maxAttempts,omitempty" yaml:"maxAttempts,omitempty"`
// Priority for multiple remediation strategies
Priority int `json:"priority,omitempty" yaml:"priority,omitempty"`
// GracefulStop indicates whether to stop gracefully
GracefulStop bool `json:"gracefulStop,omitempty" yaml:"gracefulStop,omitempty"`
// WaitTimeout for graceful stop (stored as string)
WaitTimeoutString string `json:"waitTimeout,omitempty" yaml:"waitTimeout,omitempty"`
WaitTimeout time.Duration `json:"-" yaml:"-"`
// Additional strategies for multi-step remediation
Strategies []MonitorRemediationConfig `json:"strategies,omitempty" yaml:"strategies,omitempty"`
}
MonitorRemediationConfig contains remediation settings for a monitor.
func (*MonitorRemediationConfig) ApplyDefaults ¶
func (r *MonitorRemediationConfig) ApplyDefaults() error
ApplyDefaults applies default values to MonitorRemediationConfig.
func (*MonitorRemediationConfig) SubstituteEnvVars ¶
func (r *MonitorRemediationConfig) SubstituteEnvVars()
SubstituteEnvVars performs environment variable substitution on MonitorRemediationConfig.
func (*MonitorRemediationConfig) Validate ¶
func (r *MonitorRemediationConfig) Validate() error
Validate validates the MonitorRemediationConfig configuration.
type NodeDoctorConfig ¶
type NodeDoctorConfig struct {
// APIVersion of the configuration schema
APIVersion string `json:"apiVersion" yaml:"apiVersion"`
// Kind of resource (always "NodeDoctorConfig")
Kind string `json:"kind" yaml:"kind"`
// Metadata contains name, namespace, labels, etc.
Metadata ConfigMetadata `json:"metadata" yaml:"metadata"`
// Settings contains global configuration
Settings GlobalSettings `json:"settings" yaml:"settings"`
// Monitors contains all monitor configurations
Monitors []MonitorConfig `json:"monitors" yaml:"monitors"`
// Exporters contains exporter configurations
Exporters ExporterConfigs `json:"exporters" yaml:"exporters"`
// Remediation contains global remediation settings
Remediation RemediationConfig `json:"remediation" yaml:"remediation"`
// Features contains feature flags
Features FeatureFlags `json:"features,omitempty" yaml:"features,omitempty"`
// Reload contains configuration hot reload settings
Reload ReloadConfig `json:"reload,omitempty" yaml:"reload,omitempty"`
}
NodeDoctorConfig is the top-level configuration structure.
func (*NodeDoctorConfig) ApplyDefaults ¶
func (c *NodeDoctorConfig) ApplyDefaults() error
ApplyDefaults applies default values to the configuration.
func (*NodeDoctorConfig) SubstituteEnvVars ¶
func (c *NodeDoctorConfig) SubstituteEnvVars()
SubstituteEnvVars performs environment variable substitution on the configuration.
func (*NodeDoctorConfig) Validate ¶
func (c *NodeDoctorConfig) Validate() error
Validate validates the entire configuration.
func (*NodeDoctorConfig) ValidateWithRegistry ¶
func (c *NodeDoctorConfig) ValidateWithRegistry(registry MonitorRegistryValidator) error
ValidateWithRegistry validates the entire configuration including monitor type registration. This method should be called instead of Validate() when a monitor registry is available, as it performs additional validation that requires checking against registered monitor types.
type PeerLatency ¶ added in v1.5.0
type PeerLatency struct {
PeerNode string `json:"peer_node"`
PeerIP string `json:"peer_ip"`
LatencyMs float64 `json:"latency_ms"`
AvgLatencyMs float64 `json:"avg_latency_ms"`
Reachable bool `json:"reachable"`
}
PeerLatency represents latency to a peer node.
type Problem ¶
type Problem struct {
// Type categorizes the problem (e.g., "systemd-service-failed").
Type string
// Resource identifies the affected resource (e.g., "kubelet.service").
Resource string
// Severity indicates how critical the problem is.
Severity ProblemSeverity
// Message describes the problem in detail.
Message string
// DetectedAt is when the problem was first detected.
DetectedAt time.Time
// Metadata contains additional context about the problem.
Metadata map[string]string
}
Problem represents an issue detected that may require remediation.
func NewProblem ¶
func NewProblem(problemType, resource string, severity ProblemSeverity, message string) *Problem
NewProblem creates a new Problem with the specified parameters. The DetectedAt time is automatically set to the current time, and the Metadata map is initialized as empty.
func (*Problem) GetMetadata ¶
GetMetadata retrieves a metadata value by key from the Problem. Returns the value and true if found, empty string and false otherwise. If the Problem pointer is nil, returns empty string and false.
func (*Problem) Validate ¶
Validate checks if the Problem has all required fields populated. Returns an error if any required field is missing or invalid.
func (*Problem) WithMetadata ¶
WithMetadata adds a metadata key-value pair to the Problem. Returns the Problem pointer for method chaining. If the Problem pointer is nil, this is a no-op and returns nil.
type ProblemSeverity ¶
type ProblemSeverity string
ProblemSeverity represents the severity level of a problem.
const ( // ProblemInfo indicates an informational problem with no immediate impact. ProblemInfo ProblemSeverity = "Info" // ProblemWarning indicates a problem that may impact node health if not addressed. ProblemWarning ProblemSeverity = "Warning" // ProblemCritical indicates a critical problem requiring immediate remediation. ProblemCritical ProblemSeverity = "Critical" )
type PrometheusExporterConfig ¶
type PrometheusExporterConfig struct {
Enabled bool `json:"enabled" yaml:"enabled"`
Port int `json:"port,omitempty" yaml:"port,omitempty"`
Path string `json:"path,omitempty" yaml:"path,omitempty"`
Namespace string `json:"namespace,omitempty" yaml:"namespace,omitempty"`
Subsystem string `json:"subsystem,omitempty" yaml:"subsystem,omitempty"`
Labels map[string]string `json:"labels,omitempty" yaml:"labels,omitempty"`
}
PrometheusExporterConfig configures the Prometheus exporter.
func (*PrometheusExporterConfig) ApplyDefaults ¶
func (p *PrometheusExporterConfig) ApplyDefaults() error
ApplyDefaults applies default values to PrometheusExporterConfig.
func (*PrometheusExporterConfig) SubstituteEnvVars ¶
func (p *PrometheusExporterConfig) SubstituteEnvVars()
SubstituteEnvVars performs environment variable substitution on PrometheusExporterConfig.
func (*PrometheusExporterConfig) Validate ¶
func (p *PrometheusExporterConfig) Validate() error
Validate validates the PrometheusExporterConfig configuration.
type ReloadConfig ¶
type ReloadConfig struct {
// Enabled indicates whether hot reload is enabled
Enabled bool `json:"enabled" yaml:"enabled"`
// DebounceIntervalString is the debounce interval as a string (e.g., "500ms")
DebounceIntervalString string `json:"debounceInterval,omitempty" yaml:"debounceInterval,omitempty"`
// DebounceInterval is the parsed debounce duration
DebounceInterval time.Duration `json:"-" yaml:"-"`
}
ReloadConfig contains configuration hot reload settings.
func (*ReloadConfig) ApplyDefaults ¶
func (r *ReloadConfig) ApplyDefaults() error
ApplyDefaults applies default values to reload configuration.
type ReloadableExporter ¶
type ReloadableExporter interface {
Exporter
// Reload updates the exporter configuration without restarting the exporter.
// The config parameter should be the exporter-specific configuration struct.
// Returns an error if the reload fails or if the configuration is invalid.
Reload(config interface{}) error
// IsReloadable returns true if this exporter supports configuration reload.
// This is primarily used for runtime checks and debugging.
IsReloadable() bool
}
ReloadableExporter extends the basic Exporter interface with reload capability. Exporters that implement this interface can update their configuration without requiring a full restart, enabling hot reload of exporter settings.
type RemediationConfig ¶
type RemediationConfig struct {
// Master switches
Enabled bool `json:"enabled" yaml:"enabled"`
DryRun bool `json:"dryRun,omitempty" yaml:"dryRun,omitempty"`
// Safety limits
MaxRemediationsPerHour int `json:"maxRemediationsPerHour,omitempty" yaml:"maxRemediationsPerHour,omitempty"`
MaxRemediationsPerMinute int `json:"maxRemediationsPerMinute,omitempty" yaml:"maxRemediationsPerMinute,omitempty"`
// Cooldown configuration
CooldownPeriodString string `json:"cooldownPeriod,omitempty" yaml:"cooldownPeriod,omitempty"`
CooldownPeriod time.Duration `json:"-" yaml:"-"`
// Global max attempts
MaxAttemptsGlobal int `json:"maxAttemptsGlobal,omitempty" yaml:"maxAttemptsGlobal,omitempty"`
// Circuit breaker settings
CircuitBreaker CircuitBreakerConfig `json:"circuitBreaker,omitempty" yaml:"circuitBreaker,omitempty"`
// History configuration
HistorySize int `json:"historySize,omitempty" yaml:"historySize,omitempty"`
// Problem-specific overrides
Overrides []RemediationOverride `json:"overrides,omitempty" yaml:"overrides,omitempty"`
// Coordination with controller for cluster-wide remediation safety
Coordination *RemediationCoordinationConfig `json:"coordination,omitempty" yaml:"coordination,omitempty"`
}
RemediationConfig contains global remediation settings.
func (*RemediationConfig) ApplyDefaults ¶
func (r *RemediationConfig) ApplyDefaults() error
ApplyDefaults applies default values to RemediationConfig.
func (*RemediationConfig) SubstituteEnvVars ¶
func (r *RemediationConfig) SubstituteEnvVars()
SubstituteEnvVars performs environment variable substitution on RemediationConfig.
func (*RemediationConfig) Validate ¶
func (r *RemediationConfig) Validate() error
Validate validates the RemediationConfig configuration.
type RemediationCoordinationConfig ¶ added in v1.6.0
// RemediationCoordinationConfig configures coordination with the controller
// for remediation leases.
//
// Each duration setting appears twice: a *String field that is serialized
// under the JSON/YAML key, and a time.Duration field tagged "-" that is never
// serialized.
// NOTE(review): the time.Duration fields are presumably filled in by parsing
// the matching *String field in ApplyDefaults/Validate — confirm against the
// implementation.
type RemediationCoordinationConfig struct {
// Enabled indicates whether to coordinate remediations with the controller.
Enabled bool `json:"enabled" yaml:"enabled"`
// ControllerURL is the URL of the node-doctor controller.
ControllerURL string `json:"controllerURL" yaml:"controllerURL"`
// LeaseTimeoutString is the requested duration for remediation leases (stored as string).
LeaseTimeoutString string `json:"leaseTimeout,omitempty" yaml:"leaseTimeout,omitempty"`
// LeaseTimeout is the time.Duration counterpart of LeaseTimeoutString; it is
// excluded from JSON and YAML.
LeaseTimeout time.Duration `json:"-" yaml:"-"`
// RequestTimeoutString is the timeout for lease requests to the controller (stored as string).
RequestTimeoutString string `json:"requestTimeout,omitempty" yaml:"requestTimeout,omitempty"`
// RequestTimeout is the time.Duration counterpart of RequestTimeoutString; it
// is excluded from JSON and YAML.
RequestTimeout time.Duration `json:"-" yaml:"-"`
// FallbackOnUnreachable determines behavior when the controller is unreachable.
// If true, proceed with remediation; if false, block and wait for the controller.
FallbackOnUnreachable bool `json:"fallbackOnUnreachable,omitempty" yaml:"fallbackOnUnreachable,omitempty"`
// MaxRetries is the maximum number of lease request retries.
MaxRetries int `json:"maxRetries,omitempty" yaml:"maxRetries,omitempty"`
// RetryIntervalString is the interval between lease request retries (stored as string).
RetryIntervalString string `json:"retryInterval,omitempty" yaml:"retryInterval,omitempty"`
// RetryInterval is the time.Duration counterpart of RetryIntervalString; it is
// excluded from JSON and YAML.
RetryInterval time.Duration `json:"-" yaml:"-"`
}
RemediationCoordinationConfig configures coordination with the controller for remediation leases.
func (*RemediationCoordinationConfig) ApplyDefaults ¶ added in v1.6.0
func (c *RemediationCoordinationConfig) ApplyDefaults() error
ApplyDefaults applies default values to RemediationCoordinationConfig.
func (*RemediationCoordinationConfig) Validate ¶ added in v1.6.0
func (c *RemediationCoordinationConfig) Validate() error
Validate validates the RemediationCoordinationConfig configuration.
type RemediationOverride ¶
// RemediationOverride allows problem-specific remediation overrides.
type RemediationOverride struct {
// Problem selects the problem this override applies to.
// NOTE(review): how the string is matched against detected problems is not
// visible here — confirm against the remediation code.
Problem string `json:"problem" yaml:"problem"`
// CooldownString is the override cooldown, serialized under the "cooldown" key.
CooldownString string `json:"cooldown,omitempty" yaml:"cooldown,omitempty"`
// Cooldown is the time.Duration counterpart of CooldownString; it is excluded
// from JSON and YAML. Presumably parsed from CooldownString — TODO confirm.
Cooldown time.Duration `json:"-" yaml:"-"`
// MaxAttempts overrides the maximum number of remediation attempts for this
// problem. Presumably zero means "use the global setting" — TODO confirm.
MaxAttempts int `json:"maxAttempts,omitempty" yaml:"maxAttempts,omitempty"`
// CircuitBreakerThreshold overrides the circuit breaker threshold for this
// problem. Presumably zero means "use the global setting" — TODO confirm.
CircuitBreakerThreshold int `json:"circuitBreakerThreshold,omitempty" yaml:"circuitBreakerThreshold,omitempty"`
}
RemediationOverride allows problem-specific remediation overrides.
type Remediator ¶
// Remediator is the interface for components that can fix problems.
type Remediator interface {
// CanRemediate returns true if this remediator can handle the given problem.
CanRemediate(problem Problem) bool
// Remediate attempts to fix the problem.
// Returns an error if remediation fails or is not allowed (cooldown, rate limit, etc.).
// NOTE(review): implementations presumably honor cancellation of ctx; the
// interface itself does not state this — confirm per implementation.
Remediate(ctx context.Context, problem Problem) error
// GetCooldown returns the minimum time between remediation attempts for this remediator.
GetCooldown() time.Duration
}
Remediator is the interface for components that can fix problems.
type RetryConfig ¶
// RetryConfig defines retry behavior for webhook calls.
type RetryConfig struct {
// MaxAttempts bounds the number of attempts.
// NOTE(review): whether this counts the initial call or only the retries is
// not visible here — confirm against the HTTP exporter.
MaxAttempts int `json:"maxAttempts,omitempty" yaml:"maxAttempts,omitempty"`
// Base delay between retries (stored as string)
BaseDelayString string `json:"baseDelay,omitempty" yaml:"baseDelay,omitempty"`
// BaseDelay is the time.Duration counterpart of BaseDelayString; it is
// excluded from JSON and YAML. Presumably parsed from BaseDelayString — TODO confirm.
BaseDelay time.Duration `json:"-" yaml:"-"`
// Maximum delay between retries (stored as string)
MaxDelayString string `json:"maxDelay,omitempty" yaml:"maxDelay,omitempty"`
// MaxDelay is the time.Duration counterpart of MaxDelayString; it is excluded
// from JSON and YAML. Presumably parsed from MaxDelayString — TODO confirm.
MaxDelay time.Duration `json:"-" yaml:"-"`
}
RetryConfig defines retry behavior for webhook calls.
func (*RetryConfig) Validate ¶
func (r *RetryConfig) Validate() error
Validate validates the RetryConfig configuration.
type Status ¶
// Status represents the current state reported by a monitor.
//
// NOTE(review): Metadata is the only field with a struct tag. The other fields
// carry none, so encoding/json emits them under their Go field names (Source,
// Events, Conditions, Timestamp) while Metadata is emitted as "metadata".
// Confirm whether that mix is intentional before adding tags, since doing so
// changes the serialized form.
type Status struct {
// Source identifies the monitor that generated this status.
Source string
// Events are notable occurrences detected by the monitor.
Events []Event
// Conditions represent the current state of the monitored resource.
Conditions []Condition
// Timestamp when this status was generated.
Timestamp time.Time
// Metadata holds monitor-specific observability data (metrics, diagnostics, etc.)
// Latency metrics are kept here as well; use GetLatencyMetrics and
// SetLatencyMetrics rather than accessing the map directly.
Metadata map[string]interface{} `json:"metadata,omitempty"`
}
Status represents the current state reported by a monitor.
func NewStatus ¶
NewStatus creates a new Status with the specified source. Timestamp is automatically set to the current time. Events and Conditions slices are initialized as empty.
func (*Status) AddCondition ¶
AddCondition adds a condition to the Status. Returns the Status pointer for method chaining.
func (*Status) AddEvent ¶
AddEvent adds an event to the Status. Returns the Status pointer for method chaining.
func (*Status) ClearConditions ¶
ClearConditions removes all conditions from the Status. Returns the Status pointer for method chaining.
func (*Status) ClearEvents ¶
ClearEvents removes all events from the Status. Returns the Status pointer for method chaining.
func (*Status) GetLatencyMetrics ¶ added in v1.5.0
func (s *Status) GetLatencyMetrics() *LatencyMetrics
GetLatencyMetrics retrieves the latency metrics stored in Status.Metadata. It returns nil if no latency metrics are set or if the type assertion on the stored value fails.
func (*Status) SetLatencyMetrics ¶ added in v1.5.0
func (s *Status) SetLatencyMetrics(metrics *LatencyMetrics) *Status
SetLatencyMetrics is a helper to set latency metrics in Status.Metadata.
type WebhookEndpoint ¶
// WebhookEndpoint defines a webhook destination for HTTP exports.
type WebhookEndpoint struct {
// Name is the name of this webhook endpoint.
Name string `json:"name" yaml:"name"`
// URL is the destination URL of the webhook.
URL string `json:"url" yaml:"url"`
// Auth holds the authentication settings for this webhook (see AuthConfig).
Auth AuthConfig `json:"auth,omitempty" yaml:"auth,omitempty"`
// Per-webhook timeout (overrides default)
TimeoutString string `json:"timeout,omitempty" yaml:"timeout,omitempty"`
// Timeout is the time.Duration counterpart of TimeoutString; it is excluded
// from JSON and YAML. Presumably parsed from TimeoutString — TODO confirm.
Timeout time.Duration `json:"-" yaml:"-"`
// Per-webhook retry config (overrides default)
Retry *RetryConfig `json:"retry,omitempty" yaml:"retry,omitempty"`
// Per-webhook headers (merged with default headers)
Headers map[string]string `json:"headers,omitempty" yaml:"headers,omitempty"`
// Control what gets sent to this webhook
// SendStatus presumably enables sending Status payloads — TODO confirm.
SendStatus bool `json:"sendStatus,omitempty" yaml:"sendStatus,omitempty"`
// SendProblems presumably enables sending Problem payloads — TODO confirm.
SendProblems bool `json:"sendProblems,omitempty" yaml:"sendProblems,omitempty"`
}
WebhookEndpoint defines a webhook destination for HTTP exports.
func (*WebhookEndpoint) ApplyDefaults ¶
func (w *WebhookEndpoint) ApplyDefaults(parent *HTTPExporterConfig) error
ApplyDefaults applies default values to WebhookEndpoint.
func (*WebhookEndpoint) SubstituteEnvVars ¶
func (w *WebhookEndpoint) SubstituteEnvVars()
SubstituteEnvVars performs environment variable substitution on WebhookEndpoint.
func (*WebhookEndpoint) Validate ¶
func (w *WebhookEndpoint) Validate() error
Validate validates the WebhookEndpoint configuration.