Files
nadir-agent/internal/modules/networking/rollback.go
T
2026-06-24 17:29:45 +02:00

202 lines
6.8 KiB
Go

package networking
import (
"context"
"errors"
"fmt"
"log"
"time"
)
// defaultRollbackSeconds is the auto-revert timeout, in seconds, that
// armPending falls back to when the caller passes a value <= 0.
const defaultRollbackSeconds = 120
// errAlreadyPending is the sentinel returned when another change is awaiting
// confirmation. errPending wraps it with %w, so callers can match it with
// errors.Is. The write handlers map this to 409 Conflict.
var errAlreadyPending = errors.New("already pending")
// errPending produces the conflict error reported while the interface named by
// iface still holds the pending-change lock. The result wraps
// errAlreadyPending, so errors.Is(err, errAlreadyPending) matches it. Because
// the lock spans every interface, the text spells that out for users who are
// editing a different interface than the one holding the lock.
func errPending(iface string) error {
	const msg = "%w: a change to %s is awaiting confirmation. This is a global lock across all interfaces — confirm or roll that change back first"
	return fmt.Errorf(msg, errAlreadyPending, iface)
}
// startRollback snapshots the current state of iface, applies cfg, and arms a
// timer that auto-reverts to the snapshot unless the change is confirmed.
//
// It returns the armed timeout in seconds on success. On failure it returns 0
// and either an error wrapping errAlreadyPending (409) when another change is
// in flight, or a wrapped snapshot/apply error (500).
//
// Snapshot and Apply run WITHOUT the mutex held, so they don't block reads or
// the pending-status endpoint while shelling out to nmcli/networkctl/ifup.
func (m *Module) startRollback(ctx context.Context, iface string, cfg IfaceConfig) (int, error) {
	// Fast pre-check so we don't snapshot/apply when something is already
	// pending. armPending re-checks under the lock to close the race.
	if err := m.checkNoPending(); err != nil {
		return 0, err
	}

	prior, err := m.be.Snapshot(ctx, iface)
	if err != nil {
		return 0, fmt.Errorf("snapshot %s: %w", iface, err)
	}

	if err := m.be.Apply(ctx, iface, cfg); err != nil {
		// Apply is not atomic: nmcli `con modify` may succeed before `con up`
		// fails, and networkd writes the .network file before `reconfigure`
		// runs. A failed Apply can therefore leave a half-applied config that
		// would otherwise have NO auto-revert (we bail before arming the timer).
		// Best-effort restore the snapshot so we never leave that unprotected.
		//
		// The restore deliberately does NOT reuse ctx: if Apply failed because
		// the request context was cancelled or timed out, a restore bound to
		// the same context would fail immediately and leave the half-applied
		// config in place.
		if rerr := m.be.Apply(context.Background(), iface, prior); rerr != nil {
			log.Printf("networking: apply %s failed and restore also failed: %v", iface, rerr)
		}
		return 0, fmt.Errorf("apply %s: %w", iface, err)
	}

	// The revert runs from the timer or an explicit rollback, possibly with no
	// client attached, so it uses context.Background() rather than ctx.
	revert := func() error {
		return m.be.Apply(context.Background(), iface, prior)
	}
	return m.armPending(iface, revert, cfg.RollbackSeconds)
}
// startLinkDown brings iface down under the same confirm-or-revert protection
// used for config changes: unless the change is confirmed, the link is brought
// back up. Downing a remote interface can lock you out just as easily as a bad
// static config.
//
// Link-up cannot lock you out, so the handler calls it directly without this
// wrapper.
//
// Returns the armed timeout in seconds, or 0 and an error if a change is
// already pending or the link could not be taken down.
func (m *Module) startLinkDown(ctx context.Context, iface string) (int, error) {
	if pendingErr := m.checkNoPending(); pendingErr != nil {
		return 0, pendingErr
	}

	downErr := m.be.SetLinkDown(ctx, iface)
	if downErr != nil {
		return 0, fmt.Errorf("link down %s: %w", iface, downErr)
	}

	// The undo step may be triggered by the timer with no client attached, so
	// it runs on a background context instead of ctx.
	bringUp := func() error {
		return m.be.SetLinkUp(context.Background(), iface)
	}
	// Passing 0 selects the default rollback timeout.
	return m.armPending(iface, bringUp, 0)
}
// checkNoPending returns the 409 conflict error when some change is still
// awaiting confirmation, and nil otherwise. It takes the module mutex for the
// duration of the check.
func (m *Module) checkNoPending() error {
	m.mu.Lock()
	defer m.mu.Unlock()

	if holder := m.pending; holder != nil {
		return errPending(holder.Iface)
	}
	return nil
}
// armPending records an already-applied change as pending and schedules its
// automatic revert. revert is the closure that undoes the change; it runs when
// the timer expires, on an explicit rollback, or right here if another change
// slipped in between the caller's pre-check and this call (the change is then
// undone immediately and the conflict error is returned).
//
// A seconds value <= 0 selects defaultRollbackSeconds. On success the
// effective timeout in seconds is returned.
func (m *Module) armPending(iface string, revert func() error, seconds int) (int, error) {
	if seconds <= 0 {
		seconds = defaultRollbackSeconds
	}
	timeout := time.Duration(seconds) * time.Second

	m.mu.Lock()
	defer m.mu.Unlock()

	if holder := m.pending; holder != nil {
		// Somebody else armed first: back out our change and surface the 409.
		if undoErr := revert(); undoErr != nil {
			log.Printf("networking: failed to undo raced change on %s: %v", iface, undoErr)
		}
		return 0, errPending(holder.Iface)
	}

	change := &pendingChange{
		Iface:    iface,
		revert:   revert,
		Deadline: time.Now().Add(timeout),
	}

	// onExpiry is the auto-revert. It closes over m and change so the revert
	// still happens on an otherwise idle server, which is the whole point of
	// the safety net for remote boxes.
	onExpiry := func() {
		// Validate under the lock, but run the revert outside it so a slow
		// nmcli/networkctl call cannot stall the rest of the module.
		m.mu.Lock()
		if m.pending != change {
			m.mu.Unlock()
			return
		}
		name, undo := change.Iface, change.revert
		m.mu.Unlock()

		log.Printf("networking: rollback timer expired for %s — reverting", name)
		if undoErr := undo(); undoErr != nil {
			log.Printf("networking: auto-rollback of %s failed: %v", name, undoErr)
		}

		// Release the global lock only if this change is still the holder.
		m.mu.Lock()
		if m.pending == change {
			m.pending = nil
		}
		m.mu.Unlock()
	}

	change.Timer = time.AfterFunc(timeout, onExpiry)
	m.pending = change
	return seconds, nil
}
// confirm cancels the rollback timer and clears the pending change, making it
// permanent.
//
// It returns an error if there is no pending change, if the pending change is
// for a different interface, or if the rollback timer has already fired. In
// the last case the timer callback is about to revert (or is already
// reverting) the change, so reporting success would be wrong; the pending
// entry is left in place for the callback to clear once the revert finishes.
func (m *Module) confirm(iface string) error {
	m.mu.Lock()
	defer m.mu.Unlock()

	pc := m.pending
	if pc == nil {
		return fmt.Errorf("no pending change to confirm")
	}
	if pc.Iface != iface {
		return fmt.Errorf("pending change is for %s, not %s", pc.Iface, iface)
	}

	// For an AfterFunc timer, Stop returning false means the callback has
	// already been started. It may be blocked on m.mu or in the middle of the
	// revert with the lock released; either way the change is being rolled
	// back and can no longer be confirmed.
	if !pc.Timer.Stop() {
		return fmt.Errorf("pending change to %s has already expired and is being rolled back", iface)
	}

	m.pending = nil
	return nil
}
// rollbackNow immediately reverts the pending change and clears it.
//
// It returns an error if there is no pending change, if the pending change is
// for a different interface, or if the revert itself fails (the pending entry
// is cleared even then). If the rollback timer has already fired, the timer
// callback owns the revert: rollbackNow returns nil without reverting a second
// time and leaves the pending entry for the callback to clear.
//
// The revert runs with the module mutex held, so other operations on the
// module wait until it completes.
func (m *Module) rollbackNow(iface string) error {
	m.mu.Lock()
	defer m.mu.Unlock()

	pc := m.pending
	if pc == nil {
		return fmt.Errorf("no pending change to rollback")
	}
	if pc.Iface != iface {
		return fmt.Errorf("pending change is for %s, not %s", pc.Iface, iface)
	}

	// For an AfterFunc timer, Stop returning false means the callback has
	// already been started and will perform (or is performing) the revert.
	// Running revert here as well would execute it twice, possibly
	// concurrently, so leave it to the callback.
	if !pc.Timer.Stop() {
		return nil
	}

	err := pc.revert()
	m.pending = nil
	if err != nil {
		return fmt.Errorf("rollback %s: %w", iface, err)
	}
	return nil
}
// PendingInfo is the JSON body returned by the pending-change status endpoint.
// Values are produced by (*Module).pendingInfo.
type PendingInfo struct {
// Iface is the interface whose change is awaiting confirmation.
Iface string `json:"interface" example:"eth0" doc:"Interface with a pending change"`
// SecondsRemaining is the time left before auto-rollback, truncated to whole
// seconds and never negative.
SecondsRemaining int `json:"seconds_remaining" example:"45" doc:"Seconds until auto-rollback"`
}
// pendingInfo reports which interface has a change awaiting confirmation and
// how many whole seconds are left before it auto-reverts. It returns nil when
// nothing is pending. The remaining time never goes below zero.
func (m *Module) pendingInfo() *PendingInfo {
	m.mu.Lock()
	defer m.mu.Unlock()

	pc := m.pending
	if pc == nil {
		return nil
	}

	info := &PendingInfo{Iface: pc.Iface}
	// A deadline already in the past yields a non-positive value; report 0.
	if left := int(time.Until(pc.Deadline).Seconds()); left > 0 {
		info.SecondsRemaining = left
	}
	return info
}