202 lines
6.8 KiB
Go
202 lines
6.8 KiB
Go
package networking
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"log"
|
|
"time"
|
|
)
|
|
|
|
// defaultRollbackSeconds is the auto-revert window, in seconds, that armPending
// falls back to when the caller supplies a non-positive timeout.
const defaultRollbackSeconds = 120

// errAlreadyPending is the sentinel reported while another change is still
// awaiting confirmation. The write handlers map it to 409 Conflict.
var errAlreadyPending = errors.New("already pending")
|
|
|
|
// errPending builds the 409 message. The lock is global across all interfaces
|
|
// (see pendingChange), so the message says so to avoid confusing a user who is
|
|
// touching a different interface than the one that holds the lock.
|
|
func errPending(iface string) error {
|
|
return fmt.Errorf("%w: a change to %s is awaiting confirmation. This is a global lock across all interfaces — confirm or roll that change back first", errAlreadyPending, iface)
|
|
}
|
|
|
|
// startRollback snapshots the current state, applies the new config, and arms a
|
|
// timer that auto-reverts if not confirmed. Returns errAlreadyPending (409) if
|
|
// another change is in flight, or a wrapped error (500) if apply fails.
|
|
//
|
|
// Snapshot and Apply run WITHOUT the mutex held, so they don't block reads or
|
|
// the pending-status endpoint while shelling out to nmcli/networkctl/ifup.
|
|
func (m *Module) startRollback(ctx context.Context, iface string, cfg IfaceConfig) (int, error) {
|
|
// Fast pre-check so we don't snapshot/apply when something is already
|
|
// pending. armPending re-checks under the lock to close the race.
|
|
if err := m.checkNoPending(); err != nil {
|
|
return 0, err
|
|
}
|
|
|
|
prior, err := m.be.Snapshot(ctx, iface)
|
|
if err != nil {
|
|
return 0, fmt.Errorf("snapshot %s: %w", iface, err)
|
|
}
|
|
if err := m.be.Apply(ctx, iface, cfg); err != nil {
|
|
// Apply is not atomic: nmcli `con modify` may succeed before `con up`
|
|
// fails, and networkd writes the .network file before `reconfigure`
|
|
// runs. A failed Apply can therefore leave a half-applied config that
|
|
// would otherwise have NO auto-revert (we bail before arming the timer).
|
|
// Best-effort restore the snapshot so we never leave that unprotected.
|
|
if rerr := m.be.Apply(ctx, iface, prior); rerr != nil {
|
|
log.Printf("networking: apply %s failed and restore also failed: %v", iface, rerr)
|
|
}
|
|
return 0, fmt.Errorf("apply %s: %w", iface, err)
|
|
}
|
|
|
|
// The revert runs from the timer or an explicit rollback, possibly with no
|
|
// client attached, so it uses context.Background() rather than ctx.
|
|
return m.armPending(iface, func() error { return m.be.Apply(context.Background(), iface, prior) }, cfg.RollbackSeconds)
|
|
}
|
|
|
|
// startLinkDown takes the interface down behind the same rollback safety net: if
|
|
// the change is not confirmed, the interface is brought back up. Taking a remote
|
|
// interface down is just as much a lock-yourself-out risk as a bad static config.
|
|
//
|
|
// Bringing a link UP needs no protection (it cannot lock you out), so link-up
|
|
// stays a direct, un-wrapped call in the handler.
|
|
func (m *Module) startLinkDown(ctx context.Context, iface string) (int, error) {
|
|
if err := m.checkNoPending(); err != nil {
|
|
return 0, err
|
|
}
|
|
if err := m.be.SetLinkDown(ctx, iface); err != nil {
|
|
return 0, fmt.Errorf("link down %s: %w", iface, err)
|
|
}
|
|
// Revert (bring the link back up) may run from the timer with no client.
|
|
return m.armPending(iface, func() error { return m.be.SetLinkUp(context.Background(), iface) }, 0)
|
|
}
|
|
|
|
// checkNoPending reports a 409 error if a change is already pending.
|
|
func (m *Module) checkNoPending() error {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
if m.pending != nil {
|
|
return errPending(m.pending.Iface)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// armPending installs the pending change and starts its auto-revert timer. The
|
|
// caller has already applied the change; revert is the closure that undoes it.
|
|
// It is invoked on timer expiry, explicit rollback, or if a concurrent change
|
|
// raced us between the pre-check and here (in which case we revert immediately
|
|
// and report the conflict). seconds <= 0 uses the default timeout.
|
|
func (m *Module) armPending(iface string, revert func() error, seconds int) (int, error) {
|
|
if seconds <= 0 {
|
|
seconds = defaultRollbackSeconds
|
|
}
|
|
dur := time.Duration(seconds) * time.Second
|
|
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
if m.pending != nil {
|
|
// Lost the race — undo what we just applied and report conflict.
|
|
if err := revert(); err != nil {
|
|
log.Printf("networking: failed to undo raced change on %s: %v", iface, err)
|
|
}
|
|
return 0, errPending(m.pending.Iface)
|
|
}
|
|
|
|
pc := &pendingChange{
|
|
Iface: iface,
|
|
revert: revert,
|
|
Deadline: time.Now().Add(dur),
|
|
}
|
|
// The timer fires the auto-revert. It captures m and pc by closure so it can
|
|
// revert even if the server is otherwise idle — the whole point is protecting
|
|
// against being locked out of a remote box.
|
|
pc.Timer = time.AfterFunc(dur, func() {
|
|
// Check validity under lock, then revert outside it so a slow
|
|
// nmcli/networkctl call doesn't block the entire networking module.
|
|
m.mu.Lock()
|
|
if m.pending != pc {
|
|
m.mu.Unlock()
|
|
return
|
|
}
|
|
iface := pc.Iface
|
|
revert := pc.revert
|
|
m.mu.Unlock()
|
|
|
|
log.Printf("networking: rollback timer expired for %s — reverting", iface)
|
|
if err := revert(); err != nil {
|
|
log.Printf("networking: auto-rollback of %s failed: %v", iface, err)
|
|
}
|
|
m.mu.Lock()
|
|
if m.pending == pc {
|
|
m.pending = nil
|
|
}
|
|
m.mu.Unlock()
|
|
})
|
|
|
|
m.pending = pc
|
|
return seconds, nil
|
|
}
|
|
|
|
// confirm cancels the rollback timer and clears the pending change, making it
|
|
// permanent. Errors if there is no pending change or it's for another interface.
|
|
func (m *Module) confirm(iface string) error {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
if m.pending == nil {
|
|
return fmt.Errorf("no pending change to confirm")
|
|
}
|
|
if m.pending.Iface != iface {
|
|
return fmt.Errorf("pending change is for %s, not %s", m.pending.Iface, iface)
|
|
}
|
|
|
|
m.pending.Timer.Stop()
|
|
m.pending = nil
|
|
return nil
|
|
}
|
|
|
|
// rollbackNow immediately reverts the pending change and clears it. Errors if
|
|
// there is no pending change or it's for another interface.
|
|
func (m *Module) rollbackNow(iface string) error {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
if m.pending == nil {
|
|
return fmt.Errorf("no pending change to rollback")
|
|
}
|
|
if m.pending.Iface != iface {
|
|
return fmt.Errorf("pending change is for %s, not %s", m.pending.Iface, iface)
|
|
}
|
|
|
|
m.pending.Timer.Stop()
|
|
err := m.pending.revert()
|
|
m.pending = nil
|
|
if err != nil {
|
|
return fmt.Errorf("rollback %s: %w", iface, err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// PendingInfo is the JSON body returned by the pending-change status endpoint.
type PendingInfo struct {
	// Iface is the interface with a pending change.
	Iface string `json:"interface" example:"eth0" doc:"Interface with a pending change"`

	// SecondsRemaining is the number of seconds left until auto-rollback.
	SecondsRemaining int `json:"seconds_remaining" example:"45" doc:"Seconds until auto-rollback"`
}
|
|
|
|
// pendingInfo returns the current pending change status, or nil if none.
|
|
func (m *Module) pendingInfo() *PendingInfo {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
if m.pending == nil {
|
|
return nil
|
|
}
|
|
remaining := max(int(time.Until(m.pending.Deadline).Seconds()), 0)
|
|
return &PendingInfo{
|
|
Iface: m.pending.Iface,
|
|
SecondsRemaining: remaining,
|
|
}
|
|
}
|