2026-06-22 16:06:57 +02:00
package networking
import (
"context"
"errors"
"fmt"
"log"
"time"
)
2026-06-24 17:29:45 +02:00
// defaultRollbackSeconds is the auto-revert timeout, in seconds, that
// armPending falls back to when it is given a non-positive timeout.
const defaultRollbackSeconds = 120
2026-06-22 16:06:57 +02:00
// errAlreadyPending is the sentinel wrapped into every "another change is
// still awaiting confirmation" error, so the condition can be detected with
// errors.Is. The write handlers map it to 409 Conflict.
var errAlreadyPending = errors.New("already pending")
// errPending returns the 409 error for a request that arrived while a change
// to iface is still awaiting confirmation. The result wraps errAlreadyPending.
//
// Only one change may be pending at a time across all interfaces (see
// pendingChange), and the message spells that out so a user editing a
// different interface understands why the request was refused.
func errPending(iface string) error {
	const format = "%w: a change to %s is awaiting confirmation. This is a global lock across all interfaces — confirm or roll that change back first"
	return fmt.Errorf(format, errAlreadyPending, iface)
}
// startRollback snapshots the current state of iface, applies cfg, and arms a
// timer that reverts to the snapshot unless the change is confirmed in time.
//
// It returns the value produced by armPending (the effective rollback timeout
// in seconds) on success. It returns an error wrapping errAlreadyPending (409)
// when another change is already in flight, or a wrapped backend error (500)
// when the snapshot or the apply fails.
//
// Snapshot and Apply run WITHOUT the module mutex held, so they do not block
// reads or the pending-status endpoint while the backend does its work.
func (m *Module) startRollback(ctx context.Context, iface string, cfg IfaceConfig) (int, error) {
	// Fast pre-check so we don't snapshot/apply when something is already
	// pending. armPending re-checks under the lock to close the race.
	if err := m.checkNoPending(); err != nil {
		return 0, err
	}

	prior, err := m.be.Snapshot(ctx, iface)
	if err != nil {
		return 0, fmt.Errorf("snapshot %s: %w", iface, err)
	}

	if err := m.be.Apply(ctx, iface, cfg); err != nil {
		// Apply is not atomic: nmcli `con modify` may succeed before `con up`
		// fails, and networkd writes the .network file before `reconfigure`
		// runs. A failed Apply can therefore leave a half-applied config that
		// would otherwise have NO auto-revert (we bail before arming the timer).
		// Best-effort restore the snapshot so we never leave that unprotected.
		//
		// The restore must not reuse ctx: if Apply failed because the request
		// context was cancelled or timed out (for example the client dropped
		// when the link changed), a restore bound to that same context would
		// fail immediately. Use a detached context, as the armed revert does.
		if rerr := m.be.Apply(context.Background(), iface, prior); rerr != nil {
			log.Printf("networking: apply %s failed and restore also failed: %v", iface, rerr)
		}
		return 0, fmt.Errorf("apply %s: %w", iface, err)
	}

	// The revert runs from the timer or an explicit rollback, possibly with no
	// client attached, so it uses context.Background() rather than ctx.
	revert := func() error {
		return m.be.Apply(context.Background(), iface, prior)
	}
	return m.armPending(iface, revert, cfg.RollbackSeconds)
}
// startLinkDown takes iface down under the same rollback safety net used for
// config changes: unless the change is confirmed, the link is brought back up.
// Taking a remote interface down can lock you out just as easily as a bad
// static config can.
//
// Bringing a link UP needs no such protection (it cannot lock you out), so
// link-up remains a direct, un-wrapped call in the handler.
//
// It returns armPending's result on success, the conflict error from
// checkNoPending when a change is already pending, or a wrapped backend error
// when the link could not be taken down.
func (m *Module) startLinkDown(ctx context.Context, iface string) (int, error) {
	if busy := m.checkNoPending(); busy != nil {
		return 0, busy
	}

	downErr := m.be.SetLinkDown(ctx, iface)
	if downErr != nil {
		return 0, fmt.Errorf("link down %s: %w", iface, downErr)
	}

	// The undo step may be triggered by the timer with no client attached,
	// hence the detached context. Passing 0 selects the default timeout.
	bringUp := func() error {
		return m.be.SetLinkUp(context.Background(), iface)
	}
	return m.armPending(iface, bringUp, 0)
}
// checkNoPending returns the 409 conflict error (built by errPending for the
// interface that holds the pending change) when a change is still awaiting
// confirmation, and nil otherwise. It takes the module mutex for the check.
func (m *Module) checkNoPending() error {
	m.mu.Lock()
	defer m.mu.Unlock()

	if held := m.pending; held != nil {
		return errPending(held.Iface)
	}
	return nil
}
// armPending installs the pending change and starts its auto-revert timer. The
// caller has already applied the change; revert is the closure that undoes it.
// It is invoked on timer expiry, explicit rollback, or if a concurrent change
// raced us between the pre-check and here (in which case we revert immediately
// and report the conflict). seconds <= 0 uses the default timeout.
func ( m * Module ) armPending ( iface string , revert func () error , seconds int ) ( int , error ) {
if seconds <= 0 {
seconds = defaultRollbackSeconds
}
dur := time . Duration ( seconds ) * time . Second
m . mu . Lock ()
defer m . mu . Unlock ()
if m . pending != nil {
// Lost the race — undo what we just applied and report conflict.
if err := revert (); err != nil {
log . Printf ( "networking: failed to undo raced change on %s: %v" , iface , err )
}
return 0 , errPending ( m . pending . Iface )
}
pc := & pendingChange {
Iface : iface ,
revert : revert ,
Deadline : time . Now (). Add ( dur ),
}
// The timer fires the auto-revert. It captures m and pc by closure so it can
// revert even if the server is otherwise idle — the whole point is protecting
// against being locked out of a remote box.
pc . Timer = time . AfterFunc ( dur , func () {
2026-06-24 17:29:45 +02:00
// Check validity under lock, then revert outside it so a slow
// nmcli/networkctl call doesn't block the entire networking module.
2026-06-22 16:06:57 +02:00
m . mu . Lock ()
if m . pending != pc {
2026-06-24 17:29:45 +02:00
m . mu . Unlock ()
2026-06-22 16:06:57 +02:00
return
}
2026-06-24 17:29:45 +02:00
iface := pc . Iface
revert := pc . revert
m . mu . Unlock ()
2026-06-22 16:06:57 +02:00
log . Printf ( "networking: rollback timer expired for %s — reverting" , iface )
2026-06-24 17:29:45 +02:00
if err := revert (); err != nil {
2026-06-22 16:06:57 +02:00
log . Printf ( "networking: auto-rollback of %s failed: %v" , iface , err )
}
2026-06-24 17:29:45 +02:00
m . mu . Lock ()
if m . pending == pc {
m . pending = nil
}
m . mu . Unlock ()
2026-06-22 16:06:57 +02:00
})
m . pending = pc
return seconds , nil
}
// confirm makes the pending change on iface permanent by cancelling its
// auto-revert timer and clearing the pending slot.
//
// It returns an error when nothing is pending, when the pending change belongs
// to a different interface, or when the rollback timer has already fired. In
// the last case Timer.Stop reports false, which for an AfterFunc timer means
// the callback has been started: the revert is running or about to run. The
// pending change is then left in place for that callback to finish and clear,
// and the caller is told the confirmation came too late rather than being
// given a success for a change that is being rolled back.
func (m *Module) confirm(iface string) error {
	m.mu.Lock()
	defer m.mu.Unlock()

	pc := m.pending
	if pc == nil {
		return fmt.Errorf("no pending change to confirm")
	}
	if pc.Iface != iface {
		return fmt.Errorf("pending change is for %s, not %s", pc.Iface, iface)
	}
	if !pc.Timer.Stop() {
		return fmt.Errorf("rollback timer for %s already expired; the change is being reverted", iface)
	}
	m.pending = nil
	return nil
}
// rollbackNow reverts the pending change on iface right away instead of
// waiting for its timer, then clears the pending slot whether or not the
// revert succeeded.
//
// It returns an error when nothing is pending, when the pending change belongs
// to a different interface, or (wrapped) when the revert itself fails.
//
// NOTE(review): the revert runs with m.mu held, and the result of Timer.Stop is
// ignored. If the timer has already fired, its callback may be running the same
// revert concurrently — confirm whether that overlap is acceptable.
func (m *Module) rollbackNow(iface string) error {
	m.mu.Lock()
	defer m.mu.Unlock()

	pc := m.pending
	switch {
	case pc == nil:
		return fmt.Errorf("no pending change to rollback")
	case pc.Iface != iface:
		return fmt.Errorf("pending change is for %s, not %s", pc.Iface, iface)
	}

	pc.Timer.Stop()
	revertErr := pc.revert()
	m.pending = nil

	if revertErr != nil {
		return fmt.Errorf("rollback %s: %w", iface, revertErr)
	}
	return nil
}
// PendingInfo is the JSON body returned by the pending-change status endpoint.
// Values are produced by pendingInfo from the module's current pending change.
type PendingInfo struct {
	// Iface is the interface whose change is awaiting confirmation; it is
	// serialised under the JSON key "interface".
	Iface string `json:"interface" example:"eth0" doc:"Interface with a pending change"`

	// SecondsRemaining is the number of whole seconds left until the
	// auto-rollback deadline; pendingInfo never reports a negative value.
	SecondsRemaining int `json:"seconds_remaining" example:"45" doc:"Seconds until auto-rollback"`
}
// pendingInfo reports the change that is currently awaiting confirmation, or
// nil when there is none. The remaining time is the whole number of seconds
// until the deadline (fraction truncated), clamped so it is never negative.
func (m *Module) pendingInfo() *PendingInfo {
	m.mu.Lock()
	defer m.mu.Unlock()

	pc := m.pending
	if pc == nil {
		return nil
	}

	left := int(time.Until(pc.Deadline).Seconds())
	return &PendingInfo{
		Iface:            pc.Iface,
		SecondsRemaining: max(left, 0),
	}
}