kube-scheduler — Data Flow

Filter, score, and bind unscheduled pods to nodes using a plugin pipeline

Startup

main() → NewSchedulerCommand() → Setup() → Run()
The scheduler binary builds a Scheduler struct (with its plugin framework, cache, and queue), then enters the scheduling loop.
  1. main() — entry point
    // cmd/kube-scheduler/scheduler.go L30
    func main() {
        command := app.NewSchedulerCommand()
        code := cli.Run(command)
        os.Exit(code)
    }
    cmd/kube-scheduler/scheduler.go
  2. NewSchedulerCommand() → runCommand()
    Parses flags. RunE calls runCommand, which calls Setup() to build the Scheduler struct and then passes it to Run().
    // cmd/kube-scheduler/app/server.go L93-171
    func NewSchedulerCommand(registryOptions ...Option) *cobra.Command {
        RunE: func(cmd *cobra.Command, args []string) error {
            return runCommand(cmd, opts, registryOptions...)  // L111
        },
    }
    
    func runCommand(cmd *cobra.Command, opts *options.Options, ...) error {
        cc, sched, err := Setup(ctx, opts, registryOptions...)  // L161
        return Run(ctx, cc, sched)                              // L170
    }
    server.go L93–171
  3. sched.Run() — starts the scheduling loop
    Starts the SchedulingQueue goroutines, then launches ScheduleOne in a dedicated goroutine via wait.UntilWithContext. Run itself blocks until the context is cancelled and then closes the queue.
    // pkg/scheduler/scheduler.go L524-551
    func (sched *Scheduler) Run(ctx context.Context) {
        sched.SchedulingQueue.Run(logger)   // starts queue flush goroutines
    
        go wait.UntilWithContext(ctx, sched.ScheduleOne, 0)  // L538
    
        <-ctx.Done()
        sched.SchedulingQueue.Close()
    }
    scheduler.go L524–551

Scheduling Loop

ScheduleOne() — one pod per call
ScheduleOne is the top-level function called in a tight loop. It dequeues one pod and dispatches it to either the pod-group path (generic workload) or the standard single-pod path.
// pkg/scheduler/schedule_one.go L67-96
func (sched *Scheduler) ScheduleOne(ctx context.Context) {
    podInfo, err := sched.NextPod(logger)  // blocks until pod available
    if podInfo == nil || podInfo.Pod == nil { return }

    if sched.genericWorkloadEnabled && podInfo.Pod.Spec.SchedulingGroup != nil {
        sched.scheduleOnePodGroup(ctx, podGroupInfo)  // gang scheduling
    } else {
        sched.scheduleOnePod(ctx, podInfo)            // L94 — standard path
    }
}
schedule_one.go L67–96
scheduleOnePod() — scheduling + async binding
The synchronous scheduling cycle (filter + score + assume) is separated from the asynchronous binding cycle (wait on permit + pre-bind + bind + post-bind). Because the binding cycle runs in its own goroutine, the scheduler can start the next pod's scheduling cycle while waiting for the bind RPC.
// pkg/scheduler/schedule_one.go L99-148
func (sched *Scheduler) scheduleOnePod(ctx context.Context, podInfo *framework.QueuedPodInfo) {
    fwk, _ := sched.frameworkForPod(pod)

    state := framework.NewCycleState()             // L127 — fresh cycle state

    // Synchronous: filter + score + assume
    scheduleResult, assumedPodInfo, status :=
        sched.schedulingCycle(schedulingCycleCtx, state, fwk, podInfo, start, podsToActivate)  // L140
    if !status.IsSuccess() {
        sched.FailureHandler(...)
        return
    }

    // Asynchronous: wait on permit + pre-bind + bind + post-bind
    go sched.runBindingCycle(ctx, state, fwk, scheduleResult, assumedPodInfo, start, podsToActivate)  // L147
}
schedule_one.go L99–148
schedulingCycle() — snapshot → filter/score → assume
The scheduling cycle takes a consistent snapshot of node state, runs the scheduling algorithm (filter then score), and optimistically "assumes" the pod is on the chosen node in the cache.
// pkg/scheduler/schedule_one.go L175-198
func (sched *Scheduler) schedulingCycle(ctx, state, schedFramework, podInfo, ...) (...) {
    // 1. Refresh node snapshot (copy from cache to read-only snapshot)
    sched.Cache.UpdateSnapshot(klog.FromContext(ctx), sched.nodeInfoSnapshot)  // L183

    // 2. Run filter plugins → score plugins → select node
    scheduleResult, status := sched.schedulingAlgorithm(ctx, state, schedFramework, podInfo, start)  // L187

    // 3. Assume: mark pod→node in cache before the bind RPC completes
    assumedPodInfo, status := sched.prepareForBindingCycle(ctx, state, schedFramework, ...)  // L192
    return scheduleResult, assumedPodInfo, nil
}
schedule_one.go L175–198

Plugin extension points in the scheduling and binding cycles

PreFilter

  • InterPodAffinity
  • NodeAffinity
  • NodePorts
  • PodTopologySpread

Filter

  • NodeUnschedulable
  • NodeResourcesFit
  • TaintToleration
  • VolumeBinding

Score

  • NodeResourcesFit (LeastAllocated scoring strategy)
  • NodeAffinity
  • InterPodAffinity
  • ImageLocality

Reserve / Bind

  • VolumeBinding.Reserve
  • DefaultBinder
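  • SchedulingGates (a PreEnqueue plugin: it keeps gated pods out of the active queue and does not take part in Reserve or Bind)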
runBindingCycle() async goroutine — wait on permit → pre-bind → bind → post-bind
The binding cycle runs in a goroutine so the main scheduling loop can start filtering the next pod immediately. It writes the final spec.nodeName to the API server.
// pkg/scheduler/schedule_one.go L151-170
func (sched *Scheduler) runBindingCycle(ctx, state, schedFramework, scheduleResult,
    assumedPodInfo, start, podsToActivate) {
    bindingCycleCtx, cancel := context.WithCancel(ctx)
    defer cancel()

    // Runs: WaitOnPermit → PreBind → Bind → PostBind
    status := sched.bindingCycle(bindingCycleCtx, state, schedFramework,
        scheduleResult, assumedPodInfo, start, podsToActivate)  // L165
    if !status.IsSuccess() {
        sched.handleBindingCycleError(...)
    }
}
schedule_one.go L151–170
The DefaultBinder plugin sends a POST /api/v1/namespaces/{ns}/pods/{name}/binding request to the API server, which sets spec.nodeName on the pod. The kubelet on the chosen node then receives the updated pod through its watch and starts it.

Key Data Structures

Scheduler struct — central state
// pkg/scheduler/scheduler.go L68-125 (condensed)
type Scheduler struct {
    // In-memory view of node / pod state (updated from informers)
    Cache internalcache.Cache

    // Priority queue of pods waiting to be scheduled
    SchedulingQueue internalqueue.SchedulingQueue

    // Read-only consistent snapshot used during a scheduling cycle
    nodeInfoSnapshot *internalcache.Snapshot

    // Plugin framework profiles (one per scheduler name)
    Profiles profile.Map

    // Called when scheduling fails; handles re-queuing
    FailureHandler FailureHandlerFn
}
scheduler.go L68–125