@@ -1770,11 +1770,8 @@ def __init__(
1770
1770
self .clients ["fire-and-forget" ] = ClientState ("fire-and-forget" )
1771
1771
self .extensions = {}
1772
1772
self .host_info = host_info
1773
- self .idle = SortedDict ()
1774
- self .idle_task_count = set ()
1775
1773
self .n_tasks = 0
1776
1774
self .resources = resources
1777
- self .saturated = set ()
1778
1775
self .tasks = tasks
1779
1776
self .replicated_tasks = {
1780
1777
ts for ts in self .tasks .values () if len (ts .who_has or ()) > 1
@@ -1865,7 +1862,6 @@ def __pdict__(self) -> dict[str, Any]:
1865
1862
return {
1866
1863
"bandwidth" : self .bandwidth ,
1867
1864
"resources" : self .resources ,
1868
- "saturated" : self .saturated ,
1869
1865
"unrunnable" : self .unrunnable ,
1870
1866
"queued" : self .queued ,
1871
1867
"n_tasks" : self .n_tasks ,
@@ -1879,7 +1875,6 @@ def __pdict__(self) -> dict[str, Any]:
1879
1875
"extensions" : self .extensions ,
1880
1876
"clients" : self .clients ,
1881
1877
"workers" : self .workers ,
1882
- "idle" : self .idle ,
1883
1878
"host_info" : self .host_info ,
1884
1879
}
1885
1880
@@ -2310,7 +2305,7 @@ def decide_worker_rootish_queuing_disabled(
2310
2305
# See root-ish-ness note below in `decide_worker_rootish_queuing_enabled`
2311
2306
assert math .isinf (self .WORKER_SATURATION ) or not ts ._queueable
2312
2307
2313
- pool = self .idle . values () if self . idle else self . running
2308
+ pool = self .running
2314
2309
if not pool :
2315
2310
return None
2316
2311
@@ -2375,22 +2370,16 @@ def decide_worker_rootish_queuing_enabled(self) -> WorkerState | None:
2375
2370
# then add that assertion here (and actually pass in the task).
2376
2371
assert not math .isinf (self .WORKER_SATURATION )
2377
2372
2378
- if not self .idle_task_count :
2379
- # All workers busy? Task gets/stays queued.
2373
+ if not self .running :
2380
2374
return None
2381
2375
2382
2376
# Just pick the least busy worker.
2383
2377
# NOTE: this will lead to worst-case scheduling with regards to co-assignment.
2384
- ws = min (
2385
- self .idle_task_count ,
2386
- key = lambda ws : len (ws .processing ) / ws .nthreads ,
2387
- )
2378
+ ws = min (self .running , key = lambda ws : len (ws .processing ) / ws .nthreads )
2379
+ if _worker_full (ws , self .WORKER_SATURATION ):
2380
+ return None
2388
2381
if self .validate :
2389
2382
assert self .workers .get (ws .address ) is ws
2390
- assert not _worker_full (ws , self .WORKER_SATURATION ), (
2391
- ws ,
2392
- _task_slots_available (ws , self .WORKER_SATURATION ),
2393
- )
2394
2383
assert ws in self .running , (ws , self .running )
2395
2384
2396
2385
return ws
@@ -2434,7 +2423,7 @@ def decide_worker_non_rootish(self, ts: TaskState) -> WorkerState | None:
2434
2423
# dependencies, but its group is also smaller than the cluster.
2435
2424
2436
2425
# Fastpath when there are no related tasks or restrictions
2437
- worker_pool = self .idle or self . workers
2426
+ worker_pool = self .workers
2438
2427
# FIXME idle and workers are SortedDict's declared as dicts
2439
2428
# because sortedcontainers is not annotated
2440
2429
wp_vals = cast ("Sequence[WorkerState]" , worker_pool .values ())
@@ -2927,7 +2916,6 @@ def _transition_waiting_queued(self, key: Key, stimulus_id: str) -> RecsMsgs:
2927
2916
ts = self .tasks [key ]
2928
2917
2929
2918
if self .validate :
2930
- assert not self .idle_task_count , (ts , self .idle_task_count )
2931
2919
self ._validate_ready (ts )
2932
2920
2933
2921
ts .state = "queued"
@@ -3158,63 +3146,6 @@ def is_rootish(self, ts: TaskState) -> bool:
3158
3146
and sum (map (len , tg .dependencies )) < self .rootish_tg_dependencies_threshold
3159
3147
)
3160
3148
3161
- def check_idle_saturated (self , ws : WorkerState , occ : float = - 1.0 ) -> None :
3162
- """Update the status of the idle and saturated state
3163
-
3164
- The scheduler keeps track of workers that are ..
3165
-
3166
- - Saturated: have enough work to stay busy
3167
- - Idle: do not have enough work to stay busy
3168
-
3169
- They are considered saturated if they both have enough tasks to occupy
3170
- all of their threads, and if the expected runtime of those tasks is
3171
- large enough.
3172
-
3173
- If ``distributed.scheduler.worker-saturation`` is not ``inf``
3174
- (scheduler-side queuing is enabled), they are considered idle
3175
- if they have fewer tasks processing than the ``worker-saturation``
3176
- threshold dictates.
3177
-
3178
- Otherwise, they are considered idle if they have fewer tasks processing
3179
- than threads, or if their tasks' total expected runtime is less than half
3180
- the expected runtime of the same number of average tasks.
3181
-
3182
- This is useful for load balancing and adaptivity.
3183
- """
3184
- if self .total_nthreads == 0 or ws .status == Status .closed :
3185
- return
3186
- if occ < 0 :
3187
- occ = ws .occupancy
3188
-
3189
- p = len (ws .processing )
3190
-
3191
- self .saturated .discard (ws )
3192
- if ws .status != Status .running :
3193
- self .idle .pop (ws .address , None )
3194
- elif self .is_unoccupied (ws , occ , p ):
3195
- self .idle [ws .address ] = ws
3196
- else :
3197
- self .idle .pop (ws .address , None )
3198
- nc = ws .nthreads
3199
- if p > nc :
3200
- pending = occ * (p - nc ) / (p * nc )
3201
- if 0.4 < pending > 1.9 * (self .total_occupancy / self .total_nthreads ):
3202
- self .saturated .add (ws )
3203
-
3204
- if not _worker_full (ws , self .WORKER_SATURATION ) and ws .status == Status .running :
3205
- self .idle_task_count .add (ws )
3206
- else :
3207
- self .idle_task_count .discard (ws )
3208
-
3209
- def is_unoccupied (
3210
- self , ws : WorkerState , occupancy : float , nprocessing : int
3211
- ) -> bool :
3212
- nthreads = ws .nthreads
3213
- return (
3214
- nprocessing < nthreads
3215
- or occupancy < nthreads * (self .total_occupancy / self .total_nthreads ) / 2
3216
- )
3217
-
3218
3149
def get_comm_cost (self , ts : TaskState , ws : WorkerState ) -> float :
3219
3150
"""
3220
3151
Get the estimated communication cost (in s.) to compute the task
@@ -3402,7 +3333,6 @@ def _add_to_processing(
3402
3333
ts .processing_on = ws
3403
3334
ts .state = "processing"
3404
3335
self .acquire_resources (ts , ws )
3405
- self .check_idle_saturated (ws )
3406
3336
self .n_tasks += 1
3407
3337
3408
3338
if ts .actor :
@@ -3468,7 +3398,6 @@ def _exit_processing_common(self, ts: TaskState) -> WorkerState | None:
3468
3398
if self .workers .get (ws .address ) is not ws : # may have been removed
3469
3399
return None
3470
3400
3471
- self .check_idle_saturated (ws )
3472
3401
self .release_resources (ts , ws )
3473
3402
3474
3403
return ws
@@ -4606,10 +4535,6 @@ async def add_worker(
4606
4535
metrics = metrics ,
4607
4536
)
4608
4537
4609
- # Do not need to adjust self.total_occupancy as self.occupancy[ws] cannot
4610
- # exist before this.
4611
- self .check_idle_saturated (ws )
4612
-
4613
4538
self .stream_comms [address ] = BatchedSend (interval = "5ms" , loop = self .loop )
4614
4539
4615
4540
awaitables = []
@@ -5227,13 +5152,11 @@ def stimulus_queue_slots_maybe_opened(self, *, stimulus_id: str) -> None:
5227
5152
so any tasks that became runnable are already in ``processing``. Otherwise,
5228
5153
overproduction can occur if queued tasks get scheduled before downstream tasks.
5229
5154
5230
- Must be called after `check_idle_saturated`; i.e. `idle_task_count` must be up to date.
5231
5155
"""
5232
5156
if not self .queued :
5233
5157
return
5234
5158
slots_available = sum (
5235
- _task_slots_available (ws , self .WORKER_SATURATION )
5236
- for ws in self .idle_task_count
5159
+ _task_slots_available (ws , self .WORKER_SATURATION ) for ws in self .running
5237
5160
)
5238
5161
if slots_available == 0 :
5239
5162
return
@@ -5466,9 +5389,6 @@ async def remove_worker(
5466
5389
self .rpc .remove (address )
5467
5390
del self .stream_comms [address ]
5468
5391
del self .aliases [ws .name ]
5469
- self .idle .pop (ws .address , None )
5470
- self .idle_task_count .discard (ws )
5471
- self .saturated .discard (ws )
5472
5392
del self .workers [address ]
5473
5393
self ._workers_removed_total += 1
5474
5394
ws .status = Status .closed
@@ -5818,23 +5738,6 @@ def validate_state(self, allow_overlap: bool = False) -> None:
5818
5738
if not (set (self .workers ) == set (self .stream_comms )):
5819
5739
raise ValueError ("Workers not the same in all collections" )
5820
5740
5821
- assert self .running .issuperset (self .idle .values ()), (
5822
- self .running .copy (),
5823
- set (self .idle .values ()),
5824
- )
5825
- assert self .running .issuperset (self .idle_task_count ), (
5826
- self .running .copy (),
5827
- self .idle_task_count .copy (),
5828
- )
5829
- assert self .running .issuperset (self .saturated ), (
5830
- self .running .copy (),
5831
- self .saturated .copy (),
5832
- )
5833
- assert self .saturated .isdisjoint (self .idle .values ()), (
5834
- self .saturated .copy (),
5835
- set (self .idle .values ()),
5836
- )
5837
-
5838
5741
task_prefix_counts : defaultdict [str , int ] = defaultdict (int )
5839
5742
for w , ws in self .workers .items ():
5840
5743
assert isinstance (w , str ), (type (w ), w )
@@ -5845,14 +5748,10 @@ def validate_state(self, allow_overlap: bool = False) -> None:
5845
5748
assert ws in self .running
5846
5749
else :
5847
5750
assert ws not in self .running
5848
- assert ws .address not in self .idle
5849
- assert ws not in self .saturated
5850
5751
5851
5752
assert ws .long_running .issubset (ws .processing )
5852
5753
if not ws .processing :
5853
5754
assert not ws .occupancy
5854
- if ws .status == Status .running :
5855
- assert ws .address in self .idle
5856
5755
assert not ws .needs_what .keys () & ws .has_what
5857
5756
actual_needs_what : defaultdict [TaskState , int ] = defaultdict (int )
5858
5757
for ts in ws .processing :
@@ -6136,7 +6035,6 @@ def handle_long_running(
6136
6035
ts .prefix .duration_average = (old_duration + compute_duration ) / 2
6137
6036
6138
6037
ws .add_to_long_running (ts )
6139
- self .check_idle_saturated (ws )
6140
6038
6141
6039
self .stimulus_queue_slots_maybe_opened (stimulus_id = stimulus_id )
6142
6040
@@ -6164,16 +6062,12 @@ def handle_worker_status_change(
6164
6062
6165
6063
if ws .status == Status .running :
6166
6064
self .running .add (ws )
6167
- self .check_idle_saturated (ws )
6168
6065
self .transitions (
6169
6066
self .bulk_schedule_unrunnable_after_adding_worker (ws ), stimulus_id
6170
6067
)
6171
6068
self .stimulus_queue_slots_maybe_opened (stimulus_id = stimulus_id )
6172
6069
else :
6173
6070
self .running .discard (ws )
6174
- self .idle .pop (ws .address , None )
6175
- self .idle_task_count .discard (ws )
6176
- self .saturated .discard (ws )
6177
6071
self ._refresh_no_workers_since ()
6178
6072
6179
6073
def handle_request_refresh_who_has (
0 commit comments