#include <cstring>
#include <set>
#include <stdarg.h>
- #ifdef GGML_USE_CUDA
- #   include "ggml-cuda.h"
- #elif defined(GGML_USE_VULKAN)
- #   include "ggml-vulkan.h"
-     extern void ggml_vk_print_devices_info(void);
- #elif defined(GGML_USE_SYCL)
- #   include "ggml-sycl.h"
- #elif defined(GGML_USE_KOMPUTE)
- #   include "ggml-kompute.h"
- #elif defined(GGML_USE_CANN)
- #   include "ggml-cann.h"
- #endif
-
- #ifdef GGML_USE_BLAS
- #   include "ggml-blas.h"
- #endif
-
- #ifdef GGML_USE_METAL
- #   include "ggml-metal.h"
- #endif

#include "backend.h"
-
#include "basics.h"
-
#include "ggml-cpu.h"

namespace chatllm
@@ -287,33 +265,46 @@ namespace chatllm
    return id;
}

- ggml_backend_allocator ComputeManager::get_default_allocator_cpu(bool host_buffer, bool use_gpu)
+ void ComputeManager::init(void)
+ {
+ #if GGML_BACKEND_DL
+     static bool initialized = false;
+     if (initialized) return;
+     initialized = true;
+
+     ggml_backend_load_all();
+ #endif
+ }
+
+ std::string ComputeManager::dev_type_to_str(DeviceType type)
+ {
+     switch (type)
+     {
+     case DeviceType::CPU:
+         return "CPU";
+     case DeviceType::GPU:
+         return "GPU";
+     case DeviceType::ACCEL:
+         return "ACCEL";
+     default:
+         return "UNKNOWN";
+     }
+ }
+
+ ggml_backend_allocator ComputeManager::get_default_allocator_cpu(bool host_buffer, int gpu_id)
{
    ggml_backend_allocator allocator = nullptr;

-     if (use_gpu)
+     if (gpu_id >= 0)
    {
- #if defined(GGML_USE_CUDA)
-         // host buffers should only be used when data is expected to be copied to/from the GPU
-         if (host_buffer) {
-             allocator = ggml_backend_cuda_host_buffer_type();
-         }
- #elif defined(GGML_USE_SYCL)
-         if (host_buffer) {
-             allocator = ggml_backend_sycl_host_buffer_type();
-         }
- #elif defined(GGML_USE_CPU_HBM)
-         allocator = ggml_backend_cpu_hbm_buffer_type();
- #elif defined(GGML_USE_VULKAN)
-         if (host_buffer) {
-             allocator = ggml_backend_vk_host_buffer_type();
-         }
- #endif
+         auto dev = ggml_backend_dev_get(gpu_id);
+         if (dev)
+             allocator = ggml_backend_dev_host_buffer_type(dev);
    }

-     if (allocator == nullptr) {
+     if (allocator == nullptr)
        allocator = ggml_backend_cpu_buffer_type();
-     }
+
    return allocator;
}
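For reference (not part of this commit): the device registry that ComputeManager::init() and the new allocator lookup rely on can be exercised on its own. A minimal sketch, assuming a recent ggml with ggml-backend.h; everything outside the ggml API is illustrative:

// Sketch: enumerate backend devices the same way ComputeManager now does.
#include "ggml-backend.h"
#include <cstdio>

int main()
{
    ggml_backend_load_all();   // registers dynamically built backends; statically linked ones are already registered
    for (size_t i = 0; i < ggml_backend_dev_count(); i++)
    {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        size_t free = 0, total = 0;
        ggml_backend_dev_memory(dev, &free, &total);
        printf("#%zu: %s (%s), free/total = %zu/%zu bytes\n",
               i, ggml_backend_dev_name(dev),
               ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU ? "CPU" :
               ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU ? "GPU" : "ACCEL",
               free, total);
    }
    return 0;
}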
@@ -324,48 +315,23 @@ namespace chatllm

int ComputeManager::get_device_count(void)
{
-     int count = 1;
- #if defined(GGML_USE_CUDA)
-     count = ggml_backend_cuda_get_device_count();
- #elif defined(GGML_USE_SYCL)
-     count = ggml_backend_sycl_get_device_count();
- #elif defined(GGML_USE_VULKAN)
-     count = ggml_backend_vk_get_device_count();
- #elif defined(GGML_USE_CANN)
-     return ggml_backend_cann_get_device_count();
- #endif
-     return count;
- }
-
- ggml_backend_t ComputeManager::init_backend_device(int index)
- {
- #if defined(GGML_USE_CUDA)
-     return ggml_backend_cuda_init(index);
- #elif defined(GGML_USE_SYCL)
-     return ggml_backend_sycl_init(index);
- #elif defined(GGML_USE_VULKAN)
-     return ggml_backend_vk_init(index);
- #elif defined(GGML_USE_CANN)
-     return ggml_backend_cann_init(index);
- #endif
-     return nullptr;
+     ComputeManager::init();
+     return (int)ggml_backend_dev_count();
+ }
+
+ ggml_backend_t ComputeManager::init_backend_device(int index, const char *param)
+ {
+     auto dev = ggml_backend_dev_get(index);
+     return dev ? ggml_backend_dev_init(dev, param) : nullptr;
}

- ggml_backend_allocator ComputeManager::get_default_allocator_offload(int gpu)
+ ggml_backend_allocator ComputeManager::get_default_allocator_offload(int device)
{
    ggml_backend_allocator allocator = nullptr;

- #if defined(GGML_USE_METAL)
-     allocator = ggml_backend_metal_buffer_type();
- #elif defined(GGML_USE_CUDA)
-     allocator = ggml_backend_cuda_buffer_type(gpu);
- #elif defined(GGML_USE_VULKAN)
-     allocator = ggml_backend_vk_buffer_type(gpu);
- #elif defined(GGML_USE_SYCL)
-     allocator = ggml_backend_sycl_buffer_type(gpu);
- #elif defined(GGML_USE_CANN)
-     allocator = ggml_backend_cann_buffer_type(gpu);
- #endif
+     auto dev = ggml_backend_dev_get(device);
+     if (dev)
+         allocator = ggml_backend_dev_buffer_type(dev);

    if (allocator == nullptr)
        allocator = get_default_allocator_cpu(true, true);
@@ -376,47 +342,42 @@ namespace chatllm
{
    size_t total = 0;
    size_t free = 0;
- #if defined(GGML_USE_CUDA)
-     ggml_backend_cuda_get_device_memory(device, &free, &total);
- #elif defined(GGML_USE_SYCL)
-     ggml_backend_sycl_get_device_memory(device, &free, &total);
- #elif defined(GGML_USE_VULKAN)
-     ggml_backend_vk_get_device_memory(device, &free, &total);
- #elif defined(GGML_USE_CANN)
-     ggml_backend_cann_get_device_memory(device, &free, &total);
- #else
-     free = 1;
- #endif
+
+     auto dev = ggml_backend_dev_get(device);
+     if (dev)
+         ggml_backend_dev_memory(dev, &free, &total);
+
    if (p_total) *p_total = total;
    return free;
}

- int ComputeManager::get_max_devices(void)
+ bool ComputeManager::get_device_info(int device, DeviceInfo &info)
{
- #if defined(GGML_USE_METAL)
-     return 1;
- #elif defined(GGML_USE_CUDA)
-     return GGML_CUDA_MAX_DEVICES;
- #elif defined(GGML_USE_SYCL)
-     return GGML_SYCL_MAX_DEVICES;
- #elif defined(GGML_USE_VULKAN)
-     return GGML_VK_MAX_DEVICES;
- #elif defined(GGML_USE_CANN)
-     return GGML_CANN_MAX_DEVICES;
- #else
-     return 1;
- #endif
+     auto dev = ggml_backend_dev_get(device);
+
+     if (nullptr == dev)
+         return false;
+
+     ggml_backend_dev_props props;
+     ggml_backend_dev_get_props(dev, &props);
+     info.type         = (DeviceType)props.type;
+     info.backend_name = ggml_backend_reg_name(ggml_backend_dev_backend_reg(dev));
+     info.name         = ggml_backend_dev_name(dev);
+     info.total_memory = props.memory_total;
+     info.free_memory  = props.memory_free;
+     return true;
}

- bool ComputeManager::is_gpu_offload_supported(void)
+ void ComputeManager::get_devices_info(std::vector<DeviceInfo> &devs)
{
- #if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
-     defined(GGML_USE_SYCL) || defined(GGML_USE_CANN)
-     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
-     return true;
- #else
-     return false;
- #endif
+     ComputeManager::init();
+     devs.clear();
+     for (int i = 0; i < get_device_count(); i++)
+     {
+         DeviceInfo info;
+         CHATLLM_CHECK(get_device_info(i, info)) << __func__ << ": failed to get device #" << i;
+         devs.push_back(info);
+     }
}

Backend::Backend(ggml_backend_t backend, int n_layers, bool use_gpu)
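Illustrative usage of the new enumeration helpers (not part of this commit); field names follow the code above, and DeviceInfo is assumed to be declared in backend.h within namespace chatllm:

// Hypothetical caller: print every device ComputeManager reports.
#include "backend.h"
#include <iostream>
#include <vector>

void dump_devices(void)
{
    std::vector<chatllm::DeviceInfo> devs;        // assumption: DeviceInfo lives in namespace chatllm
    chatllm::ComputeManager::get_devices_info(devs);
    for (const auto &d : devs)
        std::cout << chatllm::ComputeManager::dev_type_to_str(d.type) << ": "
                  << d.name << " [" << d.backend_name << "], free "
                  << d.free_memory << " / " << d.total_memory << " bytes\n";
}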
@@ -480,14 +441,15 @@ namespace chatllm

BackendContext::BackendContext()
{
+     ComputeManager::init();
}

bool BackendContext::is_using_gpu(void) const
{
    return backends.size() > 1;
}

- void BackendContext::init(const std::vector<gpu_cfg> &gpu_cfgs, const int n_layers, const size_t graph_max_nodes_num)
+ void BackendContext::init(const std::vector<gpu_cfg> &gpu_cfgs, const int n_layers, const size_t graph_max_nodes_num, const int n_threads)
{
    int prolog_id = -1;
    int epilog_id = -1;
@@ -505,49 +467,46 @@ namespace chatllm

    buf_compute_meta.resize(ggml_tensor_overhead() * graph_max_nodes_num + ggml_graph_overhead_custom(graph_max_nodes_num, false));

-     if (ComputeManager::is_gpu_offload_supported())
+     auto init_device = [this, use_gpu, n_threads](int device, ggml_backend_dev_t dev, int n_layers)
    {
-         int dev_cnt = ComputeManager::get_device_count();
-         for (int i = 0; i < dev_cnt; i++)
+         auto reg = ggml_backend_dev_backend_reg(dev);
+
+         ggml_backend_t backend = ComputeManager::init_backend_device(device);
+         CHATLLM_CHECK(backend != nullptr) << __func__ << ": failed to initialize backend: #" << device;
+         backends.emplace_back(backend, n_layers, use_gpu);
+
+         if (n_threads > 0)
        {
-             size_t total = 0;
-             size_t free = ComputeManager::get_device_free_memory(0, &total);
-             ggml::log(GGML_LOG_LEVEL_INFO, "Device %d memory: total = %zd, free = %zd\n", i, total, free);
+             auto set_n_threads = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+             if (set_n_threads)
+                 set_n_threads(backend, n_threads);
        }
-     }
+     };

- #if defined(GGML_USE_METAL)
-     if (use_gpu)
-     {
-         CHATLLM_CHECK(gpu_cfgs.size() == 1) << __func__ << ": Metal backends number must be 1\n";
-
-         backend_metal = ggml_backend_metal_init();
-         CHATLLM_CHECK(backend_metal != nullptr) << __func__ << ": failed to initialize Metal backend\n";
-         backends.emplace_back(backend_metal, n_gpu_layers, use_gpu);
-     }
- #elif defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL) || defined(GGML_USE_CANN)
    if (use_gpu)
    {
        for (auto cfg : gpu_cfgs)
        {
            int device = cfg.id >= 0 ? cfg.id : 0;
-             ggml_backend_t backend = ComputeManager::init_backend_device(device);
-             CHATLLM_CHECK(backend != nullptr) << __func__ << ": failed to initialize backend: #" << device;
-             backends.emplace_back(backend, cfg.n_layers, use_gpu);
+             auto dev = ggml_backend_dev_get(device);
+             CHATLLM_CHECK(dev != nullptr) << __func__ << ": failed to find GPU device: #" << device;
+             CHATLLM_CHECK(ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) << __func__ << ": device #" << device << " is CPU";
+
+             init_device(device, dev, cfg.n_layers);
        }
    }
- #endif

- #ifdef GGML_USE_BLAS
- #error TODO
-     backend_blas = ggml_backend_blas_init();
-     if (backend_blas)
-         backends.emplace_back(ctx->backend_blas, 0);
- #endif
+     // append CPU backend
+     {
+         int device = ComputeManager::get_device_count() - 1;
+         auto dev = ggml_backend_dev_get(device);
+         CHATLLM_CHECK(dev != nullptr) << __func__ << ": failed to find CPU device: #" << device;
+         CHATLLM_CHECK(ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) << __func__ << ": device #" << device << " is not CPU, but " << ggml_backend_dev_type(dev);
+
+         init_device(device, dev, n_layers - n_gpu_layers);

-     backend_cpu = ggml_backend_cpu_init();
-     CHATLLM_CHECK(backend_cpu != nullptr) << __func__ << ": failed to initialize CPU backend";
-     backends.emplace_back(backend_cpu, n_layers - n_gpu_layers, use_gpu);
+         backend_cpu = backends[backends.size() - 1].backend;
+     }

    host_allocator.alloc_matrix = host_allocator.alloc_others = backends[backends.size() - 1].get_allocator(BufferType::Shared);
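Aside (not part of this commit): the thread-count hook used by the init_device lambda goes through ggml's optional per-backend entry points. A minimal sketch, assuming ggml-backend.h:

// Sketch: look up a backend's optional "ggml_backend_set_n_threads" entry point via its registry.
static void try_set_n_threads(ggml_backend_dev_t dev, ggml_backend_t backend, int n_threads)
{
    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
    auto fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
    if (fn)     // provided by the CPU backend (and a few others); GPU backends typically return nullptr here
        fn(backend, n_threads);
}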
@@ -619,28 +578,13 @@ namespace chatllm
        return p->observe_tensor_callback(t, p->observe_tensor_callback_data);
    }

- void BackendContext::compute_graph(ggml_cgraph *gf, int n_threads)
+ void BackendContext::compute_graph(ggml_cgraph *gf)
{
- #ifdef GGML_USE_METAL
-     if (ggml_backend_is_metal(backend_metal))
-     {
-         ggml_backend_metal_set_n_cb(backend_metal, n_threads);
-     }
- #endif
-
    if (backend_cpu != nullptr)
    {
-         ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
        ggml_backend_cpu_set_abort_callback(backend_cpu, abort_callback, abort_callback_data);
    }

- #ifdef GGML_USE_BLAS
-     if (backend_blas != nullptr)
-     {
-         ggml_backend_blas_set_n_threads(backend_blas, n_threads);
-     }
- #endif
-
    if (observe_tensor_callback)
        ggml_backend_sched_set_eval_callback(sched, _backend_sched_eval_callback, this);
    else
@@ -764,9 +708,9 @@ namespace chatllm
        return dynamic_cast<LayerBufAllocator *>(get_allocator())->get_backend();
    }

- void ComputeContext::compute(int n_threads)
+ void ComputeContext::compute(void)
{
-     backend_context->compute_graph(get_cgraph(), n_threads);
+     backend_context->compute_graph(get_cgraph());
}

void ComputeContext::synchronize(void)