Commit 0561d51

Author: Judd

use api from ggml_backend to decouple from backends

1 parent 56c61da · commit 0561d51

File tree

9 files changed (+199 −232 lines)
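The gist of the change: instead of compile-time `#ifdef GGML_USE_CUDA/VULKAN/SYCL/CANN/METAL/BLAS` branches calling backend-specific functions, `src/backend.cpp` now goes through ggml's generic backend-registry/device API (`ggml_backend_dev_*`). As a rough, self-contained illustration of that API — not part of this commit, and assuming the `ggml-backend.h` header from a ggml version that ships the device registry used in the diff below:

```cpp
// Sketch only: enumerate the devices ggml has registered and print their
// name, type and memory, using nothing but the generic backend API.
#include <cstdio>
#include "ggml-backend.h"

int main()
{
    ggml_backend_load_all();   // load dynamically built backends, if any

    for (size_t i = 0; i < ggml_backend_dev_count(); i++)
    {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        size_t free = 0, total = 0;
        ggml_backend_dev_memory(dev, &free, &total);
        printf("#%zu %s: type=%d, free=%zu, total=%zu\n",
               i, ggml_backend_dev_name(dev), (int)ggml_backend_dev_type(dev), free, total);
    }
    return 0;
}
```

One loop of this shape replaces the per-backend `get_device_count`/`get_device_free_memory` code paths removed below.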


README.md
Lines changed: 1 addition & 1 deletion

@@ -47,7 +47,7 @@ pure C++ implementation based on [@ggerganov](https://github.com/ggerganov)'s [g
 
 ## Quick Start
 
-As simple as `main -i -m :model_id`. [Check it out](./docs/quick_start.md).
+As simple as `main_nim -i -m :model_id`. [Check it out](./docs/quick_start.md).
 
 ## Usage
 
docs/quick_start.md
Lines changed: 2 additions & 2 deletions

@@ -3,7 +3,7 @@
 For Windows users, the easies way is to download a release, extract it, and start chatting:
 
 ```
-main -i -m :qwen2:0.5b
+main_nim -i -m :qwen2:0.5b
 Downloading qwen2:0.5b
 |████████████████████████████████████████████████████████████| 100.0%
 ________ __ __ __ __ ___ (通义千问)
@@ -21,7 +21,7 @@ You >
 
 For Linux/MacOS (and Windows) users, build [binding](binding.md) and start chatting.
 
-Note: `main` built from [`main.nim`](../bindings/main.nim) supports model id and model downloading, while the _default_ `main`
+Note: `main_nim` built from [`main.nim`](../bindings/main.nim) supports model id and model downloading, while the _default_ `main`
 built from [`main.cpp`](../src/main.cpp) does not.
 
 ### Download Quantized Models

src/backend.cpp
Lines changed: 105 additions & 161 deletions

@@ -1,31 +1,9 @@
 #include <cstring>
 #include <set>
 #include <stdarg.h>
-#ifdef GGML_USE_CUDA
-# include "ggml-cuda.h"
-#elif defined(GGML_USE_VULKAN)
-# include "ggml-vulkan.h"
-extern void ggml_vk_print_devices_info(void);
-#elif defined(GGML_USE_SYCL)
-# include "ggml-sycl.h"
-#elif defined(GGML_USE_KOMPUTE)
-# include "ggml-kompute.h"
-#elif defined(GGML_USE_CANN)
-# include "ggml-cann.h"
-#endif
-
-#ifdef GGML_USE_BLAS
-# include "ggml-blas.h"
-#endif
-
-#ifdef GGML_USE_METAL
-# include "ggml-metal.h"
-#endif
 
 #include "backend.h"
-
 #include "basics.h"
-
 #include "ggml-cpu.h"
 
 namespace chatllm
@@ -287,33 +265,46 @@ namespace chatllm
         return id;
     }
 
-    ggml_backend_allocator ComputeManager::get_default_allocator_cpu(bool host_buffer, bool use_gpu)
+    void ComputeManager::init(void)
+    {
+#if GGML_BACKEND_DL
+        static bool initialized = false;
+        if (initialized) return;
+        initialized = true;
+
+        ggml_backend_load_all();
+#endif
+    }
+
+    std::string ComputeManager::dev_type_to_str(DeviceType type)
+    {
+        switch (type)
+        {
+        case DeviceType::CPU:
+            return "CPU";
+        case DeviceType::GPU:
+            return "GPU";
+        case DeviceType::ACCEL:
+            return "ACCEL";
+        default:
+            return "UNKNOWN";
+        }
+    }
+
+    ggml_backend_allocator ComputeManager::get_default_allocator_cpu(bool host_buffer, int gpu_id)
     {
         ggml_backend_allocator allocator = nullptr;
 
-        if (use_gpu)
+        if (gpu_id >= 0)
         {
-#if defined(GGML_USE_CUDA)
-            // host buffers should only be used when data is expected to be copied to/from the GPU
-            if (host_buffer) {
-                allocator = ggml_backend_cuda_host_buffer_type();
-            }
-#elif defined(GGML_USE_SYCL)
-            if (host_buffer) {
-                allocator = ggml_backend_sycl_host_buffer_type();
-            }
-#elif defined(GGML_USE_CPU_HBM)
-            allocator = ggml_backend_cpu_hbm_buffer_type();
-#elif defined(GGML_USE_VULKAN)
-            if (host_buffer) {
-                allocator = ggml_backend_vk_host_buffer_type();
-            }
-#endif
+            auto dev = ggml_backend_dev_get(gpu_id);
+            if (dev)
+                allocator = ggml_backend_dev_host_buffer_type(dev);
         }
 
-        if (allocator == nullptr) {
+        if (allocator == nullptr)
             allocator = ggml_backend_cpu_buffer_type();
-        }
+
         return allocator;
     }
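Side note on the new `get_default_allocator_cpu(bool host_buffer, int gpu_id)`: asking for a pinned/host buffer type no longer needs `ggml_backend_cuda_host_buffer_type()` and friends; the device itself is asked for one. A minimal sketch of the same fallback logic, with a hypothetical helper name and assuming `ggml-backend.h`:

```cpp
// Sketch only: prefer the device's host (pinned) buffer type, fall back to
// the plain CPU buffer type when there is no device or no host buffer type.
#include "ggml-backend.h"

static ggml_backend_buffer_type_t pick_host_buffer_type(int gpu_id)
{
    ggml_backend_buffer_type_t buft = nullptr;
    if (gpu_id >= 0)
    {
        ggml_backend_dev_t dev = ggml_backend_dev_get(gpu_id);
        if (dev)
            buft = ggml_backend_dev_host_buffer_type(dev);  // may return nullptr
    }
    return buft ? buft : ggml_backend_cpu_buffer_type();
}
```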

@@ -324,48 +315,23 @@ namespace chatllm
 
     int ComputeManager::get_device_count(void)
     {
-        int count = 1;
-#if defined(GGML_USE_CUDA)
-        count = ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_SYCL)
-        count = ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-        count = ggml_backend_vk_get_device_count();
-#elif defined(GGML_USE_CANN)
-        return ggml_backend_cann_get_device_count();
-#endif
-        return count;
-    }
-
-    ggml_backend_t ComputeManager::init_backend_device(int index)
-    {
-#if defined(GGML_USE_CUDA)
-        return ggml_backend_cuda_init(index);
-#elif defined(GGML_USE_SYCL)
-        return ggml_backend_sycl_init(index);
-#elif defined(GGML_USE_VULKAN)
-        return ggml_backend_vk_init(index);
-#elif defined(GGML_USE_CANN)
-        return ggml_backend_cann_init(index);
-#endif
-        return nullptr;
+        ComputeManager::init();
+        return (int)ggml_backend_dev_count();
+    }
+
+    ggml_backend_t ComputeManager::init_backend_device(int index, const char *param)
+    {
+        auto dev = ggml_backend_dev_get(index);
+        return dev ? ggml_backend_dev_init(dev, param) : nullptr;
     }
 
-    ggml_backend_allocator ComputeManager::get_default_allocator_offload(int gpu)
+    ggml_backend_allocator ComputeManager::get_default_allocator_offload(int device)
    {
        ggml_backend_allocator allocator = nullptr;
 
-#if defined(GGML_USE_METAL)
-        allocator = ggml_backend_metal_buffer_type();
-#elif defined(GGML_USE_CUDA)
-        allocator = ggml_backend_cuda_buffer_type(gpu);
-#elif defined(GGML_USE_VULKAN)
-        allocator = ggml_backend_vk_buffer_type(gpu);
-#elif defined(GGML_USE_SYCL)
-        allocator = ggml_backend_sycl_buffer_type(gpu);
-#elif defined(GGML_USE_CANN)
-        allocator = ggml_backend_cann_buffer_type(gpu);
-#endif
+        auto dev = ggml_backend_dev_get(device);
+        if (dev)
+            allocator = ggml_backend_dev_buffer_type(dev);
 
         if (allocator == nullptr)
             allocator = get_default_allocator_cpu(true, true);
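With the registry in place, `get_device_count`, `init_backend_device` and `get_default_allocator_offload` each reduce to a device lookup plus one generic call. A standalone sketch of that two-step pattern (hypothetical helper names, not the project's API; assumes `ggml-backend.h`):

```cpp
// Sketch only: create a backend for a device index, and fetch the buffer type
// used for tensors offloaded to that device.
#include "ggml-backend.h"

static ggml_backend_t open_backend(int index, const char *params /* may be nullptr */)
{
    ggml_backend_dev_t dev = ggml_backend_dev_get(index);
    return dev ? ggml_backend_dev_init(dev, params) : nullptr;
}

static ggml_backend_buffer_type_t offload_buffer_type(int index)
{
    ggml_backend_dev_t dev = ggml_backend_dev_get(index);
    return dev ? ggml_backend_dev_buffer_type(dev) : ggml_backend_cpu_buffer_type();
}
```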
@@ -376,47 +342,42 @@ namespace chatllm
     {
         size_t total = 0;
         size_t free = 0;
-#if defined(GGML_USE_CUDA)
-        ggml_backend_cuda_get_device_memory(device, &free, &total);
-#elif defined(GGML_USE_SYCL)
-        ggml_backend_sycl_get_device_memory(device, &free, &total);
-#elif defined(GGML_USE_VULKAN)
-        ggml_backend_vk_get_device_memory(device, &free, &total);
-#elif defined(GGML_USE_CANN)
-        ggml_backend_cann_get_device_memory(device, &free, &total);
-#else
-        free = 1;
-#endif
+
+        auto dev = ggml_backend_dev_get(device);
+        if (dev)
+            ggml_backend_dev_memory(dev, &free, &total);
+
         if (p_total) *p_total = total;
         return free;
     }
 
-    int ComputeManager::get_max_devices(void)
+    bool ComputeManager::get_device_info(int device, DeviceInfo &info)
     {
-#if defined(GGML_USE_METAL)
-        return 1;
-#elif defined(GGML_USE_CUDA)
-        return GGML_CUDA_MAX_DEVICES;
-#elif defined(GGML_USE_SYCL)
-        return GGML_SYCL_MAX_DEVICES;
-#elif defined(GGML_USE_VULKAN)
-        return GGML_VK_MAX_DEVICES;
-#elif defined(GGML_USE_CANN)
-        return GGML_CANN_MAX_DEVICES;
-#else
-        return 1;
-#endif
+        auto dev = ggml_backend_dev_get(device);
+
+        if (nullptr == dev)
+            return false;
+
+        ggml_backend_dev_props props;
+        ggml_backend_dev_get_props(dev, &props);
+        info.type = (DeviceType)props.type;
+        info.backend_name = ggml_backend_reg_name(ggml_backend_dev_backend_reg(dev));
+        info.name = ggml_backend_dev_name(dev);
+        info.total_memory = props.memory_total;
+        info.free_memory = props.memory_free;
+        return true;
     }
 
-    bool ComputeManager::is_gpu_offload_supported(void)
+    void ComputeManager::get_devices_info(std::vector<DeviceInfo> &devs)
     {
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
-    defined(GGML_USE_SYCL) || defined(GGML_USE_CANN)
-        // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
-        return true;
-#else
-        return false;
-#endif
+        ComputeManager::init();
+        devs.clear();
+        for (int i = 0; i < get_device_count(); i++)
+        {
+            DeviceInfo info;
+            CHATLLM_CHECK(get_device_info(i, info)) << __func__ << ": failed to get device #" << i;
+            devs.push_back(info);
+        }
     }
 
     Backend::Backend(ggml_backend_t backend, int n_layers, bool use_gpu)
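The new `get_device_info`/`get_devices_info` are essentially thin wrappers around `ggml_backend_dev_get_props`, which reports name, type and memory in one struct; the backend name comes from the registry that owns the device. A reduced sketch of reading those properties (assuming `ggml-backend.h`; the helper name is hypothetical):

```cpp
// Sketch only: print the fields that DeviceInfo above is filled from.
#include <cstdio>
#include "ggml-backend.h"

static void print_device_props(ggml_backend_dev_t dev)
{
    ggml_backend_dev_props props;
    ggml_backend_dev_get_props(dev, &props);

    const char *backend = ggml_backend_reg_name(ggml_backend_dev_backend_reg(dev));
    printf("%s [%s]: type=%d, free=%zu, total=%zu\n",
           props.name, backend, (int)props.type, props.memory_free, props.memory_total);
}
```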
@@ -480,14 +441,15 @@ namespace chatllm
 
     BackendContext::BackendContext()
     {
+        ComputeManager::init();
     }
 
     bool BackendContext::is_using_gpu(void) const
     {
         return backends.size() > 1;
     }
 
-    void BackendContext::init(const std::vector<gpu_cfg> &gpu_cfgs, const int n_layers, const size_t graph_max_nodes_num)
+    void BackendContext::init(const std::vector<gpu_cfg> &gpu_cfgs, const int n_layers, const size_t graph_max_nodes_num, const int n_threads)
     {
         int prolog_id = -1;
         int epilog_id = -1;
@@ -505,49 +467,46 @@ namespace chatllm
 
         buf_compute_meta.resize(ggml_tensor_overhead() * graph_max_nodes_num + ggml_graph_overhead_custom(graph_max_nodes_num, false));
 
-        if (ComputeManager::is_gpu_offload_supported())
+        auto init_device = [this, use_gpu, n_threads](int device, ggml_backend_dev_t dev, int n_layers)
         {
-            int dev_cnt = ComputeManager::get_device_count();
-            for (int i = 0; i < dev_cnt; i++)
+            auto reg = ggml_backend_dev_backend_reg(dev);
+
+            ggml_backend_t backend = ComputeManager::init_backend_device(device);
+            CHATLLM_CHECK(backend != nullptr) << __func__ << ": failed to initialize backend: #" << device;
+            backends.emplace_back(backend, n_layers, use_gpu);
+
+            if (n_threads > 0)
             {
-                size_t total = 0;
-                size_t free = ComputeManager::get_device_free_memory(0, &total);
-                ggml::log(GGML_LOG_LEVEL_INFO, "Device %d memory: total = %zd, free = %zd\n", i, total, free);
+                auto set_n_threads = (ggml_backend_set_n_threads_t)ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+                if (set_n_threads)
+                    set_n_threads(backend, n_threads);
             }
-        }
+        };
 
-#if defined(GGML_USE_METAL)
-        if (use_gpu)
-        {
-            CHATLLM_CHECK(gpu_cfgs.size() == 1) << __func__ << ": Metal backends number must be 1\n";
-
-            backend_metal = ggml_backend_metal_init();
-            CHATLLM_CHECK(backend_metal != nullptr) << __func__ << ": failed to initialize Metal backend\n";
-            backends.emplace_back(backend_metal, n_gpu_layers, use_gpu);
-        }
-#elif defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL) || defined(GGML_USE_CANN)
         if (use_gpu)
         {
             for (auto cfg : gpu_cfgs)
             {
                 int device = cfg.id >= 0 ? cfg.id : 0;
-                ggml_backend_t backend = ComputeManager::init_backend_device(device);
-                CHATLLM_CHECK(backend != nullptr) << __func__ << ": failed to initialize backend: #" << device;
-                backends.emplace_back(backend, cfg.n_layers, use_gpu);
+                auto dev = ggml_backend_dev_get(device);
+                CHATLLM_CHECK(dev != nullptr) << __func__ << ": failed to found GPU device: #" << device;
+                CHATLLM_CHECK(ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) << __func__ << ": device #" << device << " is CPU";
+
+                init_device(device, dev, cfg.n_layers);
             }
         }
-#endif
 
-#ifdef GGML_USE_BLAS
-#error TODO
-        backend_blas = ggml_backend_blas_init();
-        if (backend_blas)
-            backends.emplace_back(ctx->backend_blas, 0);
-#endif
+        // append CPU backend
+        {
+            int device = ComputeManager::get_device_count() - 1;
+            auto dev = ggml_backend_dev_get(device);
+            CHATLLM_CHECK(dev != nullptr) << __func__ << ": failed to found CPU device: #" << device;
+            CHATLLM_CHECK(ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) << __func__ << ": device #" << device << " is not CPU, but " << ggml_backend_dev_type(dev);
+
+            init_device(device, dev, n_layers - n_gpu_layers);
 
-        backend_cpu = ggml_backend_cpu_init();
-        CHATLLM_CHECK(backend_cpu != nullptr) << __func__ << ": failed to initialize CPU backend";
-        backends.emplace_back(backend_cpu, n_layers - n_gpu_layers, use_gpu);
+            backend_cpu = backends[backends.size() - 1].backend;
+        }
 
         host_allocator.alloc_matrix = host_allocator.alloc_others = backends[backends.size() - 1].get_allocator(BufferType::Shared);
 
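The `init_device` lambda above also shows how backend-specific knobs survive the decoupling: optional entry points such as `ggml_backend_set_n_threads` are looked up by name on the backend's registry instead of being called directly. A minimal sketch of that lookup (assuming `ggml-backend.h`; `ggml_backend_set_n_threads_t` is the same function-pointer type used in the hunk, the helper name is hypothetical):

```cpp
// Sketch only: apply a thread count to a backend, but only if its registry
// exposes the optional "ggml_backend_set_n_threads" entry point.
#include "ggml-backend.h"

static void try_set_n_threads(ggml_backend_dev_t dev, ggml_backend_t backend, int n_threads)
{
    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
    auto fn = (ggml_backend_set_n_threads_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
    if (fn)
        fn(backend, n_threads);   // e.g. the CPU backend provides this
}
```

Because the thread count is now applied once per backend at init time, `compute_graph` in the next hunk no longer takes an `n_threads` argument.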

@@ -619,28 +578,13 @@ namespace chatllm
         return p->observe_tensor_callback(t, p->observe_tensor_callback_data);
     }
 
-    void BackendContext::compute_graph(ggml_cgraph *gf, int n_threads)
+    void BackendContext::compute_graph(ggml_cgraph *gf)
     {
-#ifdef GGML_USE_METAL
-        if (ggml_backend_is_metal(backend_metal))
-        {
-            ggml_backend_metal_set_n_cb(backend_metal, n_threads);
-        }
-#endif
-
         if (backend_cpu != nullptr)
         {
-            ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
             ggml_backend_cpu_set_abort_callback(backend_cpu, abort_callback, abort_callback_data);
         }
 
-#ifdef GGML_USE_BLAS
-        if (backend_blas != nullptr)
-        {
-            ggml_backend_blas_set_n_threads(backend_blas, n_threads);
-        }
-#endif
-
         if (observe_tensor_callback)
             ggml_backend_sched_set_eval_callback(sched, _backend_sched_eval_callback, this);
         else
@@ -764,9 +708,9 @@ namespace chatllm
         return dynamic_cast<LayerBufAllocator *>(get_allocator())->get_backend();
     }
 
-    void ComputeContext::compute(int n_threads)
+    void ComputeContext::compute(void)
     {
-        backend_context->compute_graph(get_cgraph(), n_threads);
+        backend_context->compute_graph(get_cgraph());
     }
 
     void ComputeContext::synchronize(void)
