ggml-qnn: refine ggml backend subsystem (#216)
zhouwg authored May 31, 2024
1 parent 1665cca commit bee4a4b
Showing 7 changed files with 341 additions and 187 deletions.
@@ -168,7 +168,7 @@ public static boolean isDebug() {
     public void initGlobal() {
         long startTime = System.currentTimeMillis();
         String buildTime = BuildConfig.BUILD_TIME;
-        CDEUtils.setReleaseMode(false);
+        CDEUtils.setReleaseMode(true);
         CDELog.j(TAG, "*************************enter initGlobal *********************************");
         CDELog.j(TAG, "buildTime: " + buildTime);
         CDELog.j(TAG, "init app");
189 changes: 147 additions & 42 deletions core/ggml/jni/ggml-jni-impl-external.cpp

Large diffs are not rendered by default.

155 changes: 153 additions & 2 deletions core/ggml/llamacpp/ggml-backend.c
@@ -280,21 +280,170 @@ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_ba
    return backend->iface.graph_plan_compute(backend, plan);
}

static ggml_backend_t g_cpu_backend = NULL;
static bool GGML_OP_HAS_INIT    [GGML_OP_COUNT] = { 0 };
static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 };
static void ggml_setup_op_has_task_pass(void) {
    { // INIT
        bool * p = GGML_OP_HAS_INIT;

        p[GGML_OP_ACC                ] = true;
        p[GGML_OP_MUL_MAT            ] = true;
        p[GGML_OP_MUL_MAT_ID         ] = true;
        p[GGML_OP_OUT_PROD           ] = true;
        p[GGML_OP_SET                ] = true;
        p[GGML_OP_GET_ROWS_BACK      ] = true;
        p[GGML_OP_DIAG_MASK_INF      ] = true;
        p[GGML_OP_DIAG_MASK_ZERO     ] = true;
        p[GGML_OP_CONV_TRANSPOSE_1D  ] = true;
        p[GGML_OP_CONV_TRANSPOSE_2D  ] = true;
        p[GGML_OP_FLASH_ATTN_BACK    ] = true;
        p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
        p[GGML_OP_ADD_REL_POS        ] = true;
    }

    { // FINALIZE
        bool * p = GGML_OP_HAS_FINALIZE;

        p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
    }
}

ggml_backend_t ggml_backend_get_default_cpu_backend() {
    if (NULL == g_cpu_backend) {
        ggml_backend_cpu_init();
    }

    return g_cpu_backend;
}

struct ggml_compute_state;
extern void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor, struct ggml_compute_state * state);
static enum ggml_status ggml_backend_graph_compute_mixed(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    enum ggml_status result = GGML_STATUS_SUCCESS;
    int node_n = -1;

    static bool is_first_call = true;
    if (is_first_call) {
        ggml_setup_op_has_task_pass();
        is_first_call = false;
    }

    struct ggml_cplan plan = ggml_graph_plan(cgraph, 1);
    if (plan.work_size > 0) {
        plan.work_data = (uint8_t *)(malloc(plan.work_size));
        if (NULL == plan.work_data) {
            return GGML_STATUS_ALLOC_FAILED;
        }
    }

    struct ggml_compute_params params = {
        /*.type  =*/ GGML_TASK_TYPE_FINALIZE,
        /*.ith   =*/ 0,
        /*.nth   =*/ 0,
        /*.wsize =*/ plan.work_size,
        /*.wdata =*/ plan.work_data
    };
    while (++node_n < cgraph->n_nodes) {
        struct ggml_tensor * node = cgraph->nodes[node_n];
        params.nth = 1;

        // skip empty nodes and no-op view/reshape nodes
        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE
            || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
            continue;
        }

        if (ggml_backend_supports_op(backend, node)) {
            // supported by 'backend': let the backend execute/offload this node
            //LOGGD("%s: running op %s (%s) with backend %s\n", __func__, node->name, ggml_op_name(node->op), ggml_backend_name(backend));
            if (backend->iface.offload_op != NULL) {
                backend->iface.offload_op(backend, node);
            }
        } else {
            // not supported by 'backend': execute this node on the CPU (single-threaded)
            //LOGGD("%s: error: op not supported %s (%s) with backend %s\n", __func__, node->name, ggml_op_name(node->op), ggml_backend_name(backend));
            if (GGML_OP_HAS_INIT[node->op]) {
                params.type = GGML_TASK_TYPE_INIT;
                ggml_compute_forward(&params, node, NULL);
            }
            params.type = GGML_TASK_TYPE_COMPUTE;
            ggml_compute_forward(&params, node, NULL);
            if (GGML_OP_HAS_FINALIZE[node->op]) {
                params.type = GGML_TASK_TYPE_FINALIZE;
                ggml_compute_forward(&params, node, NULL);
            }
        }
    }

    if (NULL != plan.work_data) {
        free(plan.work_data);
    }

    return result;
}

#ifdef GGML_USE_QNN
extern bool ggml_backend_is_qnn(ggml_backend_t backend);
#endif

static bool is_qnn_backend(ggml_backend_t backend) {
#ifdef GGML_USE_QNN
    return ggml_backend_is_qnn(backend);
#else
    GGML_UNUSED(backend);
    return false;
#endif
}

enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
+    enum ggml_status err = GGML_STATUS_SUCCESS;

    if (NULL == g_cpu_backend) {
        ggml_backend_cpu_init();
    }
    if (backend != g_cpu_backend) {
        if (is_qnn_backend(backend)) { // checking iface.offload_op != NULL would also match the sycl backend, so test for QNN explicitly
            // mixed inference between Qualcomm's CPU/GPU or CPU/NPU
            err = ggml_backend_graph_compute_mixed(backend, cgraph);
        } else { // keep the sycl backend and other existing backends on their own path
            err = backend->iface.graph_compute(backend, cgraph);
        }
    } else {
        // default path for the CPU backend
        err = backend->iface.graph_compute(backend, cgraph);
    }
    ggml_backend_synchronize(backend);
    return err;
}


enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    return backend->iface.graph_compute(backend, cgraph);
+    enum ggml_status err = GGML_STATUS_SUCCESS;

    if (NULL == g_cpu_backend) {
        ggml_backend_cpu_init();
    }
    if (backend != g_cpu_backend) {
        if (is_qnn_backend(backend)) { // checking iface.offload_op != NULL would also match the sycl backend, so test for QNN explicitly
            // mixed inference between Qualcomm's CPU/GPU or CPU/NPU
            err = ggml_backend_graph_compute_mixed(backend, cgraph);
        } else { // keep the sycl backend and other existing backends on their own path
            err = backend->iface.graph_compute(backend, cgraph);
        }
    } else {
        // default path for the CPU backend
        err = backend->iface.graph_compute(backend, cgraph);
    }

    return err;
}

bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    return backend->iface.supports_op(backend, op);
}

bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    if (is_qnn_backend(backend)) { // the QNN backend offloads inside the mixed compute path; keep the sycl backend and other existing backends unchanged
        return false;
    }

    if (backend->iface.offload_op != NULL) {
        return backend->iface.offload_op(backend, op);
    }
@@ -899,6 +1048,8 @@ ggml_backend_t ggml_backend_cpu_init(void) {
        /* .interface = */ cpu_backend_i,
        /* .context   = */ ctx
    };
    g_cpu_backend = cpu_backend;

    return cpu_backend;
}

1 change: 1 addition & 0 deletions core/ggml/llamacpp/ggml-backend.h
@@ -59,6 +59,7 @@ extern "C" {
    GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
    GGML_API size_t                ggml_backend_get_alignment(ggml_backend_t backend);
    GGML_API size_t                ggml_backend_get_max_size(ggml_backend_t backend);
    GGML_API ggml_backend_t        ggml_backend_get_default_cpu_backend(void);

    GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);

1 comment on commit bee4a4b


zhouwg (Owner, Author) commented on bee4a4b on May 31, 2024


#216 in this project
ggerganov/llama.cpp#7641 in upstream

Whisper, LLM, and MiniCPM-V inference using the QNN backend works as expected on a Xiaomi 14.

A new ggml backend can follow this style for mixed inference between CPU & GPU or CPU & NPU very easily, and then focus on tuning the performance bottlenecks for edge AI inference on Android phones, as sketched in the example below.
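
To make the intended usage concrete, here is a minimal caller-side sketch (not part of this commit). It assumes a hypothetical ggml_backend_qnn_init(device) initializer and the header name ggml-qnn.h, and it keeps tensor data in ordinary host memory so that the CPU fallback inside ggml_backend_graph_compute_mixed() can execute any node the backend does not support:

// A minimal caller-side sketch (not from this commit): build a tiny graph and let
// ggml_backend_graph_compute() dispatch it. With a QNN backend, nodes the backend
// cannot handle fall back to the single-threaded CPU path in
// ggml_backend_graph_compute_mixed(); with the CPU backend the usual path runs.
#include "ggml.h"
#include "ggml-backend.h"
#ifdef GGML_USE_QNN
#include "ggml-qnn.h"       // assumed header name for this project's QNN backend
#endif

static enum ggml_status run_small_graph(ggml_backend_t backend) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,    // keep tensor data in host memory
    };
    struct ggml_context * ctx = ggml_init(params);

    // C = A x B, a small FP32 matmul
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    ggml_set_f32(a, 1.0f);
    ggml_set_f32(b, 2.0f);
    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);

    // per-node dispatch: run on 'backend' if supported, otherwise CPU fallback
    enum ggml_status status = ggml_backend_graph_compute(backend, gf);

    ggml_free(ctx);
    return status;
}

int main(void) {
    ggml_backend_t backend = NULL;
#ifdef GGML_USE_QNN
    // hypothetical initializer: the real signature is defined by this project's QNN backend
    backend = ggml_backend_qnn_init(0 /* QNN device id: CPU/GPU/NPU */);
#endif
    if (backend == NULL) {
        backend = ggml_backend_cpu_init();
    }

    enum ggml_status status = run_small_graph(backend);
    ggml_backend_free(backend);
    return (status == GGML_STATUS_SUCCESS) ? 0 : 1;
}

With the CPU backend this degenerates to the existing iface.graph_compute path; with the QNN backend each node is either offloaded through iface.offload_op or executed by the CPU fallback, which is the mixed CPU & GPU / CPU & NPU inference described above.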

There are three known bugs in this commit (some UT cases in the JNI layer and a resource cleanup issue in LLM inference).
