ggml : sync latest llama.cpp
ggerganov committed Jun 19, 2023
1 parent 4e1e135 commit c251685
Showing 11 changed files with 9,163 additions and 1,046 deletions.
11 changes: 11 additions & 0 deletions examples/common-ggml.cpp
@@ -52,6 +52,11 @@ bool ggml_common_quantize_0(
         case GGML_FTYPE_ALL_F32:
         case GGML_FTYPE_MOSTLY_F16:
         case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
+        case GGML_FTYPE_MOSTLY_Q2_K:
+        case GGML_FTYPE_MOSTLY_Q3_K:
+        case GGML_FTYPE_MOSTLY_Q4_K:
+        case GGML_FTYPE_MOSTLY_Q5_K:
+        case GGML_FTYPE_MOSTLY_Q6_K:
             {
                 fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
                 return false;
@@ -187,6 +192,12 @@ bool ggml_common_quantize_0(
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:
         case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_Q8_K:
         case GGML_TYPE_COUNT:
             {
                 fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
178 changes: 168 additions & 10 deletions include/ggml/ggml.h
@@ -241,6 +241,13 @@ extern "C" {
         GGML_TYPE_Q5_1 = 7,
         GGML_TYPE_Q8_0 = 8,
         GGML_TYPE_Q8_1 = 9,
+        // k-quantizations
+        GGML_TYPE_Q2_K = 10,
+        GGML_TYPE_Q3_K = 11,
+        GGML_TYPE_Q4_K = 12,
+        GGML_TYPE_Q5_K = 13,
+        GGML_TYPE_Q6_K = 14,
+        GGML_TYPE_Q8_K = 15,
         GGML_TYPE_I8,
         GGML_TYPE_I16,
         GGML_TYPE_I32,
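The six new k-quant type IDs slot in after GGML_TYPE_Q8_1. As a minimal usage sketch — assuming the ggml_quantize_chunk helper declared in the elided quantization section of this header, with the signature it had in llama.cpp at the time — quantizing a float buffer to one of these types looks like:

    #include "ggml.h"

    // Hedged sketch: n should be a multiple of the k-quant super-block size
    // (256 elements); hist collects per-bucket statistics filled in by ggml.
    size_t quantize_to_q4_k(const float * src, void * dst, int n) {
        int64_t hist[16] = {0};
        return ggml_quantize_chunk(GGML_TYPE_Q4_K, src, dst, 0, n, hist);
    }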
@@ -249,8 +256,8 @@ extern "C" {
 
     enum ggml_backend {
         GGML_BACKEND_CPU = 0,
-        GGML_BACKEND_CUDA = 1,
-        GGML_BACKEND_CL = 2,
+        GGML_BACKEND_GPU = 10,
+        GGML_BACKEND_GPU_SPLIT = 20,
     };
 
     // model file types
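The backend enum drops the API-specific CUDA/CL values in favor of generic GPU ones, with numeric gaps left between values. A hypothetical helper (not part of the API; it assumes the tensor's backend field) that treats both GPU modes as off-CPU might look like:

    static bool tensor_is_on_gpu(const struct ggml_tensor * t) {
        // GGML_BACKEND_GPU_SPLIT shards a tensor across devices, but its
        // data is still not in host memory
        return t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT;
    }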
@@ -264,6 +271,11 @@ extern "C" {
         GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
     };
 
     // available tensor operations:
@@ -284,6 +296,7 @@ extern "C" {
         GGML_OP_SUM_ROWS,
         GGML_OP_MEAN,
         GGML_OP_REPEAT,
+        GGML_OP_REPEAT_BACK,
         GGML_OP_ABS,
         GGML_OP_SGN,
         GGML_OP_NEG,
@@ -298,6 +311,7 @@ extern "C" {
         GGML_OP_RMS_NORM_BACK,
 
         GGML_OP_MUL_MAT,
+        GGML_OP_OUT_PROD,
 
         GGML_OP_SCALE,
         GGML_OP_SET,
@@ -313,6 +327,7 @@ extern "C" {
         GGML_OP_DIAG_MASK_INF,
         GGML_OP_DIAG_MASK_ZERO,
         GGML_OP_SOFT_MAX,
+        GGML_OP_SOFT_MAX_BACK,
         GGML_OP_ROPE,
         GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,
@@ -323,12 +338,16 @@ extern "C" {
 
         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,
+        GGML_OP_FLASH_ATTN_BACK,
         GGML_OP_WIN_PART,
         GGML_OP_WIN_UNPART,
 
         GGML_OP_MAP_UNARY,
         GGML_OP_MAP_BINARY,
 
+        GGML_OP_CROSS_ENTROPY_LOSS,
+        GGML_OP_CROSS_ENTROPY_LOSS_BACK,
+
         GGML_OP_COUNT,
     };
 
@@ -379,7 +398,9 @@ extern "C" {
 
         char name[GGML_MAX_NAME];
 
-        char padding[16];
+        void * extra; // extra things e.g. for ggml-cuda.cu
+
+        char padding[4];
     };
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -417,6 +438,25 @@ extern "C" {
         bool no_alloc; // don't allocate memory for the tensor data
     };
 
+
+    // compute types
+    enum ggml_task_type {
+        GGML_TASK_INIT = 0,
+        GGML_TASK_COMPUTE,
+        GGML_TASK_FINALIZE,
+    };
+
+    struct ggml_compute_params {
+        enum ggml_task_type type;
+
+        // ith = thread index, nth = number of threads
+        int ith, nth;
+
+        // work buffer for all threads
+        size_t wsize;
+        void * wdata;
+    };
+
     // misc
 
     GGML_API void ggml_time_init(void); // call this once at the beginning of the program
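The newly exposed ggml_compute_params carries the per-thread work split used by the graph executor. A minimal sketch of the row-partitioning pattern a kernel typically implements with ith/nth (illustrative only; compute_rows is not a real ggml function, and it uses ggml_nrows from this same diff):

    static void compute_rows(const struct ggml_compute_params * params,
                             const struct ggml_tensor * src) {
        if (params->type != GGML_TASK_COMPUTE) {
            return; // nothing to do in the INIT/FINALIZE phases in this sketch
        }
        const int64_t nr  = ggml_nrows(src);
        const int64_t dr  = (nr + params->nth - 1)/params->nth; // rows per thread
        const int64_t ir0 = dr*params->ith;                     // first row
        const int64_t ir1 = ir0 + dr < nr ? ir0 + dr : nr;      // last row (exclusive)
        for (int64_t ir = ir0; ir < ir1; ++ir) {
            // ... process row ir, using params->wdata as shared scratch ...
        }
    }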
@@ -428,8 +468,10 @@ extern "C" {
     GGML_API void    ggml_print_object (const struct ggml_object * obj);
     GGML_API void    ggml_print_objects(const struct ggml_context * ctx);
 
-    GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
-    GGML_API size_t  ggml_nbytes   (const struct ggml_tensor * tensor);
+    GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
+    GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
 
     GGML_API int    ggml_blck_size (enum ggml_type type);
     GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
@@ -445,21 +487,26 @@ extern "C" {
     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
     GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
+
+    // use this to compute the memory overhead of a tensor
+    GGML_API size_t ggml_tensor_overhead(void);
 
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
-    GGML_API void ggml_free(struct ggml_context * ctx);
+    GGML_API void                  ggml_free(struct ggml_context * ctx);
 
     GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
 
     GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
     GGML_API void   ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
 
-    GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
-    GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx);
+    GGML_API void * ggml_get_mem_buffer     (const struct ggml_context * ctx);
+    GGML_API size_t ggml_get_mem_size       (const struct ggml_context * ctx);
+    GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);
 
     GGML_API struct ggml_tensor * ggml_new_tensor(
             struct ggml_context * ctx,
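The const-qualified getters and the new ggml_get_max_tensor_size make it easier to audit a context's memory pool. A minimal sketch, assuming a fixed 16 MiB pool (all calls are declared in this diff; the pool size is an arbitrary choice):

    #include <stdio.h>
    #include "ggml.h"

    void report_ctx_memory(void) {
        struct ggml_init_params ip = {
            /*.mem_size   =*/ 16*1024*1024, // assumed pool size
            /*.mem_buffer =*/ NULL,         // let ggml allocate internally
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(ip);
        // ... create tensors here ...
        printf("used: %zu bytes, largest tensor: %zu bytes\n",
               ggml_used_mem(ctx), ggml_get_max_tensor_size(ctx));
        ggml_free(ctx);
    }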
@@ -639,6 +686,11 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
+    GGML_API struct ggml_tensor * ggml_repeat_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
     GGML_API struct ggml_tensor * ggml_abs(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
@@ -736,14 +788,22 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
-    // A: m rows, n columns
-    // B: p rows, n columns (i.e. we transpose it internally)
+    // A: n columns, m rows
+    // B: n columns, p rows (i.e. we transpose it internally)
+    // result is m columns, p rows
     GGML_API struct ggml_tensor * ggml_mul_mat(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
+    // A: m columns, n rows,
+    // B: p columns, n rows,
+    // result is m columns, p rows
+    GGML_API struct ggml_tensor * ggml_out_prod(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
     //
     // operations on tensors without backpropagation
     //
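The corrected comments use ggml's convention that ne[0] is the number of columns (the fastest dimension). A shape sketch for both products, following those comments (ctx, n, m, p assumed to come from the caller):

    static void shape_demo(struct ggml_context * ctx, int n, int m, int p) {
        // ggml_mul_mat: A is n columns x m rows, B is n columns x p rows
        struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, m);
        struct ggml_tensor * B = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, p);
        struct ggml_tensor * C = ggml_mul_mat(ctx, A, B);  // m columns, p rows

        // ggml_out_prod: D is m columns x n rows, E is p columns x n rows
        struct ggml_tensor * D = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, m, n);
        struct ggml_tensor * E = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, p, n);
        struct ggml_tensor * F = ggml_out_prod(ctx, D, E); // m columns, p rows
        (void) C; (void) F;
    }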
@@ -954,6 +1014,17 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    GGML_API struct ggml_tensor * ggml_soft_max_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
     // rotary position embedding
     // if mode & 1 == 1, skip n_past elements
     // if mode & 2 == 1, GPT-NeoX style
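The two mode bits combine as flags. A hedged sketch, assuming ggml_rope's contemporaneous signature ggml_rope(ctx, a, n_past, n_dims, mode) — the declaration itself is elided from this excerpt, and cur, n_past, and n_dims are placeholders:

    // mode & 1: skip the first n_past positions; mode & 2: GPT-NeoX style
    const int mode = 2; // NeoX-style rotation, no skipping
    struct ggml_tensor * rotated = ggml_rope(ctx, cur, n_past, n_dims, mode);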
@@ -1059,6 +1130,14 @@ extern "C" {
             struct ggml_tensor  * v,
             bool                  masked);
 
+    GGML_API struct ggml_tensor * ggml_flash_attn_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * q,
+            struct ggml_tensor  * k,
+            struct ggml_tensor  * v,
+            struct ggml_tensor  * d,
+            bool                  masked);
+
     GGML_API struct ggml_tensor * ggml_flash_ff(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
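ggml_flash_attn_back pairs with the existing fused attention op: it takes the same q/k/v plus d, the gradient flowing into the attention output, and is what the GGML_OP_FLASH_ATTN_BACK entry above maps to. A hedged sketch of the pairing (q, k, v, and d assumed to be built elsewhere):

    struct ggml_tensor * out  = ggml_flash_attn     (ctx, q, k, v, /*masked =*/ true);
    // d: gradient of the loss with respect to out
    struct ggml_tensor * grad = ggml_flash_attn_back(ctx, q, k, v, d, /*masked =*/ true);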
@@ -1102,6 +1181,19 @@ extern "C" {
             struct ggml_tensor  * b,
             ggml_binary_op_f32_t fun);
 
+    // loss function
+
+    GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * c);
+
     //
     // automatic differentiation
     //
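These back the new GGML_OP_CROSS_ENTROPY_LOSS ops used for training. A hedged sketch, assuming a carries logits and b the target distribution (the argument roles are not documented in the header; logits, targets, and dloss are placeholders):

    // scalar loss comparing logits against targets
    struct ggml_tensor * loss = ggml_cross_entropy_loss(ctx, logits, targets);
    // gradient wrt the logits, given dloss (gradient of the loss output)
    struct ggml_tensor * grad = ggml_cross_entropy_loss_back(ctx, logits, targets, dloss);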
@@ -1196,6 +1288,8 @@ extern "C" {
         struct {
             int n_iter;
 
+            float sched; // schedule multiplier (fixed, decay or warmup)
+            float decay; // weight decay for AdamW, use 0.0f to disable
             float alpha; // learning rate
             float beta1;
             float beta2;
@@ -1220,6 +1314,49 @@ extern "C" {
         } lbfgs;
     };
 
+    struct ggml_opt_context {
+        struct ggml_context * ctx;
+        struct ggml_opt_params params;
+
+        int iter;
+        int64_t nx; // number of parameter elements
+
+        bool just_initialized;
+
+        struct {
+            struct ggml_tensor * x;  // view of the parameters
+            struct ggml_tensor * g1; // gradient
+            struct ggml_tensor * g2; // gradient squared
+            struct ggml_tensor * m;  // first moment
+            struct ggml_tensor * v;  // second moment
+            struct ggml_tensor * mh; // first moment hat
+            struct ggml_tensor * vh; // second moment hat
+            struct ggml_tensor * pf; // past function values
+            float fx_best;
+            float fx_prev;
+            int n_no_improvement;
+        } adam;
+
+        struct {
+            struct ggml_tensor * x;    // current parameters
+            struct ggml_tensor * xp;   // previous parameters
+            struct ggml_tensor * g;    // current gradient
+            struct ggml_tensor * gp;   // previous gradient
+            struct ggml_tensor * d;    // search direction
+            struct ggml_tensor * pf;   // past function values
+            struct ggml_tensor * lmal; // the L-BFGS memory alpha
+            struct ggml_tensor * lmys; // the L-BFGS memory ys
+            struct ggml_tensor * lms;  // the L-BFGS memory s
+            struct ggml_tensor * lmy;  // the L-BFGS memory y
+            float fx_best;
+            float step;
+            int j;
+            int k;
+            int end;
+            int n_no_improvement;
+        } lbfgs;
+    };
+
     GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
 
     // optimize the function defined by the tensor f
@@ -1228,6 +1365,27 @@ extern "C" {
             struct ggml_opt_params params,
             struct ggml_tensor * f);
 
+    // initialize optimizer context
+    GGML_API void ggml_opt_init(
+            struct ggml_context * ctx,
+            struct ggml_opt_context * opt,
+            struct ggml_opt_params params,
+            int64_t nx);
+
+    // continue optimizing the function defined by the tensor f
+    GGML_API enum ggml_opt_result ggml_opt_resume(
+            struct ggml_context * ctx,
+            struct ggml_opt_context * opt,
+            struct ggml_tensor * f);
+
+    // continue optimizing the function defined by the tensor f
+    GGML_API enum ggml_opt_result ggml_opt_resume_g(
+            struct ggml_context * ctx,
+            struct ggml_opt_context * opt,
+            struct ggml_tensor * f,
+            struct ggml_cgraph * gf,
+            struct ggml_cgraph * gb);
+
     //
     // quantization
     //
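Together with the ggml_opt_context struct above, these split the one-shot ggml_opt into a resumable flow: initialize once, then call a resume variant per training step. A hedged sketch with Adam defaults (GGML_OPT_ADAM and enum ggml_opt_result come from elided parts of this header; model_params and the loss tensor f are placeholders):

    struct ggml_opt_params params = ggml_opt_default_params(GGML_OPT_ADAM);
    struct ggml_opt_context opt;
    ggml_opt_init(ctx, &opt, params, /*nx =*/ ggml_nelements(model_params));
    // the first call initializes the optimizer state; later calls continue it
    enum ggml_opt_result res = ggml_opt_resume(ctx, &opt, f);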
15 changes: 9 additions & 6 deletions scripts/sync-llama.sh
@@ -1,8 +1,11 @@
 #!/bin/bash
 
-cp -rpv ../llama.cpp/ggml.c        src/ggml.c
-cp -rpv ../llama.cpp/ggml-cuda.h   src/ggml-cuda.h
-cp -rpv ../llama.cpp/ggml-cuda.cu  src/ggml-cuda.cu
-cp -rpv ../llama.cpp/ggml-opencl.h src/ggml-opencl.h
-cp -rpv ../llama.cpp/ggml-opencl.c src/ggml-opencl.c
-cp -rpv ../llama.cpp/ggml.h        include/ggml/ggml.h
+cp -rpv ../llama.cpp/ggml.c           src/ggml.c
+cp -rpv ../llama.cpp/ggml-cuda.h      src/ggml-cuda.h
+cp -rpv ../llama.cpp/ggml-cuda.cu     src/ggml-cuda.cu
+cp -rpv ../llama.cpp/ggml-opencl.h    src/ggml-opencl.h
+cp -rpv ../llama.cpp/ggml-opencl.cpp  src/ggml-opencl.cpp
+cp -rpv ../llama.cpp/ggml-metal.h     src/ggml-metal.h
+cp -rpv ../llama.cpp/ggml-metal.m     src/ggml-metal.m
+cp -rpv ../llama.cpp/ggml-metal.metal src/ggml-metal.metal
+cp -rpv ../llama.cpp/ggml.h           include/ggml/ggml.h