DeepSeek-V4-Flash-GGUF / lcpp-dsv4-lid-combo.diff
sokann's picture
Update README.md, add .diff for 1M context
5322580
Raw
History Blame Contribute Delete
16.1 kB
diff --git a/ggml/include/ggml-rpc.h b/ggml/include/ggml-rpc.h
index 5ad121ae5..de770ee63 100644
--- a/ggml/include/ggml-rpc.h
+++ b/ggml/include/ggml-rpc.h
@@ -11,7 +11,7 @@ extern "C" {
#define RPC_PROTO_PATCH_VERSION 1
#ifdef __cplusplus
-static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
+static_assert(GGML_OP_COUNT == 98, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
#endif
#define GGML_RPC_MAX_SERVERS 16
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index d6807b6dd..a050211b1 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -568,6 +568,7 @@ extern "C" {
GGML_OP_RWKV_WKV7,
GGML_OP_SOLVE_TRI,
GGML_OP_GATED_DELTA_NET,
+ GGML_OP_LIGHTNING_INDEXER,
GGML_OP_UNARY,
@@ -2573,6 +2574,14 @@ extern "C" {
struct ggml_tensor * state,
int64_t K);
+ GGML_API struct ggml_tensor * ggml_lightning_indexer(
+ struct ggml_context * ctx,
+ struct ggml_tensor * q,
+ struct ggml_tensor * k,
+ struct ggml_tensor * weights,
+ float scale_embd,
+ float scale_heads);
+
// custom operators
typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index eb8341c9a..a290e6dad 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2051,6 +2051,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_gated_delta_net(params, tensor);
} break;
+ case GGML_OP_LIGHTNING_INDEXER:
+ {
+ ggml_compute_forward_lightning_indexer(params, tensor);
+ } break;
case GGML_OP_MAP_CUSTOM1:
{
ggml_compute_forward_map_custom1(params, tensor);
@@ -2371,6 +2375,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_FLASH_ATTN_BACK:
case GGML_OP_SSM_CONV:
case GGML_OP_SSM_SCAN:
+ case GGML_OP_LIGHTNING_INDEXER:
{
n_tasks = n_threads;
} break;
@@ -2956,6 +2961,12 @@ struct ggml_cplan ggml_graph_plan(
{
GGML_ABORT("fatal error");
}
+ case GGML_OP_LIGHTNING_INDEXER:
+ {
+ // temp buffer for dequantizing lightning indexer keys
+ const int64_t ne10 = node->src[1]->ne[0];
+ cur += sizeof(float)*ne10*n_tasks;
+ } break;
default:
break;
}
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 6724686b8..971798728 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -11512,3 +11512,76 @@ void ggml_compute_forward_fwht(const ggml_compute_params * params, ggml_tensor *
}
}
}
+
+// ggml_compute_forward_lightning_indexer
+
+void ggml_compute_forward_lightning_indexer(
+ const ggml_compute_params * params,
+ ggml_tensor * dst) {
+
+ const ggml_tensor * src0 = dst->src[0]; // q
+ const ggml_tensor * src1 = dst->src[1]; // k
+ const ggml_tensor * src2 = dst->src[2]; // weights
+
+ const float scale_embd = ggml_get_op_params_f32(dst, 0);
+ const float scale_heads = ggml_get_op_params_f32(dst, 1);
+
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT(src2->type == GGML_TYPE_F32);
+
+ GGML_TENSOR_TERNARY_OP_LOCALS
+
+ GGML_ASSERT( nb0 == sizeof(float));
+ GGML_ASSERT(nb00 == sizeof(float));
+
+ int n_embd = src0->ne[0];
+ int n_head = src0->ne[1];
+ int n_batch = src0->ne[2];
+ int n_stream = src0->ne[3];
+ int n_kv = src1->ne[2];
+
+ ggml_to_float_t const k_to_float = ggml_get_type_traits(src1->type)->to_float;
+ GGML_ASSERT((src1->type == GGML_TYPE_F32 || k_to_float) && "lightning indexer: unsupported K-type");
+
+ const int nr = n_kv;
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ // (temporary) buffer for K converted to float
+ float * src1_row_f32 = (float *) params->wdata + ith*(1*n_embd + CACHE_LINE_SIZE_F32);
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ for (int i_stream = 0; i_stream < n_stream; ++i_stream) {
+ for (int i_batch = 0; i_batch < n_batch; ++i_batch) {
+ for (int i_kv = ir0; i_kv < ir1; ++i_kv) {
+ char * src1_row = (char *) src1->data + i_kv*nb12 + i_stream*nb13;
+ if (k_to_float) {
+ k_to_float(src1_row, src1_row_f32, n_embd);
+ } else {
+ src1_row_f32 = (float *) src1_row;
+ }
+ float * src2_row = (float *) ((char *) src2->data + i_batch*nb21 + i_stream*nb23);
+ float * dst_row = (float *) ((char *) dst->data + i_batch*nb1 + i_stream*nb3);
+ float score = 0.0f;
+ for (int i_head = 0; i_head < n_head; ++i_head) {
+ // dot product of q and k for head i_head
+ float qk = 0.0f;
+ float * src0_row = (float *) ((char *) src0->data + i_head*nb01 + i_batch*nb02 + i_stream*nb03);
+ ggml_vec_dot_f32(n_embd, &qk, 0, src0_row, 0, src1_row_f32, 0, 1);
+ qk *= scale_embd;
+ // ReLU and weights
+ score += MAX(qk, 0.0f) * src2_row[i_head];
+ }
+ score *= scale_heads;
+ dst_row[i_kv] = score;
+ }
+ }
+ }
+}
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
index a8e18c716..e956c25d3 100644
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@@ -105,6 +105,7 @@ void ggml_compute_forward_rwkv_wkv7(const struct ggml_compute_params * params, s
void ggml_compute_forward_solve_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_gla(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_gated_delta_net(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_lightning_indexer(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_map_custom1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_map_custom2(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_map_custom3(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
index c4f08091e..33a38c23e 100644
--- a/ggml/src/ggml-cuda/argsort.cu
+++ b/ggml/src/ggml-cuda/argsort.cu
@@ -28,6 +28,20 @@ static __global__ void init_offsets(int * offsets, const int ncols, const int nr
#endif // STRIDED_ITERATOR_AVAILABLE
#ifdef GGML_CUDA_USE_CUB
+
+// returns the suggested maximum number of rows to process during one argsort_f32_i32_cuda_cub() call
+int argsort_f32_i32_cuda_cub_chunk_nrows(const size_t nb01, const int64_t nrows) {
+ // perform argsort in chunks up to approximately this size (currently 64MB)
+ // to avoid excessive temporary buffers memory usage
+ const int chunk_bytes = 1 << 26;
+
+ // calculate how many rows will fit in one chunk (must be at least one)
+ const int chunk_nrows = chunk_bytes > nb01 ? chunk_bytes / nb01 : 1;
+
+ // limit the resulting amount to total nrows
+ return nrows < chunk_nrows ? nrows : chunk_nrows;
+}
+
void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
const float * x,
int * dst,
@@ -254,11 +268,22 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const size_t shared_mem = ncols_pad * sizeof(int);
const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
- if (shared_mem > max_shared_mem || ncols > 1024) {
- ggml_cuda_pool & pool = ctx.pool();
- argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, nrows, order, stream);
- } else {
- argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
+ // early return if we can use bitonic argsort
+ if (shared_mem <= max_shared_mem && ncols <= 1024) {
+ return argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
+ }
+
+ const int chunk_nrows = argsort_f32_i32_cuda_cub_chunk_nrows(src0->nb[1], nrows);
+
+ ggml_cuda_pool & pool = ctx.pool();
+
+ for (int64_t i = 0; i < nrows; i += chunk_nrows) {
+ int iter_nrows = chunk_nrows < nrows - i ? chunk_nrows : nrows - i;
+
+ argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, iter_nrows, order, stream);
+
+ src0_d += ncols * iter_nrows;
+ dst_d += ncols * iter_nrows;
}
#else
argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
diff --git a/ggml/src/ggml-cuda/argsort.cuh b/ggml/src/ggml-cuda/argsort.cuh
index 22b7306f2..3abb6448a 100644
--- a/ggml/src/ggml-cuda/argsort.cuh
+++ b/ggml/src/ggml-cuda/argsort.cuh
@@ -3,6 +3,7 @@
void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
#ifdef GGML_CUDA_USE_CUB
+int argsort_f32_i32_cuda_cub_chunk_nrows(const size_t nb01, const int64_t nrows);
void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
const float * x,
int * dst,
diff --git a/ggml/src/ggml-cuda/top-k.cu b/ggml/src/ggml-cuda/top-k.cu
index db1d39e2d..5e708e6c5 100644
--- a/ggml/src/ggml-cuda/top-k.cu
+++ b/ggml/src/ggml-cuda/top-k.cu
@@ -75,17 +75,26 @@ void ggml_cuda_op_top_k(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int ncols_pad = next_power_of_2(ncols);
const size_t shared_mem = ncols_pad * sizeof(int);
const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
+ const bool use_bitonic = shared_mem <= max_shared_mem && ncols <= 1024;
+ const int chunk_nrows = argsort_f32_i32_cuda_cub_chunk_nrows(src0->nb[1], nrows);
- ggml_cuda_pool_alloc<int> temp_dst_alloc(pool, ncols * nrows);
+ ggml_cuda_pool_alloc<int> temp_dst_alloc(pool, ncols * chunk_nrows);
int * tmp_dst = temp_dst_alloc.get();
- if (shared_mem > max_shared_mem || ncols > 1024) {
- argsort_f32_i32_cuda_cub(pool, src0_d, tmp_dst, ncols, nrows, GGML_SORT_ORDER_DESC, stream);
- } else {
- argsort_f32_i32_cuda_bitonic(src0_d, tmp_dst, ncols, nrows, GGML_SORT_ORDER_DESC, stream);
+ for (int64_t i = 0; i < nrows; i += chunk_nrows) {
+ int iter_nrows = chunk_nrows < nrows - i ? chunk_nrows : nrows - i;
+
+ if (use_bitonic) {
+ argsort_f32_i32_cuda_bitonic(src0_d, tmp_dst, ncols, iter_nrows, GGML_SORT_ORDER_DESC, stream);
+ } else {
+ argsort_f32_i32_cuda_cub(pool, src0_d, tmp_dst, ncols, iter_nrows, GGML_SORT_ORDER_DESC, stream);
+ }
+ CUDA_CHECK(cudaMemcpy2DAsync(dst_d, k * sizeof(int), tmp_dst, ncols * sizeof(int), k * sizeof(int), iter_nrows,
+ cudaMemcpyDeviceToDevice, stream));
+
+ src0_d += ncols * iter_nrows;
+ dst_d += k * iter_nrows;
}
- CUDA_CHECK(cudaMemcpy2DAsync(dst_d, k * sizeof(int), tmp_dst, ncols * sizeof(int), k * sizeof(int), nrows,
- cudaMemcpyDeviceToDevice, stream));
#else // GGML_CUDA_USE_CUB
ggml_cuda_pool_alloc<int> temp_dst_alloc(pool, ncols * nrows);
int * tmp_dst = temp_dst_alloc.get();
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 0f682fd18..31a5cbc97 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1061,6 +1061,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"RWKV_WKV7",
"SOLVE_TRI",
"GATED_DELTA_NET",
+ "LIGHTNING_INDEXER",
"UNARY",
@@ -1078,7 +1079,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"GLU",
};
-static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97");
+static_assert(GGML_OP_COUNT == 98, "GGML_OP_COUNT != 98");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1172,6 +1173,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"rwkv_wkv7(r, w, k, v, a, b, s)",
"A X = B, A triangular, solve X",
"gated_delta_net(q, k, v, g, beta, s)",
+ "lightning_indexer(q, k, weights, scale_embd, scale_heads)",
"unary(x)",
@@ -1189,7 +1191,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"glu(x)",
};
-static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97");
+static_assert(GGML_OP_COUNT == 98, "GGML_OP_COUNT != 98");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -6268,6 +6270,40 @@ struct ggml_tensor * ggml_gated_delta_net(
return result;
}
+// ggml_lightning_indexer
+
+struct ggml_tensor * ggml_lightning_indexer(
+ struct ggml_context * ctx,
+ struct ggml_tensor * q,
+ struct ggml_tensor * k,
+ struct ggml_tensor * weights,
+ float scale_embd,
+ float scale_heads) {
+
+ GGML_ASSERT(q->type == GGML_TYPE_F32);
+ GGML_ASSERT(weights->type == GGML_TYPE_F32);
+ GGML_ASSERT(q->ne[0] == k->ne[0]);
+ GGML_ASSERT(q->ne[1] == weights->ne[0]);
+ GGML_ASSERT(k->ne[1] == 1);
+ GGML_ASSERT(q->ne[2] == weights->ne[1]);
+ GGML_ASSERT(weights->ne[2] == 1);
+ GGML_ASSERT(q->ne[3] == k->ne[3]);
+ GGML_ASSERT(k->ne[3] == weights->ne[3]);
+
+ int64_t ne[4] = { k->ne[2], q->ne[2], 1, q->ne[3] };
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+ ggml_set_op_params_f32(result, 0, scale_embd);
+ ggml_set_op_params_f32(result, 1, scale_heads);
+
+ result->op = GGML_OP_LIGHTNING_INDEXER;
+ result->src[0] = q;
+ result->src[1] = k;
+ result->src[2] = weights;
+
+ return result;
+}
+
////////////////////////////////////////////////////////////////////////////////
struct ggml_hash_set ggml_hash_set_new(size_t size) {
diff --git a/src/models/deepseek4.cpp b/src/models/deepseek4.cpp
index 759654228..38652ef83 100644
--- a/src/models/deepseek4.cpp
+++ b/src/models/deepseek4.cpp
@@ -582,21 +582,7 @@ ggml_tensor * llama_model_deepseek4::graph::build_lid_top_k(
indexer_weights->ne[0], indexer_weights->ne[1]/n_stream, indexer_weights->ne[2], n_stream,
indexer_weights->nb[1], indexer_weights->nb[2]/n_stream, indexer_weights->nb[3]/n_stream, 0);
- indexer_q = ggml_permute(ctx0, indexer_q, 0, 2, 1, 3);
- cb(indexer_q, "lid_q", il);
- indexer_k = ggml_permute(ctx0, indexer_k, 0, 2, 1, 3);
- cb(indexer_k, "lid_k", il);
-
- ggml_tensor * indexer_kq = ggml_mul_mat(ctx0, indexer_k, indexer_q);
- cb(indexer_kq, "lid_kq", il);
-
- indexer_kq = ggml_cont(ctx0, ggml_permute(ctx0, indexer_kq, 2, 1, 0, 3));
- cb(indexer_kq, "lid_kq", il);
-
- ggml_tensor * indexer_score = ggml_relu(ctx0, indexer_kq);
- indexer_score = ggml_mul(ctx0, indexer_score, indexer_weights);
- indexer_score = ggml_sum_rows(ctx0, indexer_score);
- indexer_score = ggml_cont(ctx0, ggml_permute(ctx0, indexer_score, 2, 1, 0, 3));
+ ggml_tensor * indexer_score = ggml_lightning_indexer(ctx0, indexer_q, indexer_k, indexer_weights, 1.0f, 1.0f);
cb(indexer_score, "lid_score", il);
indexer_score = ggml_add(ctx0, indexer_score, inp_lid.kq_mask);