CUDA: fix negative KV_max values in FA (#15321)

JohannesGaessler · web-flow · commit 4227c9be4268 · 2025-08-14T23:21:24.000+02:00
diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -539,11 +539,15 @@ static __global__ void flash_attn_mask_to_KV_max(
         all_inf = warp_reduce_all(all_inf);
 
         if (!all_inf) {
-            KV_max_sj += FATTN_KQ_STRIDE;
             break;
         }
     }
 
+    // If the break in the loop was not triggered, KV_max_sj is now -FATTN_KQ_STRIDE.
+    // If the break was triggered it's the lower edge of the tile with the first non-masked values.
+    // In either case, walk back the decrementation by FATTN_KQ_STRIDE.
+    KV_max_sj += FATTN_KQ_STRIDE;
+
     if (threadIdx.x != 0) {
         return;
     }

Original file line number	Diff line number	Diff line change
`@@ -539,11 +539,15 @@ static __global__ void flash_attn_mask_to_KV_max(`
`539`	`539`	`all_inf = warp_reduce_all(all_inf);`
`540`	`540`
`541`	`541`	`if (!all_inf) {`
`542`		`- KV_max_sj += FATTN_KQ_STRIDE;`
`543`	`542`	`break;`
`544`	`543`	`}`
`545`	`544`	`}`
`546`	`545`
	`546`	`+ // If the break in the loop was not triggered, KV_max_sj is now -FATTN_KQ_STRIDE.`
	`547`	`+ // If the break was triggered it's the lower edge of the tile with the first non-masked values.`
	`548`	`+ // In either case, walk back the decrementation by FATTN_KQ_STRIDE.`
	`549`	`+ KV_max_sj += FATTN_KQ_STRIDE;`
	`550`	`+`
`547`	`551`	`if (threadIdx.x != 0) {`
`548`	`552`	`return;`
`549`	`553`	`}`