Pull from head #1

Merged: 124 commits, May 29, 2024

Changes from 1 commit

Commits (124)
230c4b3
[CI/Test] fix swap test for multi gpu (#4689)
youkaichao May 8, 2024
89579a2
[Misc] Use vllm-flash-attn instead of flash-attn (#4686)
WoosukKwon May 8, 2024
f942efb
[Dynamic Spec Decoding] Auto-disable by the running queue size (#4592)
comaniac May 8, 2024
8b9241b
[Speculative decoding] [Bugfix] Fix overallocation in ngram + spec lo…
cadedaniel May 8, 2024
e288df0
[Bugfix] Fine-tune gptq_marlin configs to be more similar to marlin (…
alexm-neuralmagic May 9, 2024
16bc0a0
[Frontend] add tok/s speed metric to llm class when using tqdm (#4400)
MahmoudAshraf97 May 9, 2024
f12b20d
[Frontend] Move async logic outside of constructor (#4674)
DarkLight1337 May 9, 2024
190bc83
[Misc] Remove unnecessary ModelRunner imports (#4703)
WoosukKwon May 9, 2024
0ee535b
[Misc] Set block size at initialization & Fix test_model_runner (#4705)
WoosukKwon May 9, 2024
ff5abcd
[ROCm] Add support for Punica kernels on AMD GPUs (#3140)
kliuae May 9, 2024
a3c1245
[Bugfix] Fix CLI arguments in OpenAI server docs (#4709)
DarkLight1337 May 9, 2024
cea6443
[Bugfix] Update grafana.json (#4711)
robertgshaw2-neuralmagic May 9, 2024
be0c518
[Bugfix] Add logs for all model dtype casting (#4717)
mgoin May 9, 2024
ebce310
[Model] Snowflake arctic model implementation (#4652)
sfc-gh-hazhang May 9, 2024
379da6d
[Kernel] [FP8] Improve FP8 linear layer performance (#4691)
pcmoritz May 9, 2024
c833101
[Kernel] Refactor FP8 kv-cache with NVIDIA float8_e4m3 support (#4535)
comaniac May 10, 2024
208b71b
[Core][Distributed] refactor pynccl (#4591)
youkaichao May 10, 2024
e965d46
[Misc] Keep only one implementation of the create_dummy_prompt functi…
AllenDou May 10, 2024
51d4094
chunked-prefill-doc-syntax (#4603)
simon-mo May 10, 2024
64b77df
[Core]fix type annotation for `swap_blocks` (#4726)
jikunshang May 10, 2024
dac6a3f
[Misc] Apply a couple g++ cleanups (#4719)
stevegrubb May 10, 2024
6a0f617
[Core] Fix circular reference which leaked llm instance in local dev …
rkooo567 May 10, 2024
706588a
[Bugfix] Fix CLI arguments in OpenAI server docs (#4729)
AllenDou May 10, 2024
2e7796f
[Speculative decoding] CUDA graph support (#4295)
heeju-kim2 May 10, 2024
fcc2994
[CI] Nits for bad initialization of SeqGroup in testing (#4748)
robertgshaw2-neuralmagic May 10, 2024
4e12131
[Core][Test] fix function name typo in custom allreduce (#4750)
youkaichao May 10, 2024
e254497
[Model][Misc] Add e5-mistral-7b-instruct and Embedding API (#3734)
CatherineSue May 11, 2024
6eaccb7
[Model] Add support for IBM Granite Code models (#4636)
yikangshen May 12, 2024
a709e87
[CI/Build] Tweak Marlin Nondeterminism Issues (#4713)
robertgshaw2-neuralmagic May 13, 2024
a7be4d0
[CORE] Improvement in ranks code (#4718)
SwapnilDreams100 May 13, 2024
702bee4
[Core][Distributed] refactor custom allreduce to support multiple tp …
youkaichao May 13, 2024
350f9e1
[CI/Build] Move `test_utils.py` to `tests/utils.py` (#4425)
DarkLight1337 May 13, 2024
e7c46b9
[Scheduler] Warning upon preemption and Swapping (#4647)
rkooo567 May 13, 2024
0fca3cd
[Misc] Enhance attention selector (#4751)
WoosukKwon May 13, 2024
8bc68e1
[Frontend] [Core] perf: Automatically detect vLLM-tensorized model, u…
sangstar May 13, 2024
ce532ff
[Speculative decoding] Improve n-gram efficiency (#4724)
comaniac May 13, 2024
1356df5
[Kernel] Use flash-attn for decoding (#3648)
skrider May 13, 2024
33d3914
[Bugfix] Fix dynamic FP8 quantization for Mixtral (#4793)
pcmoritz May 13, 2024
ac1fbf7
[Doc] Shorten README by removing supported model list (#4796)
zhuohan123 May 13, 2024
4bfa7e7
[Doc] Add API reference for offline inference (#4710)
DarkLight1337 May 14, 2024
c579b75
[Doc] Add meetups to the doc (#4798)
zhuohan123 May 14, 2024
ccb63a8
[Core][Hash][Automatic Prefix caching] Accelerating the hashing funct…
KuntaiDu May 14, 2024
dc72402
[Bugfix][Doc] Fix CI failure in docs (#4804)
DarkLight1337 May 14, 2024
676a999
[Core] Add MultiprocessingGPUExecutor (#4539)
njhill May 14, 2024
29bc01b
Add 4th meetup announcement to readme (#4817)
simon-mo May 14, 2024
8a7cc25
Revert "[Kernel] Use flash-attn for decoding (#3648)" (#4820)
rkooo567 May 15, 2024
65bf2ac
[Core][2/N] Model runner refactoring part 2. Combine prepare prefill …
rkooo567 May 15, 2024
e9cdd2b
[CI/Build] Further decouple HuggingFace implementation from ours duri…
DarkLight1337 May 15, 2024
a5675d3
[Bugfix] Properly set distributed_executor_backend in ParallelConfig …
zifeitong May 15, 2024
361c461
[Doc] Highlight the fourth meetup in the README (#4842)
zhuohan123 May 15, 2024
fc0d9df
[Frontend] Re-enable custom roles in Chat Completions API (#4758)
DarkLight1337 May 15, 2024
52f8107
[Frontend] Support OpenAI batch file format (#4794)
wuisawesome May 15, 2024
30e7543
[Core] Implement sharded state loader (#4690)
aurickq May 16, 2024
973617a
[Speculative decoding][Re-take] Enable TP>1 speculative decoding (#4840)
comaniac May 16, 2024
5c34257
Add marlin unit tests and marlin benchmark script (#4815)
alexm-neuralmagic May 16, 2024
99caa49
[Kernel] add bfloat16 support for gptq marlin kernel (#4788)
jinzhen-lin May 16, 2024
dbc0754
[docs] Fix typo in examples filename openi -> openai (#4864)
wuisawesome May 16, 2024
5e0391c
[Frontend] Separate OpenAI Batch Runner usage from API Server (#4851)
wuisawesome May 16, 2024
9216b9c
[Bugfix] Bypass authorization API token for preflight requests (#4862)
dulacp May 16, 2024
6979ade
Add GPTQ Marlin 2:4 sparse structured support (#4790)
alexm-neuralmagic May 16, 2024
f09edd8
Add JSON output support for benchmark_latency and benchmark_throughpu…
simon-mo May 16, 2024
b5853f9
[ROCm][AMD][Bugfix] adding a missing triton autotune config (#4845)
hongxiayang May 16, 2024
e081880
[Core][Distributed] remove graph mode function (#4818)
youkaichao May 16, 2024
10fa9ee
[Misc] remove old comments (#4866)
youkaichao May 16, 2024
8435b20
[Kernel] Add punica dimension for Qwen1.5-32B LoRA (#4850)
Silencioo May 16, 2024
2060e93
[Kernel] Add w8a8 CUTLASS kernels (#4749)
tlrmchlsmth May 16, 2024
9a31a81
[Bugfix] Fix FP8 KV cache support (#4869)
WoosukKwon May 16, 2024
8e7fb5d
Support to serve vLLM on Kubernetes with LWS (#4829)
kerthcet May 16, 2024
0150a10
[Frontend] OpenAI API server: Do not add bos token by default when en…
bofenghuang May 17, 2024
2614812
[Build/CI] Extending the set of AMD tests with Regression, Basic Corr…
Alexei-V-Ivanov-AMD May 17, 2024
33e0823
[Bugfix] fix rope error when load models with different dtypes (#4835)
jinzhen-lin May 17, 2024
48d5985
Sync huggingface modifications of qwen Moe model (#4774)
eigen2017 May 17, 2024
c5711ef
[Doc] Update Ray Data distributed offline inference example (#4871)
Yard1 May 17, 2024
86b45ae
[Bugfix] Relax tiktoken to >= 0.6.0 (#4890)
mgoin May 17, 2024
c0724fc
[ROCm][Hardware][AMD] Adding Navi21 to fallback to naive attention if…
alexeykondrat May 18, 2024
2e9a222
[Lora] Support long context lora (#4787)
rkooo567 May 18, 2024
f68470e
[Bugfix][Model] Add base class for vision-language models (#4809)
DarkLight1337 May 19, 2024
27ce854
[Kernel] Add marlin_24 unit tests (#4901)
alexm-neuralmagic May 19, 2024
b57e6c5
[Kernel] Add flash-attn back (#4907)
WoosukKwon May 20, 2024
6287537
[Model] LLaVA model refactor (#4910)
DarkLight1337 May 20, 2024
da5a0b5
Remove marlin warning (#4918)
alexm-neuralmagic May 20, 2024
546a97e
[Misc]: allow user to specify port in distributed setting (#4914)
ZwwWayne May 20, 2024
943e72c
[Build/CI] Enabling AMD Entrypoints Test (#4834)
Alexei-V-Ivanov-AMD May 20, 2024
f0eecee
[Bugfix] Fix dummy weight for fp8 (#4916)
mzusman May 20, 2024
1937e29
[Core] Sharded State Loader download from HF (#4889)
aurickq May 20, 2024
c3af447
[Doc]Add documentation to benchmarking script when running TGI (#4920)
KuntaiDu May 20, 2024
65ae8c2
[Core] Fix scheduler considering "no LoRA" as "LoRA" (#4897)
Yard1 May 21, 2024
d130b57
[Model] add rope_scaling support for qwen2 (#4930)
hzhwcmhf May 21, 2024
f12c3b5
[Model] Add Phi-2 LoRA support (#4886)
Isotr0py May 21, 2024
e941f88
[Docs] Add acknowledgment for sponsors (#4925)
simon-mo May 21, 2024
757b62c
[CI/Build] Codespell ignore `build/` directory (#4945)
mgoin May 21, 2024
14772ee
[Bugfix] Fix flag name for `max_seq_len_to_capture` (#4935)
kerthcet May 21, 2024
99eff67
[Bugfix][Kernel] Add head size check for attention backend selection …
Isotr0py May 21, 2024
9b9a10d
[Frontend] Dynamic RoPE scaling (#4638)
sasha0552 May 22, 2024
5f6d10c
[CI/Build] Enforce style for C++ and CUDA code with `clang-format` (#…
mgoin May 22, 2024
c74c913
[misc] remove comments that were supposed to be removed (#4977)
rkooo567 May 22, 2024
8674f98
[Kernel] Fixup for CUTLASS kernels in CUDA graphs (#4954)
tlrmchlsmth May 22, 2024
a3a73ab
[Misc] Load FP8 kv-cache scaling factors from checkpoints (#4893)
comaniac May 22, 2024
97b0300
[Model] LoRA gptbigcode implementation (#3949)
raywanb May 22, 2024
eb6d3c2
[Core] Eliminate parallel worker per-step task scheduling overhead (#…
njhill May 22, 2024
a36de68
[Minor] Fix small typo in llama.py: QKVParallelLinear -> Quantization…
pcmoritz May 22, 2024
ee3eea0
[Misc] Take user preference in attention selector (#4960)
comaniac May 22, 2024
6066253
Marlin 24 prefill performance improvement (about 25% better on averag…
alexm-neuralmagic May 23, 2024
2ba80be
[Bugfix] Update Dockerfile.cpu to fix NameError: name 'vllm_ops' is n…
LetianLee May 23, 2024
5eda2ea
[Core][1/N] Support send/recv in PyNCCL Groups (#4988)
andoorve May 23, 2024
a124232
[Kernel] Initial Activation Quantization Support (#4525)
dsikka May 23, 2024
e3470f8
[Core]: Option To Use Prompt Token Ids Inside Logits Processor (#4985)
kezouke May 23, 2024
6a50f4c
[Doc] add ccache guide in doc (#5012)
youkaichao May 23, 2024
9197709
[Bugfix] Fix Mistral v0.3 Weight Loading (#5005)
robertgshaw2-neuralmagic May 24, 2024
e64fde4
[Core][Bugfix]: fix prefix caching for blockv2 (#4764)
leiwen83 May 24, 2024
8e192ff
[Kernel][Backend][Model] Blocksparse flash attention kernel and Phi-3…
linxihui May 25, 2024
325c119
[Misc] add logging level env var (#5045)
youkaichao May 25, 2024
d5a1697
[Dynamic Spec Decoding] Minor fix for disabling speculative decoding …
LiuXiaoxuanPKU May 25, 2024
f17a1a8
[Misc] Make Serving Benchmark More User-friendly (#5044)
ywang96 May 25, 2024
1102bef
[Bugfix / Core] Prefix Caching Guards (merged with main) (#4846)
zhuohan123 May 27, 2024
fbdb7b3
[Core] Allow AQLM on Pascal (#5058)
sasha0552 May 27, 2024
890aa93
[Model] Add support for falcon-11B (#5069)
Isotr0py May 27, 2024
d4f3985
[Core] Sliding window for block manager v2 (#4545)
mmoskal May 28, 2024
9ba4155
[BugFix] Fix Embedding Models with TP>1 (#5075)
robertgshaw2-neuralmagic May 28, 2024
dd8de11
[Kernel][ROCm][AMD] Add fused_moe Triton configs for MI300X (#4951)
divakar-amd May 28, 2024
290f4ad
[Docs] Add Dropbox as sponsors (#5089)
simon-mo May 28, 2024
5ae5ed1
[Core] Consolidate prompt arguments to LLM engines (#4328)
DarkLight1337 May 28, 2024
dfba529
[Bugfix] Remove the last EOS token unless explicitly specified (#5077)
jsato8094 May 29, 2024
616e600
[Misc] add gpu_memory_utilization arg (#5079)
pandyamarut May 29, 2024
[Core][Bugfix]: fix prefix caching for blockv2 (vllm-project#4764)
Co-authored-by: Lei Wen <wenlei03@qiyi.com>
leiwen83 and wenlei03 committed May 24, 2024
commit e64fde4b013cb8bb2321f59ba78aca50b02071cb
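
This commit fixes prefix caching for block manager v2. Per the diff below, promoting a mutable block whose content hash is already cached now releases the duplicate physical block back into the hashless allocator and increments the cached block's refcount, and eviction asserts that the evicted block's refcount is zero before dropping its _cached_blocks entry. The following snippet is a minimal sketch of the promotion scenario the new test exercises; it assumes the PrefixCachingBlockAllocator API used in the diff (allocate_immutable, allocate_mutable, append_token_ids) and an import path inferred from vllm/core/block/prefix_caching_block.py, both of which may differ across vLLM versions.

# Minimal sketch (not part of this PR) of the promotion scenario tested below.
# The import path and API are inferred from the diff and may vary by version.
from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator

block_size = 16
allocator = PrefixCachingBlockAllocator(num_blocks=3, block_size=block_size)
token_ids = list(range(block_size))

# The first immutable block owns the content hash for token_ids.
first = allocator.allocate_immutable(prev_block=None, token_ids=token_ids)

# Fill a mutable block with the same tokens; appending the final token
# promotes it to an immutable block with the same content hash.
mutable = allocator.allocate_mutable(prev_block=None)
for tok in token_ids:
    mutable.append_token_ids([tok])

# With the fix, the promoted block is deduplicated onto first's physical
# block and the spare physical block returns to the hashless allocator.
assert mutable.block_id == first.block_id
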
117 changes: 117 additions & 0 deletions tests/core/block/test_prefix_caching_block.py
@@ -410,6 +410,123 @@ def test_get_common_computed_block_ids(num_blocks: int, block_size: int,

assert (len(res) == zero_point_blocks)

# Test case asserting that when a mutable block is promoted to an
# immutable block whose content already exists, the promoted block is
# freed back into the hashless allocator while the first immutable
# block's refcount is increased.
@staticmethod
@pytest.mark.parametrize("num_blocks", [3])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(10)))
def test_alloc_promotion(num_blocks: int, block_size: int, seed: int):
random.seed(seed)

allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
block_size=block_size)
token_ids = list(range(block_size))

block = allocator.allocate_immutable(prev_block=None,
token_ids=token_ids)

assert allocator._refcounter.get(block.block_id) == 1
m = allocator.allocate_mutable(prev_block=None)

block_id = m.block_id
for i in range(block_size):
m.append_token_ids([i])
# After the block is promoted from mutable to immutable, if a block
# with the same content hash already exists, the promoted block shall
# be released into the hashless_allocator and the first immutable
# block's refcount shall be increased by 1.
assert m.block_id == block.block_id
assert block_id in allocator._hashless_allocator._free_block_indices
assert allocator._refcounter.get(block.block_id) == 2

# Test case mixing eviction and allocation to make sure
# they work as expected.
@staticmethod
@pytest.mark.parametrize("num_blocks", [3])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(10)))
def test_eviction_alloc_mixed(num_blocks: int, block_size: int, seed: int):
random.seed(seed)

all_blocks_list = [i for i in range(num_blocks)]
zero_ref = {i: 0 for i in range(num_blocks)}
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
block_size=block_size)
token_ids = list(range(num_blocks * block_size))

# Now we have num_blocks free blocks in the hashless allocator;
# the internal tracking structures _blocks, _cached_blocks and the
# evictor are empty, and every block's refcount shall be 0.
assert list(allocator._hashless_allocator._free_block_indices
) == all_blocks_list
assert len(allocator._blocks.keys()) == 0
assert len(allocator._cached_blocks.values()) == 0
assert len(allocator.evictor.free_table.keys()) == 0
assert allocator._refcounter._refcounts == zero_ref

# Allocate immutable chains, each consisting of a single block.
new_block = []
for i in range(num_blocks):
block = allocator.allocate_immutable(
prev_block=None,
token_ids=token_ids[block_size * i:block_size * (i + 1)])
new_block.append(block)

# Free all blocks; now all blocks shall be in the evictor,
# no tracking data shall be left in _blocks,
# all blocks shall be tracked in _cached_blocks,
# and all blocks' refcounts shall be zero.
for block in new_block:
allocator.free(block)

assert len(allocator._blocks.keys()) == 0
assert len(allocator._hashless_allocator._free_block_indices) == 0
assert list(allocator._cached_blocks.values()) == all_blocks_list
assert list(allocator.evictor.free_table.keys()) == all_blocks_list
assert allocator._refcounter._refcounts == zero_ref

# Allocate a mutable block; the first block shall be evicted,
# its content hash set to None, and its refcount set to 1.
mutable = allocator.allocate_mutable(prev_block=None)

assert mutable.block_id == 0
assert mutable.content_hash is None
assert 0 in allocator._blocks
assert allocator._refcounter.get(0) == 1
assert 0 not in allocator._cached_blocks
assert 0 not in allocator.evictor

# Since this mutable block has no hash yet, it shall be released into
# the hashless allocator.
allocator.free(mutable)

assert len(allocator._blocks.keys()) == 0
assert allocator._refcounter._refcounts == zero_ref
assert 0 not in allocator._cached_blocks
assert 0 not in allocator.evictor
assert 0 in allocator._hashless_allocator._free_block_indices

# When allocating an immutable block with the first block_size tokens,
# we shall get a free block from the hashless allocator, leaving no
# blocks in it.
block = allocator.allocate_immutable(prev_block=None,
token_ids=token_ids[:block_size])

assert block.block_id == 0
assert len(allocator._hashless_allocator._free_block_indices) == 0
assert 0 in allocator._blocks
assert 0 in allocator._cached_blocks.values()
assert allocator._refcounter.get(0) == 1
assert 0 not in allocator.evictor

# Allocate a mutable block again; it shall be popped from the evictor.
mutable = allocator.allocate_mutable(prev_block=None)
assert len(allocator._hashless_allocator._free_block_indices) == 0
assert mutable.block_id not in allocator.evictor.free_table
assert allocator._refcounter.get(mutable.block_id) == 1

# Test case where two last accessed times are equal
@staticmethod
@pytest.mark.parametrize("num_blocks", [1024])
41 changes: 24 additions & 17 deletions vllm/core/block/prefix_caching_block.py
@@ -160,21 +160,17 @@ def allocate_mutable(self,
# If the evictor has blocks available for eviction, evict a block
# and return it.
if self.evictor.num_blocks > 0:
# Here we get an evicted block, which is only added to the
# evictor if its refcount is 0. Since its content is about to
# change, we need to remove it from _cached_blocks' tracking list.
block_id, content_hash_to_evict = self.evictor.evict()

# Here we may have a scenario where several blocks have
# the same content hash, but because the later block came
# through the mutable-to-immutable path, its physical
# block was added into the evictor.
# However, in this case we shall not pop from _cached_blocks,
# as the same content is still used by others, which means
# we need to check the refcount before deciding to pop the entry.

_block_id = self._cached_blocks[content_hash_to_evict]
refcount = self._refcounter.get(_block_id)
if refcount == 1:
self._cached_blocks.pop(content_hash_to_evict)
assert _block_id == block_id
assert self._refcounter.get(_block_id) == 0
assert _block_id == block_id

self._cached_blocks.pop(content_hash_to_evict)

self._refcounter.incr(block_id)

@@ -199,7 +195,11 @@

def _incr_refcount_cached_block(self, block: Block,
block_id: BlockId) -> None:
# since block is already computed, mark it
# _incr_refcount_cached_block is now reached from two places:
# allocate_immutable and promote_to_immutable_block, whenever a
# _cached_blocks hash key is hit.
# In both cases it means an already computed block exists that is
# now shared with this block.
block.computed = True

refcount = self._refcounter.incr(block_id)
@@ -228,13 +228,19 @@ def _free_block_id_for_block(self, block_id: BlockId,
block: Block) -> None:
assert isinstance(block, PrefixCachingBlock)

if block.content_hash is None:
# If we come from promote_to_immutable_block, block.content_hash is
# never None. However, we need to release the block with the same
# content so that the physical block can be reused.
if block.block_id != block_id or block.content_hash is None:
refcount = self._refcounter.get(block_id)
# We have a fork case where the block could have more than one ref,
# so we cannot free it from tracking if the refcount is larger than 1.
if refcount <= 1:
assert block.block_id is not None
assert block.block_id is not None
refcount = self._refcounter.get(block.block_id)
if refcount == 1:
del self._blocks[block.block_id]

return self._hashless_allocator.free(block)

refcount = self._refcounter.decr(block_id)
@@ -317,7 +323,8 @@ def promote_to_immutable_block(self, block: Block) -> BlockId:
if block.content_hash not in self._cached_blocks:
self._cached_blocks[block.content_hash] = block.block_id
else:
self._free_block_id_for_block(block.block_id, block)
self._free_block_id_for_block(
self._cached_blocks[block.content_hash], block)
self._incr_refcount_cached_block(
block, self._cached_blocks[block.content_hash])

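
To make the refcount bookkeeping above easier to follow, below is a self-contained toy model (illustrative only, not vLLM code): a content-hash-to-physical-block map with refcounts, where promoting a block whose content already exists releases the duplicate physical block back to the free pool and increments the existing block's refcount, and the hash mapping is only dropped once the refcount reaches zero. The real allocator additionally parks zero-refcount blocks in an evictor so their contents can still be reused, which the toy model omits.

# Toy model of the bookkeeping the fix is about; not vLLM's implementation.
from typing import Dict, List, Optional


class ToyPrefixCache:
    def __init__(self, num_blocks: int) -> None:
        self.free_blocks: List[int] = list(range(num_blocks))  # "hashless" pool
        self.cached: Dict[int, int] = {}    # content_hash -> block_id
        self.refcount: Dict[int, int] = {}  # block_id -> refcount

    def allocate(self) -> int:
        """Take a physical block from the free pool (a mutable block)."""
        block_id = self.free_blocks.pop(0)
        self.refcount[block_id] = 1
        return block_id

    def promote(self, block_id: int, content_hash: int) -> int:
        """Promote a filled mutable block to immutable, deduplicating by hash."""
        if content_hash not in self.cached:
            self.cached[content_hash] = block_id
            return block_id
        # Duplicate content: return our physical block to the pool and share
        # the already-cached block instead.
        existing = self.cached[content_hash]
        del self.refcount[block_id]
        self.free_blocks.append(block_id)
        self.refcount[existing] += 1
        return existing

    def free(self, block_id: int, content_hash: Optional[int]) -> None:
        """Drop one reference; recycle the block once its refcount hits zero."""
        self.refcount[block_id] -= 1
        if self.refcount[block_id] > 0:
            return
        del self.refcount[block_id]
        if content_hash is not None:
            # vLLM would move the block to an evictor here; the toy model
            # simply forgets the mapping and frees the block.
            del self.cached[content_hash]
        self.free_blocks.append(block_id)


cache = ToyPrefixCache(num_blocks=3)
a = cache.allocate()
shared = cache.promote(a, content_hash=42)       # first block with this content
b = cache.allocate()
also_shared = cache.promote(b, content_hash=42)  # same content: dedup onto a
assert also_shared == shared
assert cache.refcount[shared] == 2
assert b in cache.free_blocks                    # spare block is back in the pool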