Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize the Falcon block for inference #500

Merged
merged 18 commits on Sep 4, 2023
Prev Previous commit
Next Next commit
Fix formatting, reduce diff
  • Loading branch information
mryab committed Sep 4, 2023
commit 67764fea9e595e711d949105bcafade86553f0c4
6 changes: 1 addition & 5 deletions src/petals/models/falcon/block.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
rotate_half,
)


KVCache = Tuple[torch.Tensor, torch.Tensor]
INFERENCE_MAX_LENGTH = 8192

Expand Down Expand Up @@ -225,6 +224,7 @@ def __init__(self, config: FalconConfig):
self.hidden_dropout = config.hidden_dropout
self.config = config

assert not self.config.alibi
assert config.new_decoder_architecture
self.ln_attn = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.ln_mlp = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
Expand Down Expand Up @@ -299,10 +299,6 @@ def forward(


class WrappedFalconBlock(OptimizedFalconDecoderLayer):
def __init__(self, config: FalconConfig):
super().__init__(config)
assert not self.config.alibi

def forward(
self,
hidden_states: torch.Tensor,
Expand Down
5 changes: 3 additions & 2 deletions tests/test_optimized_layers.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import torch

from petals.models.falcon.block import UnoptimizedWrappedFalconBlock
from petals.server.block_utils import resolve_block_dtype
from petals.server.from_pretrained import load_pretrained_block
from petals.utils.auto_config import AutoDistributedConfig
from petals.server.block_utils import resolve_block_dtype
from petals.utils.convert_block import QuantType, convert_block
import torch


def test_falcon():
Expand Down