Test Llama, rebalancing, throughput eval, and all CLI scripts #452

Merged
merged 32 commits into from
Aug 8, 2023
Changes from 1 commit
Commits
32 commits
69abacc
Show argparse defaults, fix docstring
borzunov Aug 8, 2023
ca2850e
Test petals.cli.run_dht
borzunov Aug 8, 2023
816401e
Increase mean_block_selection_delay
borzunov Aug 8, 2023
7330653
Test rebalancing
borzunov Aug 8, 2023
a00e79d
Add help to benchmark argparse
borzunov Aug 8, 2023
5b3d4c4
Use less RAM
borzunov Aug 8, 2023
2b765b9
Don't set default model in benchmarks
borzunov Aug 8, 2023
fae58d9
Fix sleep time
borzunov Aug 8, 2023
856f53f
Test --throughput eval
borzunov Aug 8, 2023
05dc383
Fix flapping test
borzunov Aug 8, 2023
18e5b00
Use AutoDistributed{Config,Model} in tests
borzunov Aug 8, 2023
168e478
Add Maykeye/TinyLLama-v0 to tests
borzunov Aug 8, 2023
5760b15
Test using includes only
borzunov Aug 8, 2023
015238a
Adjust --num_blocks and --block_indices for 8-layer TinyLlama-v0
borzunov Aug 8, 2023
17cae64
Refactor matrix
borzunov Aug 8, 2023
b7b7464
Fix commands
borzunov Aug 8, 2023
c907990
Skip TP tests for llama
borzunov Aug 8, 2023
0040539
Fix test_greedy_generation() for llama
borzunov Aug 8, 2023
a5a95c4
Fix commands
borzunov Aug 8, 2023
c3e7638
Fix test_server_info()
borzunov Aug 8, 2023
b622a14
Fix server layout
borzunov Aug 8, 2023
8a379aa
Try reducing RAM usage
borzunov Aug 8, 2023
ecd7d3f
Check if benchmarks work
borzunov Aug 8, 2023
6ffbc28
Watch free RAM (common issue in CI)
borzunov Aug 8, 2023
033a3ca
Reduce RAM further
borzunov Aug 8, 2023
f06cebd
Tune constants to save RAM
borzunov Aug 8, 2023
47d2d53
Speed benchmark tests
borzunov Aug 8, 2023
d8e08e6
Fix flapping test
borzunov Aug 8, 2023
315c5c6
Try --no_relay
borzunov Aug 8, 2023
5cbb33b
Increase swap space
borzunov Aug 8, 2023
54cd213
Fix flapping test
borzunov Aug 8, 2023
1e34dfd
Fix flapping test
borzunov Aug 8, 2023
Use AutoDistributed{Config,Model} in tests
borzunov committed Aug 8, 2023
commit 18e5b00263161e9b5eec51a849dda7a7603bf85a
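
This commit moves the tests off the BLOOM-specific classes (`DistributedBloomConfig`, `DistributedBloomForCausalLM`) onto the auto-dispatching ones, so the same test code can run against both BLOOM and the Llama checkpoint added in this PR. A minimal sketch of the resulting pattern (the checkpoint name and peer list below are placeholders, not values from this diff):

```python
import torch

from petals import AutoDistributedConfig, AutoDistributedModelForCausalLM

MODEL_NAME = "bigscience/bloom-560m"                       # placeholder checkpoint
INITIAL_PEERS = ["/ip4/127.0.0.1/tcp/31337/p2p/QmPeerID"]  # placeholder swarm multiaddrs

# The Auto* classes pick the matching config/model class from the checkpoint's
# model type (by analogy with transformers' Auto classes), so the tests no longer
# hard-code BLOOM.
config = AutoDistributedConfig.from_pretrained(MODEL_NAME, initial_peers=INITIAL_PEERS)

# The tests keep float32 so remote outputs can be compared against the reference model.
model = AutoDistributedModelForCausalLM.from_pretrained(
    MODEL_NAME, initial_peers=INITIAL_PEERS, torch_dtype=torch.float32
)
```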
4 changes: 2 additions & 2 deletions tests/test_block_exact_match.py
@@ -3,14 +3,14 @@
import pytest
import torch

from petals import DistributedBloomConfig, RemoteSequential
from petals import AutoDistributedConfig, RemoteSequential
from petals.server.from_pretrained import load_pretrained_block
from test_utils import *


@pytest.mark.forked
def test_remote_block_exact_match(atol_forward=1e-4, atol_inference=1e-3):
config = DistributedBloomConfig.from_pretrained(MODEL_NAME, initial_peers=INITIAL_PEERS)
config = AutoDistributedConfig.from_pretrained(MODEL_NAME, initial_peers=INITIAL_PEERS)
remote_sequential = RemoteSequential(config)

for block_index in random.sample(range(config.num_hidden_layers), 3):
6 changes: 3 additions & 3 deletions tests/test_chained_calls.py
@@ -7,15 +7,15 @@
import pytest
import torch

from petals import DistributedBloomConfig
from petals import AutoDistributedConfig
from petals.client.remote_sequential import RemoteSequential
from petals.server.from_pretrained import load_pretrained_block
from test_utils import *


@pytest.mark.forked
def test_forward_backward_exact_match(atol_forward=1e-4, atol_backward=1e-4, seq_length=1):
config = DistributedBloomConfig.from_pretrained(MODEL_NAME, initial_peers=INITIAL_PEERS)
config = AutoDistributedConfig.from_pretrained(MODEL_NAME, initial_peers=INITIAL_PEERS)
remote_blocks = RemoteSequential(config, start_block=3, end_block=6)
assert isinstance(remote_blocks, RemoteSequential)

@@ -43,7 +43,7 @@ def test_forward_backward_exact_match(atol_forward=1e-4, atol_backward=1e-4, seq

@pytest.mark.forked
def test_chained_inference_exact_match(atol_inference=1e-4):
config = DistributedBloomConfig.from_pretrained(MODEL_NAME, initial_peers=INITIAL_PEERS)
config = AutoDistributedConfig.from_pretrained(MODEL_NAME, initial_peers=INITIAL_PEERS)
remote_blocks = RemoteSequential(config, start_block=3, end_block=5)

inputs = torch.randn(1, 8, config.hidden_size)
54 changes: 27 additions & 27 deletions tests/test_full_model.py
@@ -3,29 +3,31 @@
import torch
import transformers
from hivemind import get_logger
from transformers.generation import BeamSearchScorer
from transformers.models.bloom import BloomForCausalLM
from transformers.generation import BeamSearchScorer, GenerationMixin as HfGenerationMixin

from petals import DistributedBloomForCausalLM
from petals import AutoDistributedModelForCausalLM
from test_utils import *

logger = get_logger(__name__)


@pytest.fixture
def tokenizer():
# We set use_fast=False since LlamaTokenizerFast is slow on load
return transformers.AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)


@pytest.mark.forked
@pytest.mark.parametrize("use_peft", (True, False) if ADAPTER_NAME else (False,))
@pytest.mark.parametrize("pass_empty_tensors", (True, False))
def test_full_model_exact_match(use_peft: bool, pass_empty_tensors: bool, atol_forward=1e-3, atol_inference=1e-3):
tokenizer = transformers.BloomTokenizerFast.from_pretrained(MODEL_NAME)
model = DistributedBloomForCausalLM.from_pretrained(
def test_full_model_exact_match(tokenizer, use_peft, pass_empty_tensors, atol_forward=1e-3, atol_inference=1e-3):
model = AutoDistributedModelForCausalLM.from_pretrained(
MODEL_NAME,
initial_peers=INITIAL_PEERS,
low_cpu_mem_usage=True,
torch_dtype=torch.float32,
active_adapter=ADAPTER_NAME if use_peft else None,
)
config = model.config
assert isinstance(model, DistributedBloomForCausalLM)
assert len(model.transformer.h) == model.config.num_hidden_layers

test_inputs = tokenizer("A quick brown fox was minding its own buisness", return_tensors="pt")["input_ids"]
@@ -63,7 +65,7 @@ def test_full_model_exact_match(use_peft: bool, pass_empty_tensors: bool, atol_f
del model, embs, recurrent_outputs

if REF_NAME:
ref_model = transformers.BloomForCausalLM.from_pretrained(
ref_model = transformers.AutoModelForCausalLM.from_pretrained(
REF_NAME, low_cpu_mem_usage=True, torch_dtype=torch.float32
)
if use_peft:
@@ -86,17 +88,16 @@ def test_full_model_exact_match(use_peft: bool, pass_empty_tensors: bool, atol_f


@pytest.mark.forked
def test_greedy_generation(max_new_tokens=4):
tokenizer = transformers.BloomTokenizerFast.from_pretrained(MODEL_NAME)
model = DistributedBloomForCausalLM.from_pretrained(
MODEL_NAME, initial_peers=INITIAL_PEERS, low_cpu_mem_usage=True, torch_dtype=torch.float32
def test_greedy_generation(tokenizer, max_new_tokens=4):
model = AutoDistributedModelForCausalLM.from_pretrained(
MODEL_NAME, initial_peers=INITIAL_PEERS, torch_dtype=torch.float32
)
inputs = tokenizer("A cat sat on a mat", return_tensors="pt")["input_ids"]
remote_outputs = model.generate(
inputs,
max_new_tokens=max_new_tokens,
)
hf_outputs = BloomForCausalLM.greedy_search(model, input_ids=inputs, max_length=inputs.size(1) + max_new_tokens)
hf_outputs = HfGenerationMixin.greedy_search(model, input_ids=inputs, max_length=inputs.size(1) + max_new_tokens)
assert torch.allclose(remote_outputs, hf_outputs), "Greedy search results are not identical to HF"

inputs_batch = tokenizer(["A cat sat on a mat", "A dog sat on a mat"], return_tensors="pt", padding=True)[
@@ -106,7 +107,7 @@ def test_greedy_generation(max_new_tokens=4):
inputs_batch,
max_new_tokens=max_new_tokens,
)
hf_outputs_batch = BloomForCausalLM.greedy_search(
hf_outputs_batch = HfGenerationMixin.greedy_search(
model, input_ids=inputs_batch, max_length=inputs_batch.size(1) + max_new_tokens
)
assert torch.allclose(
@@ -117,13 +118,13 @@
@pytest.mark.forked
@pytest.mark.parametrize("sampling_options", [dict(), dict(temperature=100.0), dict(top_k=5), dict(top_p=0.9)])
@pytest.mark.skip("Sampling is currently not consistent with outputs from Transformers")
def test_sampling(sampling_options, max_new_tokens=4):
def test_sampling(tokenizer, sampling_options, max_new_tokens=4):
torch.manual_seed(0)
tokenizer = transformers.BloomTokenizerFast.from_pretrained(MODEL_NAME)
model = DistributedBloomForCausalLM.from_pretrained(
MODEL_NAME, initial_peers=INITIAL_PEERS, low_cpu_mem_usage=True, torch_dtype=torch.float32

model = AutoDistributedModelForCausalLM.from_pretrained(
MODEL_NAME, initial_peers=INITIAL_PEERS, torch_dtype=torch.float32
)
logits_warper = BloomForCausalLM._get_logits_warper(model, num_beams=1, **sampling_options)
logits_warper = HfGenerationMixin._get_logits_warper(model, num_beams=1, **sampling_options)
inputs = tokenizer("A cat sat on a mat", return_tensors="pt")["input_ids"]
with torch.random.fork_rng():
remote_outputs = model.generate(
@@ -133,7 +134,7 @@ def test_sampling(sampling_options, max_new_tokens=4):
**sampling_options,
)
with torch.random.fork_rng():
hf_outputs = BloomForCausalLM.sample(
hf_outputs = HfGenerationMixin.sample(
model, input_ids=inputs, max_length=inputs.size(1) + max_new_tokens, logits_warper=logits_warper
)
assert torch.allclose(remote_outputs, hf_outputs), "Sampling results are not identical to HF"
@@ -149,7 +150,7 @@ def test_sampling(sampling_options, max_new_tokens=4):
**sampling_options,
)
with torch.random.fork_rng():
hf_outputs_batch = BloomForCausalLM.sample(
hf_outputs_batch = HfGenerationMixin.sample(
model,
input_ids=inputs_batch,
max_length=inputs_batch.size(1) + max_new_tokens,
@@ -161,10 +162,9 @@


@pytest.mark.forked
def test_beam_search_generation(max_new_tokens=4, num_beams=2):
tokenizer = transformers.BloomTokenizerFast.from_pretrained(MODEL_NAME)
model = DistributedBloomForCausalLM.from_pretrained(
MODEL_NAME, initial_peers=INITIAL_PEERS, low_cpu_mem_usage=True, torch_dtype=torch.float32
def test_beam_search_generation(tokenizer, max_new_tokens=4, num_beams=2):
model = AutoDistributedModelForCausalLM.from_pretrained(
MODEL_NAME, initial_peers=INITIAL_PEERS, torch_dtype=torch.float32
)
text = "A cat sat on a mat"
inputs = tokenizer(text, return_tensors="pt")["input_ids"]
@@ -181,7 +181,7 @@ def test_beam_search_generation(max_new_tokens=4, num_beams=2):
do_early_stopping=False,
)
hf_inputs = tokenizer([text] * 2, return_tensors="pt")["input_ids"]
hf_outputs = BloomForCausalLM.beam_search(
hf_outputs = HfGenerationMixin.beam_search(
model, input_ids=hf_inputs, max_length=inputs.size(1) + max_new_tokens, beam_scorer=beam_scorer
)
assert torch.allclose(remote_outputs, hf_outputs), "Beam search results are not identical to HF"
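
The reference generation calls in this file are likewise switched from `BloomForCausalLM` methods to the generic `GenerationMixin` (imported as `HfGenerationMixin`), so the Hugging Face baseline no longer assumes a BLOOM model. The mixin methods are invoked as unbound functions with the Petals model passed explicitly as `self`; a minimal sketch of that calling convention (the helper name is illustrative, not from the diff):

```python
from transformers.generation import GenerationMixin as HfGenerationMixin


def reference_greedy(model, inputs, max_new_tokens):
    # Call greedy_search as an unbound function, passing the Petals model as `self`,
    # mirroring what the updated tests do for greedy search, sampling, and beam search.
    return HfGenerationMixin.greedy_search(
        model, input_ids=inputs, max_length=inputs.size(1) + max_new_tokens
    )
```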
6 changes: 3 additions & 3 deletions tests/test_remote_sequential.py
@@ -4,7 +4,7 @@
from hivemind import DHT, BatchTensorDescriptor, get_logger
from hivemind.proto import runtime_pb2

from petals import DistributedBloomConfig
from petals import AutoDistributedConfig
from petals.client import RemoteSequenceManager, RemoteSequential
from petals.data_structures import UID_DELIMITER
from petals.server.from_pretrained import load_pretrained_block
@@ -15,7 +15,7 @@

@pytest.mark.forked
def test_remote_sequential():
config = DistributedBloomConfig.from_pretrained(MODEL_NAME, initial_peers=INITIAL_PEERS)
config = AutoDistributedConfig.from_pretrained(MODEL_NAME, initial_peers=INITIAL_PEERS)
dht = DHT(initial_peers=config.initial_peers, client_mode=True, start=True)
test_inputs = torch.randn(1, 5, config.hidden_size, requires_grad=True)
grad_proj = torch.randn(1, 5, config.hidden_size)
@@ -87,7 +87,7 @@ def get_request_metadata(self, protocol: str, *args, **kwargs):

@pytest.mark.forked
def test_remote_sequential_prompts(batch_size=2, seq_len=5, pre_seq_len=3):
config = DistributedBloomConfig.from_pretrained(MODEL_NAME, initial_peers=INITIAL_PEERS)
config = AutoDistributedConfig.from_pretrained(MODEL_NAME, initial_peers=INITIAL_PEERS)
remote_sequential = RemoteSequential(config)

inputs = F.normalize(torch.randn(batch_size, seq_len, config.hidden_size), dim=-1)
4 changes: 2 additions & 2 deletions tests/test_sequence_manager.py
@@ -5,7 +5,7 @@
import torch
from hivemind import DHT, get_logger

from petals import DistributedBloomConfig
from petals import AutoDistributedConfig
from petals.client import RemoteSequenceManager, RemoteSequential
from petals.data_structures import UID_DELIMITER
from test_utils import *
@@ -16,7 +16,7 @@
@pytest.mark.forked
@pytest.mark.parametrize("mode", ["max_throughput", "min_latency"])
def test_sequence_manager_basics(mode: str):
config = DistributedBloomConfig.from_pretrained(MODEL_NAME, initial_peers=INITIAL_PEERS)
config = AutoDistributedConfig.from_pretrained(MODEL_NAME, initial_peers=INITIAL_PEERS)
dht = DHT(initial_peers=config.initial_peers, client_mode=True, start=True)
sequential = RemoteSequential(config, dht=dht)
shutdown_evt = threading.Event()
4 changes: 2 additions & 2 deletions tests/test_server_stats.py
@@ -4,14 +4,14 @@
import pytest
import torch

from petals import DistributedBloomConfig, RemoteSequential
from petals import AutoDistributedConfig, RemoteSequential
from petals.server.handler import CACHE_TOKENS_AVAILABLE
from test_utils import *


@pytest.mark.forked
def test_server_info(block_from: int = 22, block_to: int = 24, max_length: int = 100, max_length2: int = 50):
config = DistributedBloomConfig.from_pretrained(MODEL_NAME)
config = AutoDistributedConfig.from_pretrained(MODEL_NAME)
dht = hivemind.DHT(initial_peers=INITIAL_PEERS, client_mode=True, start=True)
blocks1 = RemoteSequential(config, dht=dht, start_block=block_from, end_block=block_to)
blocks2 = RemoteSequential(config, dht=dht, start_block=block_to - 1, end_block=block_to)