Compute the generation time per sample (Lightning-AI#178)
carmocca authored Apr 20, 2023
1 parent 9e4846e commit 9ff49e9
Showing 3 changed files with 13 additions and 21 deletions.

generate.py: 10 changes (4 additions, 6 deletions)
@@ -126,9 +126,8 @@ def main(
     encoded_prompt = tokenizer.encode(prompt, bos=True, eos=False, device=fabric.device)
 
     L.seed_everything(1234)
-    t0 = time.perf_counter()
-
-    for _ in range(num_samples):
+    for i in range(num_samples):
+        t0 = time.perf_counter()
         y = generate(
             model,
             encoded_prompt,
@@ -137,10 +136,9 @@
             temperature=temperature,
             top_k=top_k,
         )
+        t = time.perf_counter() - t0
         print(tokenizer.decode(y))
-
-    t = time.perf_counter() - t0
-    print(f"\n\nTime for inference: {t:.02f} sec total, {num_samples * max_new_tokens / t:.02f} tokens/sec", file=sys.stderr)
+        print(f"Time for inference {i + 1}: {t:.02f} sec total, {max_new_tokens / t:.02f} tokens/sec", file=sys.stderr)
     if fabric.device.type == "cuda":
         print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB", file=sys.stderr)
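For reference, a minimal standalone sketch of the per-sample timing pattern this diff introduces: the perf_counter window now wraps a single sample, so the reported tokens/sec describes that sample rather than the whole run. The generate_sample stub and the constants below are placeholders for illustration, not code from the repository.

import sys
import time


def generate_sample() -> list[int]:
    # Stand-in for the real model call; sleeps briefly and returns fake token ids.
    time.sleep(0.1)
    return list(range(100))


num_samples = 3
max_new_tokens = 100

for i in range(num_samples):
    t0 = time.perf_counter()      # start the clock for this sample only
    tokens = generate_sample()
    t = time.perf_counter() - t0  # elapsed time for this single sample
    print(f"Time for inference {i + 1}: {t:.02f} sec total, "
          f"{max_new_tokens / t:.02f} tokens/sec", file=sys.stderr)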

generate_adapter.py: 13 changes (5 additions, 8 deletions)
@@ -39,7 +39,6 @@ def main(
         quantize: Whether to quantize the model and using which method:
             ``"llm.int8"``: LLM.int8() mode,
             ``"gptq.int4"``: GPTQ 4-bit mode.
-        dtype: The dtype to use during generation.
         max_new_tokens: The number of generation steps to take.
         top_k: The number of top most probable tokens to consider in the sampling process.
         temperature: A value controlling the randomness of the sampling process. Higher values result in more random
@@ -56,9 +55,8 @@ def main(
     assert pretrained_path.is_file()
     assert tokenizer_path.is_file()
 
-    fabric = L.Fabric(accelerator="cuda", devices=1)
-
-    dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32
+    fabric = L.Fabric(devices=1)
+    dtype = torch.bfloat16 if fabric.device.type == "cuda" and torch.cuda.is_bf16_supported() else torch.float32
 
     print("Loading model ...", file=sys.stderr)
     t0 = time.time()
@@ -95,16 +93,15 @@ def main(
         top_k=top_k,
         eos_id=tokenizer.eos_id
     )
+    t = time.perf_counter() - t0
 
     output = tokenizer.decode(output)
     output = output.split("### Response:")[1].strip()
 
     print(output)
-    t = time.perf_counter() - t0
-
     print(f"\n\nTime for inference: {t:.02f} sec total, {max_new_tokens / t:.02f} tokens/sec", file=sys.stderr)
-    print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB", file=sys.stderr)
-
+    if fabric.device.type == "cuda":
+        print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB", file=sys.stderr)
 
 
 if __name__ == "__main__":
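The adapter script now lets Fabric auto-select the accelerator and only opts into bfloat16 when it actually lands on a bf16-capable CUDA device. A small sketch of that selection logic in isolation, assuming lightning and torch are installed; the final print is illustrative scaffolding, not part of the script.

import lightning as L
import torch

# accelerator defaults to "auto": Fabric picks CUDA when available, otherwise CPU/MPS.
fabric = L.Fabric(devices=1)

# Prefer bfloat16 only on a CUDA device that supports it; otherwise fall back to float32.
dtype = (
    torch.bfloat16
    if fabric.device.type == "cuda" and torch.cuda.is_bf16_supported()
    else torch.float32
)
print(f"device={fabric.device}, dtype={dtype}")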

generate_lora.py: 11 changes (4 additions, 7 deletions)
@@ -29,7 +29,6 @@ def main(
     max_new_tokens: int = 100,
     top_k: int = 200,
     temperature: float = 0.8,
-    accelerator: str = "auto",
 ) -> None:
     """Generates a response based on a given instruction and an optional input.
     This script will only work with checkpoints from the instruction-tuned LoRA model.
@@ -50,8 +49,6 @@ def main(
         top_k: The number of top most probable tokens to consider in the sampling process.
         temperature: A value controlling the randomness of the sampling process. Higher values result in more random
             samples.
-        accelerator: The hardware to run on. Possible choices are:
-            ``"cpu"``, ``"cuda"``, ``"mps"``, ``"gpu"``, ``"tpu"``, ``"auto"``.
     """
     if not lora_path:
         lora_path = Path("out/lora/alpaca/lit-llama-lora-finetuned.pth")
@@ -67,7 +64,7 @@ def main(
     if quantize is not None:
         raise NotImplementedError("Quantization in LoRA is not supported yet")
 
-    fabric = L.Fabric(accelerator=accelerator, devices=1)
+    fabric = L.Fabric(devices=1)
 
     dt = getattr(torch, dtype, None)
     if not isinstance(dt, torch.dtype):
@@ -109,15 +106,15 @@ def main(
         top_k=top_k,
         eos_id=tokenizer.eos_id
     )
+    t = time.perf_counter() - t0
 
     output = tokenizer.decode(output)
     output = output.split("### Response:")[1].strip()
 
     print(output)
-    t = time.perf_counter() - t0
-
     print(f"\n\nTime for inference: {t:.02f} sec total, {max_new_tokens / t:.02f} tokens/sec", file=sys.stderr)
-    print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB", file=sys.stderr)
+    if fabric.device.type == "cuda":
+        print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB", file=sys.stderr)
 
 
 if __name__ == "__main__":
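All three scripts now guard the memory readout the same way, since torch.cuda.max_memory_reserved() is only meaningful when generation actually ran on a GPU. A minimal sketch of that guard, with the device probe standing in for whatever device the script ends up on:

import sys
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ... generation would run on `device` here ...

# Report peak reserved GPU memory only when a CUDA device was used.
if device.type == "cuda":
    print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB", file=sys.stderr)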
