Compute the generation time per sample (Lightning-AI#178)
carmocca authored Apr 20, 2023
1 parent 9e4846e commit 9ff49e9
Showing 3 changed files with 13 additions and 21 deletions.

generate.py: 10 changes (4 additions, 6 deletions)
@@ -126,9 +126,8 @@ def main(
     encoded_prompt = tokenizer.encode(prompt, bos=True, eos=False, device=fabric.device)
 
     L.seed_everything(1234)
-    t0 = time.perf_counter()
-
-    for _ in range(num_samples):
+    for i in range(num_samples):
+        t0 = time.perf_counter()
         y = generate(
             model,
             encoded_prompt,
@@ -137,10 +136,9 @@
             temperature=temperature,
             top_k=top_k,
         )
+        t = time.perf_counter() - t0
         print(tokenizer.decode(y))
-
-    t = time.perf_counter() - t0
-    print(f"\n\nTime for inference: {t:.02f} sec total, {num_samples * max_new_tokens / t:.02f} tokens/sec", file=sys.stderr)
+        print(f"Time for inference {i + 1}: {t:.02f} sec total, {max_new_tokens / t:.02f} tokens/sec", file=sys.stderr)
     if fabric.device.type == "cuda":
         print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB", file=sys.stderr)
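For reference, a minimal standalone sketch of the per-sample timing pattern this diff introduces: the perf_counter window now wraps a single sample, so the reported tokens/sec describes that sample rather than the whole run. The generate_sample stub and the constants below are placeholders for illustration, not code from the repository.

import sys
import time


def generate_sample() -> list[int]:
    # Stand-in for the real model call; sleeps briefly and returns fake token ids.
    time.sleep(0.1)
    return list(range(100))


num_samples = 3
max_new_tokens = 100

for i in range(num_samples):
    t0 = time.perf_counter()      # start the clock for this sample only
    tokens = generate_sample()
    t = time.perf_counter() - t0  # elapsed time for this single sample
    print(f"Time for inference {i + 1}: {t:.02f} sec total, "
          f"{max_new_tokens / t:.02f} tokens/sec", file=sys.stderr)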

generate_adapter.py: 13 changes (5 additions, 8 deletions)
@@ -39,7 +39,6 @@ def main(
         quantize: Whether to quantize the model and using which method:
             ``"llm.int8"``: LLM.int8() mode,
             ``"gptq.int4"``: GPTQ 4-bit mode.
-        dtype: The dtype to use during generation.
         max_new_tokens: The number of generation steps to take.
         top_k: The number of top most probable tokens to consider in the sampling process.
         temperature: A value controlling the randomness of the sampling process. Higher values result in more random
@@ -56,9 +55,8 @@ def main(
     assert pretrained_path.is_file()
     assert tokenizer_path.is_file()
 
-    fabric = L.Fabric(accelerator="cuda", devices=1)
-
-    dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32
+    fabric = L.Fabric(devices=1)
+    dtype = torch.bfloat16 if fabric.device.type == "cuda" and torch.cuda.is_bf16_supported() else torch.float32
 
     print("Loading model ...", file=sys.stderr)
     t0 = time.time()
@@ -95,16 +93,15 @@ def main(
         top_k=top_k,
         eos_id=tokenizer.eos_id
     )
+    t = time.perf_counter() - t0
 
     output = tokenizer.decode(output)
     output = output.split("### Response:")[1].strip()
 
     print(output)
-    t = time.perf_counter() - t0
-
     print(f"\n\nTime for inference: {t:.02f} sec total, {max_new_tokens / t:.02f} tokens/sec", file=sys.stderr)
-    print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB", file=sys.stderr)
-
+    if fabric.device.type == "cuda":
+        print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB", file=sys.stderr)
 
 
 if __name__ == "__main__":
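The adapter script now lets Fabric auto-select the accelerator and only opts into bfloat16 when it actually lands on a bf16-capable CUDA device. A small sketch of that selection logic in isolation, assuming lightning and torch are installed; the final print is illustrative scaffolding, not part of the script.

import lightning as L
import torch

# accelerator defaults to "auto": Fabric picks CUDA when available, otherwise CPU/MPS.
fabric = L.Fabric(devices=1)

# Prefer bfloat16 only on a CUDA device that supports it; otherwise fall back to float32.
dtype = (
    torch.bfloat16
    if fabric.device.type == "cuda" and torch.cuda.is_bf16_supported()
    else torch.float32
)
print(f"device={fabric.device}, dtype={dtype}")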

generate_lora.py: 11 changes (4 additions, 7 deletions)
@@ -29,7 +29,6 @@ def main(
     max_new_tokens: int = 100,
     top_k: int = 200,
     temperature: float = 0.8,
-    accelerator: str = "auto",
 ) -> None:
     """Generates a response based on a given instruction and an optional input.
     This script will only work with checkpoints from the instruction-tuned LoRA model.
@@ -50,8 +49,6 @@ def main(
         top_k: The number of top most probable tokens to consider in the sampling process.
         temperature: A value controlling the randomness of the sampling process. Higher values result in more random
             samples.
-        accelerator: The hardware to run on. Possible choices are:
-            ``"cpu"``, ``"cuda"``, ``"mps"``, ``"gpu"``, ``"tpu"``, ``"auto"``.
     """
     if not lora_path:
         lora_path = Path("out/lora/alpaca/lit-llama-lora-finetuned.pth")
@@ -67,7 +64,7 @@ def main(
     if quantize is not None:
         raise NotImplementedError("Quantization in LoRA is not supported yet")
 
-    fabric = L.Fabric(accelerator=accelerator, devices=1)
+    fabric = L.Fabric(devices=1)
 
     dt = getattr(torch, dtype, None)
     if not isinstance(dt, torch.dtype):
@@ -109,15 +106,15 @@ def main(
         top_k=top_k,
         eos_id=tokenizer.eos_id
     )
+    t = time.perf_counter() - t0
 
     output = tokenizer.decode(output)
     output = output.split("### Response:")[1].strip()
 
     print(output)
-    t = time.perf_counter() - t0
-
     print(f"\n\nTime for inference: {t:.02f} sec total, {max_new_tokens / t:.02f} tokens/sec", file=sys.stderr)
-    print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB", file=sys.stderr)
+    if fabric.device.type == "cuda":
+        print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB", file=sys.stderr)
 
 
 if __name__ == "__main__":
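All three scripts now guard the memory readout the same way, since torch.cuda.max_memory_reserved() is only meaningful when generation actually ran on a GPU. A minimal sketch of that guard, with the device probe standing in for whatever device the script ends up on:

import sys
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ... generation would run on `device` here ...

# Report peak reserved GPU memory only when a CUDA device was used.
if device.type == "cuda":
    print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB", file=sys.stderr)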
