InstructBlipVideo: Update docstring (#31886)
* update docs

* one more change
zucchini-nlp authored Jul 11, 2024
1 parent c54af4c commit d625294
Showing 2 changed files with 18 additions and 14 deletions.
@@ -158,7 +158,8 @@ def forward(
>>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
>>> import torch
>>> from huggingface_hub import hf_hub_download
- >>> from av
+ >>> import av
+ >>> import numpy as np
>>> def read_video_pyav(container, indices):
... '''
@@ -180,20 +181,21 @@ def forward(
... frames.append(frame)
... return np.stack([x.to_ndarray(format="rgb24") for x in frames])
- >>> model = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
- >>> processor = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b")
+ >>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
+ >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
>>> file_path = hf_hub_download(
repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
)
>>> container = av.open(video_path)
... repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> container = av.open(file_path)
>>> # sample uniformly 4 frames from the video
>>> total_frames = container.streams.video[0].frames
>>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
>>> clip = read_video_pyav(container, indices)
>>> prompt = "What is happening in the video?"
- >>> inputs = processor(videos=clip, text=prompt, return_tensors="pt").to(device)
+ >>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)
>>> outputs = model.generate(
... **inputs,
@@ -1393,7 +1393,8 @@ def forward(
>>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
>>> import torch
>>> from huggingface_hub import hf_hub_download
- >>> from av
+ >>> import av
+ >>> import numpy as np
>>> def read_video_pyav(container, indices):
... '''
@@ -1415,20 +1416,21 @@ def forward(
... frames.append(frame)
... return np.stack([x.to_ndarray(format="rgb24") for x in frames])
- >>> model = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
- >>> processor = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b")
+ >>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
+ >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
>>> file_path = hf_hub_download(
repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
)
>>> container = av.open(video_path)
... repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> container = av.open(file_path)
>>> # sample uniformly 4 frames from the video
>>> total_frames = container.streams.video[0].frames
>>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
>>> clip = read_video_pyav(container, indices)
>>> prompt = "What is happening in the video?"
- >>> inputs = processor(videos=clip, text=prompt, return_tensors="pt").to(device)
+ >>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)
>>> outputs = model.generate(
... **inputs,
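For reference, the corrected docstring example from the diff above can be assembled into a runnable snippet roughly as follows. The body of read_video_pyav and the tail of the generate call are collapsed in the diff, so those parts are a sketch: the helper follows the PyAV frame-sampling pattern used in other Transformers video examples, and the generation arguments are illustrative assumptions, not the exact docstring values.

import av
import numpy as np
from huggingface_hub import hf_hub_download
from transformers import InstructBlipVideoForConditionalGeneration, InstructBlipVideoProcessor


def read_video_pyav(container, indices):
    # Decode only the frames whose indices are in `indices` and stack them
    # into an array of shape (num_frames, height, width, 3).
    # Reconstructed from the standard helper in the Transformers video docs;
    # the diff above only shows its last two lines.
    frames = []
    container.seek(0)
    start_index, end_index = indices[0], indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])


# Load the model with the generation class and the processor with the processor class,
# as fixed in this commit.
model = InstructBlipVideoForConditionalGeneration.from_pretrained(
    "Salesforce/instructblip-vicuna-7b", device_map="auto"
)
processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

file_path = hf_hub_download(
    repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
)
container = av.open(file_path)

# Sample 4 frames uniformly from the video.
total_frames = container.streams.video[0].frames
indices = np.arange(0, total_frames, total_frames / 4).astype(int)
clip = read_video_pyav(container, indices)

prompt = "What is happening in the video?"
inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)

# Generation settings below are illustrative; the docstring's exact kwargs are truncated in the diff.
outputs = model.generate(**inputs, do_sample=False, max_new_tokens=50)
generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
print(generated_text)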
