Add support to only load a model partially to GPU.
This was surprisingly easy to add.
Noeda committed Apr 5, 2023
1 parent 35a3a5c commit e4af9d9
Showing 3 changed files with 67 additions and 2 deletions.
README.md: 14 additions & 0 deletions
@@ -8,6 +8,7 @@ RLLaMA is a pure Rust implementation of [LLaMA large language model inference.](
 * LLaMA-7B, LLaMA-13B, LLaMA-30B, LLaMA-65B all confirmed working
 * Hand-optimized AVX2 implementation
 * OpenCL support for GPU inference.
+* Load the model only partially to GPU with the `--percentage-to-gpu` command line switch to run hybrid GPU-CPU inference.
 * Simple HTTP API support, with the possibility of doing token sampling on
   client side
 * It can load `Vicuna-13B` instruct-finetuned model (although currently there is no nice UX).
@@ -93,6 +94,19 @@ rllama --tokenizer-path /path/to/tokenizer.model \

 Use `rllama --help` to see all the options.
 
+## Partially load model to GPU
+
+`rllama` can load only some of the transformer blocks to the GPU. There is a
+command line argument:
+
+`--percentage-to-gpu <value between 0 and 1, defaults to 1>`
+
+1 means 100% and 0 means 0%. Values in between load the model partially to the GPU.
+
+You can use this to load LLaMA-13B or Vicuna-13B on a 24-gigabyte consumer
+GPU: around `--percentage-to-gpu 0.9` fits before loading fails with an
+out-of-memory error (if no other programs on the computer are using GPU memory).
+
 ## Interactive mode
 
 There is a simple experimental interactive mode to try to force a type of
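For example, appending `--percentage-to-gpu 0.9` to the `rllama` invocation shown earlier in the README loads roughly 90% of the transformer blocks to the GPU and runs the rest on the CPU.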
src/rllama_main.rs: 8 additions & 1 deletion
@@ -72,6 +72,10 @@ struct Cli {
     #[arg(long)]
     opencl_device: Option<usize>,
 
+    #[cfg(feature = "opencl")]
+    #[arg(long)]
+    percentage_to_gpu: Option<f32>,
+
     #[arg(long, action)]
     inference_server: bool,
 
@@ -122,6 +126,9 @@ pub fn main() -> Result<(), Box<dyn std::error::Error>> {
         }
     };
 
+    #[cfg(feature = "opencl")]
+    let percentage_to_gpu: f32 = cli.percentage_to_gpu.unwrap_or(1.0);
+
     let mut be_quiet: bool = false;
     if !colored::control::SHOULD_COLORIZE.should_colorize() {
         be_quiet = true;
@@ -211,7 +218,7 @@ pub fn main() -> Result<(), Box<dyn std::error::Error>> {
     {
         if let Some(opencl) = opencl {
             let ds = DataSettings::new(Some(opencl));
-            ds.use_opencl()
+            ds.percentage_to_gpu(percentage_to_gpu).use_opencl()
         } else {
             DataSettings::new(None)
         }
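When `--percentage-to-gpu` is omitted, `unwrap_or(1.0)` defaults to loading the whole model to the GPU, so existing invocations behave as before; the switch is only compiled in when the `opencl` feature is enabled.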
src/transformer.rs: 45 additions & 1 deletion
@@ -36,6 +36,8 @@ pub struct Transformer {
 // Clone is cheap
 #[derive(Clone)]
 pub struct DataSettings {
+    #[cfg(feature = "opencl")]
+    percentage_to_gpu: f32,
     #[cfg(feature = "opencl")]
     use_opencl_for_feedforward: bool,
     #[cfg(feature = "opencl")]
@@ -57,6 +59,7 @@ impl DataSettings {
             use_opencl_for_feedforward: false,
             use_opencl_for_attention: false,
             force_f16: false,
+            percentage_to_gpu: 1.0,
             cl: cl.clone(),
         }
     }
@@ -77,6 +80,28 @@ impl DataSettings {
         self
     }
 
+    #[cfg(feature = "opencl")]
+    pub fn dont_use_opencl(mut self) -> DataSettings {
+        self.use_opencl_for_feedforward = false;
+        self.use_opencl_for_attention = false;
+        self
+    }
+
+    #[cfg(feature = "opencl")]
+    pub fn percentage_to_gpu(mut self, percentage: f32) -> DataSettings {
+        self.percentage_to_gpu = percentage;
+        if self.percentage_to_gpu >= 1.0 {
+            self.percentage_to_gpu = 1.0;
+        }
+        if self.percentage_to_gpu < 0.0 {
+            self.percentage_to_gpu = 0.0;
+        }
+        if self.percentage_to_gpu.is_nan() {
+            self.percentage_to_gpu = 0.0;
+        }
+        self
+    }
+
     pub fn force_f16(mut self) -> DataSettings {
         self.force_f16 = true;
         self
@@ -234,13 +259,32 @@ impl Transformer {
         let layers: Vec<TransformerBlock> = (0..n_layers)
             .into_par_iter()
             .map(|layer_id| {
+                let data_settings = {
+                    #[cfg(feature = "opencl")]
+                    {
+                        let max_layers = n_layers;
+                        let last_layer_on_gpu = (data_settings.percentage_to_gpu
+                            * (max_layers - 1) as f32)
+                            .round() as usize;
+                        if layer_id > last_layer_on_gpu {
+                            data_settings.clone().dont_use_opencl()
+                        } else {
+                            data_settings.clone()
+                        }
+                    }
+                    #[cfg(not(feature = "opencl"))]
+                    {
+                        data_settings.clone()
+                    }
+                };
+
                 let result = TransformerBlock::from_unpickled(
                     layer_id,
                     eps,
                     n_local_heads,
                     head_dim,
                     dim,
-                    data_settings.clone(),
+                    data_settings,
                     data_source.clone(),
                 );
                 progress_bar.inc(1);
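To make the split rule concrete, here is a standalone sketch (not part of the commit): blocks `0..=last_layer_on_gpu` keep their OpenCL settings and later blocks fall back to the CPU. The free function below mirrors the clamping and rounding from the diff, and the 40-block count for LLaMA-13B is an assumption used only for illustration.

```rust
// Sketch of the layer-split rule from Transformer::from_unpickled combined
// with the clamping in DataSettings::percentage_to_gpu (names mirrored from
// the diff; this free function itself is not part of the codebase).
fn last_layer_on_gpu(percentage_to_gpu: f32, n_layers: usize) -> usize {
    // Clamp to [0.0, 1.0]; NaN is treated as 0.0, as in the builder method.
    let p = if percentage_to_gpu.is_nan() {
        0.0
    } else {
        percentage_to_gpu.max(0.0).min(1.0)
    };
    (p * (n_layers - 1) as f32).round() as usize
}

fn main() {
    // Assuming LLaMA-13B's 40 transformer blocks: with --percentage-to-gpu 0.9,
    // round(0.9 * 39) = 35, so blocks 0..=35 (36 of 40) load to the GPU and
    // blocks 36..=39 run on the CPU.
    assert_eq!(last_layer_on_gpu(0.9, 40), 35);
    // 1.0 puts every block on the GPU. Note that 0.0 still leaves block 0
    // there, because the check in the diff is `layer_id > last_layer_on_gpu`.
    assert_eq!(last_layer_on_gpu(1.0, 40), 39);
    assert_eq!(last_layer_on_gpu(0.0, 40), 0);
}
```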
