LoRA Adapter #871

Draft

slyalin wants to merge 35 commits into base: master from dynamic_lora

Changes from 1 commit

Commits (35)
0cd4cc2
Switch to LoRA adapter merge in compile_model. Re-implement LoRA tens…
slyalin Aug 4, 2024
1ce8850
Merge remote-tracking branch 'origin/master' into dynamic_lora
slyalin Aug 4, 2024
d916d99
WIP: New LoRA variants ported to both LLMs and SD
slyalin Aug 5, 2024
2642357
Enable LoRA via env var in text gen pipeline. Suppress too bulky debu…
slyalin Aug 7, 2024
0fa8e30
LoRA weights in states
slyalin Aug 8, 2024
aa0568e
Set state for infer_request and don't use Constant in init expression…
slyalin Aug 8, 2024
fffa5ff
Fixed SD with separate A and B
slyalin Aug 13, 2024
afa7a77
[WIP] LoRA prototype refactoring for better LLMPipeline integration a…
slyalin Aug 16, 2024
6127fef
[WIP] LoRA Refactoring
slyalin Aug 17, 2024
863e81b
[WIP] LoRA refactored: restored functionality of both text and SD pip…
slyalin Aug 19, 2024
55dba14
Clean up the greedy sample with LoRA implemented set method for LoRA …
slyalin Aug 20, 2024
bcc7782
[WIP] Added a dedicated LoRA LLMPipeline generation sample, adjust Ad…
slyalin Aug 21, 2024
e80580e
Polish AdaptersConfig methods. Start changing alphas in a more optimal…
slyalin Aug 21, 2024
5642179
[WIP] Completely OV-inference-based LoRA weights preparation.
slyalin Aug 23, 2024
57dd39d
Restored empty adapter functionality via two alternative ways: empty …
slyalin Aug 23, 2024
3ec4907
Added lora_greedy_causal_lm sample in build.
slyalin Aug 23, 2024
c87fdcf
Report compilation/preparation time in ms instead of sec.
slyalin Aug 23, 2024
d02fbee
Adjust sample to use adapters.remove instead of modifying alpha. Run …
slyalin Aug 23, 2024
08e3dcf
[WIP] Code refactoring: common part + SD
slyalin Aug 25, 2024
33321cb
Fixed old code path in SD with new multi-value lora command line para…
slyalin Aug 25, 2024
c4969fb
[WIP] Code clean up. Expose fuse mode in genai API by re-enabling fus…
slyalin Aug 25, 2024
0724a79
Migrated lcm sample to the new api. Fixed issue with old lora when al…
slyalin Aug 26, 2024
d7312d7
Simplifying high-level API: syntactic sugar for better short cuts and…
slyalin Aug 28, 2024
cae1592
Switched lora sample to GPU. Removed most of the debug output. Use MO…
slyalin Aug 29, 2024
863a050
Removed old LoRA implementation from SD/LCM samples, removed Eigen d…
slyalin Sep 12, 2024
1f5c2ac
Removed setting default alpha in Adapter because of possible confused…
slyalin Sep 12, 2024
504735a
Remove obsolete flags from AdapterConfig and use mode directly. Remov…
slyalin Sep 12, 2024
777084c
Switched image generation samples to their default mode -- MODE_FUSE.
slyalin Sep 12, 2024
dc84ba4
Extend image generation READMEs with a way to provide multiple LoRA ad…
slyalin Sep 13, 2024
5a10d6b
Merge remote-tracking branch 'origin/master' into dynamic_lora
slyalin Sep 13, 2024
75c5242
Removed obsolete parts in the code. Added more comments. Implemented …
slyalin Sep 13, 2024
7bfe9bd
Removed base class for AdapterControllerImpl moving all parts to a si…
slyalin Sep 16, 2024
9153494
More clear timings in image generation samples to report model load a…
slyalin Sep 16, 2024
48f90cd
Reverted changes in stable_diffusion Timer.
slyalin Sep 16, 2024
6764c18
Merge remote-tracking branch 'origin/master' into dynamic_lora
slyalin Sep 16, 2024
Commit 0cd4cc24abf4bb2bb765538376072eff5e23bacd (slyalin committed Aug 4, 2024)
Switch to LoRA adapter merge in compile_model. Re-implement LoRA tensors loading.
15 changes: 15 additions & 0 deletions image_generation/common/diffusers/include/lora.hpp
@@ -9,6 +9,9 @@
#include "openvino/op/constant.hpp"
#include "openvino/pass/graph_rewrite.hpp"

#define DEBUG_PRINT(X) do { std::cerr << "[ DEBUG ] " << X << "\n"; } while(false)


class InsertLoRA : public ov::pass::MatcherPass {
public:
OPENVINO_RTTI("InsertLoRA", "0");
@@ -17,9 +20,21 @@ class InsertLoRA : public ov::pass::MatcherPass {

explicit InsertLoRA(LoRAMap& lora_map);

~InsertLoRA () {
DEBUG_PRINT("Applied: " << applied);
}

private:
LoRAMap* m_lora_map;
size_t applied = 0;
};

std::map<std::string, InsertLoRA::LoRAMap>
read_lora_adapters(const std::string& filename, const float alpha = 0.75f);

using Adapter = std::vector<std::shared_ptr<ov::op::v0::Constant>>;
using AdapterMap = std::map<std::string, Adapter>;
using LoRAPrefixes = std::map<std::string, std::string>;

std::map<std::string, AdapterMap> load_lora_adapter(const std::string& adapter_file_path, const float alpha, const LoRAPrefixes& prefixes);
void apply_lora_adapter(std::shared_ptr<ov::Model> model, const AdapterMap& adapter_map);
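The header now declares both the legacy read_lora_adapters entry point and the new Adapter/AdapterMap API. A minimal sketch of how the new functions are meant to be chained before compilation, assuming an illustrative adapter path, alpha, device name, and prefix-to-model mapping (the mapping mirrors the one used in the stable diffusion sample further below):

// Sketch only: lora_path, alpha, the prefix mapping, and the device are placeholders.
#include "lora.hpp"

#include "openvino/openvino.hpp"

ov::CompiledModel compile_unet_with_lora(ov::Core& core,
                                         const std::shared_ptr<ov::Model>& unet_model,
                                         const std::string& lora_path,
                                         float alpha) {
    // Parse the safetensors file once; tensors are grouped by target model via name prefixes.
    std::map<std::string, AdapterMap> adapters =
        load_lora_adapter(lora_path, alpha, {{"lora_te", "text_encoder"}, {"lora_unet", "unet"}});
    // Merge the "unet" adapter into the weights, then compile the patched model.
    apply_lora_adapter(unet_model, adapters["unet"]);
    return core.compile_model(unet_model, "CPU");
}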
230 changes: 224 additions & 6 deletions image_generation/common/diffusers/src/lora.cpp
@@ -13,14 +13,21 @@
#include <Eigen/Dense>

#include "openvino/op/add.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/op/matmul.hpp"
#include "openvino/op/convert.hpp"
#include "openvino/op/convolution.hpp"
#include "openvino/op/matmul.hpp"
#include "openvino/op/reshape.hpp"
#include "openvino/pass/pattern/matcher.hpp"
#include "openvino/pass/pattern/op/wrap_type.hpp"
#include "openvino/pass/manager.hpp"

#define SAFETENSORS_IMPLEMENTATION
#include "safetensors.h"

using NodePtr = std::shared_ptr<ov::Node>;

InsertLoRA::InsertLoRA(LoRAMap& lora_map) :
m_lora_map(&lora_map) {
OPENVINO_ASSERT(!m_lora_map->empty(), "Map with LoRA weights is empty");
@@ -34,6 +41,7 @@ InsertLoRA::InsertLoRA(LoRAMap& lora_map) :
}
std::string root_name = root->get_friendly_name();
std::replace(root_name.begin(), root_name.end(), '.', '_');
DEBUG_PRINT(root->get_type_info().name);

auto it = m_lora_map->begin();
while (it != m_lora_map->end()) {
@@ -52,6 +60,7 @@ InsertLoRA::InsertLoRA(LoRAMap& lora_map) :
it++;
}
}
++applied;
return true;
};

@@ -63,20 +72,69 @@ InsertLoRA::InsertLoRA(LoRAMap& lora_map) :

namespace {

std::vector<std::uint8_t> read_file(const std::string& filename) {
// FIXME: Use ov::AlignedBuffer instead of std::vector. ov::AlignedBuffer is not available in public OV API
using Buffer = std::vector<std::uint8_t>;
using BufferPtr = std::shared_ptr<Buffer>;

BufferPtr read_file_helper(const std::string& filename) {
std::ifstream file(filename, std::ios::binary | std::ios::ate);
OPENVINO_ASSERT(file.is_open(), "Cannot open file ", filename, " with LoRA weights");

size_t filesize = file.tellg();
std::vector<std::uint8_t> buffer;
buffer.reserve(filesize);

auto buffer = std::make_shared<std::vector<std::uint8_t>>();
buffer->reserve(filesize);
file.seekg(0, std::ios::beg);
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), std::back_inserter(buffer));
// FIXME: Use mmapped AlignedBuffer as ov::Core::read_model can do, necessary functionality is not available in public OV API
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), std::back_inserter(*buffer));

return buffer;
}

// FIXME: Remove this legacy trampoline
Buffer read_file(const std::string& filename) {
return std::move(*read_file_helper(filename));
}

ov::element::Type safetensors_to_ov_element_type (int dtype) {
switch(dtype) {
case SAFETENSORS_F32:
return ov::element::f32;
case SAFETENSORS_F16:
return ov::element::f16;
case SAFETENSORS_BF16:
return ov::element::bf16;
default:
OPENVINO_THROW("Not supported safetensors dtype: ", dtype);
}
}

using ConstantMap = std::map<std::string, std::shared_ptr<ov::op::v0::Constant>>;
ConstantMap read_safetensors(const std::string& filename) {
ConstantMap tensors;
auto buffer = read_file_helper(filename);
safetensors_File safe_tensors_file = {0};
OPENVINO_ASSERT(safetensors_file_init(&(*buffer)[0], buffer->size(), &safe_tensors_file) == nullptr, "Cannot parse ", filename, " using safetensors");
DEBUG_PRINT("Opened " << filename << " as safetensors file format, it contains " << safe_tensors_file.num_tensors << " tensors");
for (int i = 0; i < safe_tensors_file.num_tensors; i++) {
safetensors_TensorDescriptor tensor = safe_tensors_file.tensors[i];
std::string name(tensor.name.ptr, tensor.name.ptr + tensor.name.len);
ov::Shape shape(tensor.shape, tensor.shape + tensor.n_dimensions);
void* ptr = tensor.ptr; // FIXME: needs a non-constant pointer because Tensor doesn't accept a constant pointer
OPENVINO_ASSERT(ov::shape_size(shape) <= tensor.end_offset_bytes - tensor.begin_offset_bytes, " ", ov::shape_size(shape), " ", tensor.end_offset_bytes - tensor.begin_offset_bytes);
auto type = safetensors_to_ov_element_type(tensor.dtype);
// FIXME: Extend OV with a new Constant ctor that shares memory to avoid two stage Tensor->Constant initialization
ov::Tensor wrapper(type, shape, ptr); // wraps existing memory, no ownership
auto constant = std::make_shared<ov::op::v0::Constant>(wrapper); // wraps existing memory, no ownership
constant->get_rt_info()["__safetensors_buffer_holder"] = buffer; // to automatically deallocate underlying memory buffer when the last constant holding it is destroyed
DEBUG_PRINT("Tensor with name " << name << ", shape " << shape << " and type " << type << " was allocated.");
tensors[name] = constant;
}
free(safe_tensors_file.tensors);
free(safe_tensors_file.metadata);
return std::move(tensors);
}


std::vector<float> convert_to_float(const safetensors_TensorDescriptor& tensor) {
std::vector<float> data;
size_t tensor_size = (tensor.end_offset_bytes - tensor.begin_offset_bytes) / sizeof(ov::float16);
@@ -89,10 +147,168 @@ std::vector<float> convert_to_float(const safetensors_TensorDescriptor& tensor)
return data;
}



#define OPENVINO_REGISTER_MATCHER(PATTERN, CALLBACK) do register_matcher(std::make_shared<ov::pass::pattern::Matcher>(PATTERN, this->get_type_info().name), CALLBACK); while(false)

// Squeeze all dimensions from right to 2D shape
NodePtr squeeze_2d (NodePtr node) {
// auto rank = node->get_output_partial_shape(0).rank().get_length();
// std::vector<unsigned int> dims(2);
//auto squeeze_num = rank - 2;
// std::fill_n(dims.begin() + 2, dims.end(), 1);
auto shape = ov::op::v0::Constant::create(ov::element::i32, {2}, std::vector<unsigned int>{0, 0});
auto reshape = std::make_shared<ov::op::v1::Reshape>(node->output(0), shape->output(0), true);
return reshape;
}

// Unsqueeze shape to add dimensions to the right to have `rank`-D tensor
NodePtr unsqueeze (NodePtr node, unsigned int rank) {
auto src_rank = node->get_output_partial_shape(0).rank().get_length();
std::vector<unsigned int> dims(rank);
std::fill(dims.begin() + src_rank, dims.end(), 1);
auto shape = ov::op::v0::Constant::create(ov::element::i32, {rank}, dims);
auto reshape = std::make_shared<ov::op::v1::Reshape>(node->output(0), shape->output(0), true);
return reshape;
}

class ApplyLoRA : public ov::pass::MatcherPass {
public:
OPENVINO_RTTI("ApplyLoRA");
ApplyLoRA(const AdapterMap& adapter_map) {
OPENVINO_REGISTER_MATCHER(
(ov::pass::pattern::wrap_type<ov::op::v0::MatMul, ov::op::v1::Convolution>()),
([=, this](ov::pass::pattern::Matcher& m) {
auto node = m.get_match_root();
auto name = node->get_friendly_name();
try{
std::replace(name.begin(), name.end(), '.', '_'); // FIXME: Customize mapping or change PT FE to produce correct weight names
auto adapter_iter = std::find_if(adapter_map.begin(), adapter_map.end(), [name](const AdapterMap::value_type& pair){
return name.find(pair.first) != std::string::npos; // FIXME: Should it be an exact match instead of substring taking into account that we should provide custom mapper for names?
});

if(adapter_iter == adapter_map.end()) {
return false;
}

ov::Output<ov::Node> weights = node->input_value(1);
auto weights_type = weights.get_element_type();
auto adapter = adapter_iter->second;
NodePtr add_term = nullptr;
bool normalize_shape = false;
for(auto multiplier : adapter) {
NodePtr normalized = multiplier;
if(normalized->get_element_type() != weights_type) {
normalized = std::make_shared<ov::op::v0::Convert>(normalized, weights_type);
}
if(normalized->get_output_partial_shape(0).rank().get_length() > 2) {
// FIXME: Any other shape patterns possible?
normalized = squeeze_2d(normalized);
}
if(add_term) {
// FIXME: Apply alpha multiplication separately
if(add_term->get_output_partial_shape(0).rank().get_length() == 0) {
add_term = std::make_shared<ov::op::v1::Multiply>(add_term, normalized);
} else {
add_term = std::make_shared<ov::op::v0::MatMul>(add_term, normalized);
}
} else {
add_term = multiplier;
}
}

auto weights_rank = weights.get_partial_shape().rank();
if(add_term->get_output_partial_shape(0).rank() != weights_rank) {
// FIXME: Make sure that this is always unsqueeze of the same kind
add_term = unsqueeze(add_term, weights_rank.get_length());
}

auto consumers = weights.get_target_inputs();
auto add = std::make_shared<ov::op::v1::Add>(weights, add_term);
for (auto consumer : consumers) {
consumer.replace_source_output(add->output(0));
}
++applied;
return true;
} catch(...) {
DEBUG_PRINT("Exception happens on layer: " << name);
throw;
}
})
);
}

~ApplyLoRA () {
DEBUG_PRINT("LoRA Applied: " << applied);
}

private:
size_t applied = 0;
};



} // namespace


std::map<std::string, AdapterMap> load_lora_adapter(const std::string& adapter_file_path, const float alpha, const LoRAPrefixes& prefixes) {
auto adapter_tensors = read_safetensors(adapter_file_path);
std::map<std::string, AdapterMap> result;
auto alpha_const = ov::op::v0::Constant::create(ov::element::f32, ov::Shape(), {alpha});
for(const auto& named_tensor: adapter_tensors) {
if(named_tensor.first.find(".alpha") != std::string::npos) {
DEBUG_PRINT("Alpha tensor was ignored: " << named_tensor.first);
continue;
}

auto prefix_it = std::find_if(prefixes.begin(), prefixes.end(), [&named_tensor](const LoRAPrefixes::value_type& pair) {
// FIXME: Make sure there is no other matches
return named_tensor.first.find(pair.first) != std::string::npos;
});

if(prefix_it == prefixes.end()) {
DEBUG_PRINT("Ignored LoRA tensor " << named_tensor.first << " as there is are no matches with any of given prefixes." );
continue;
}

auto name = named_tensor.first.substr(named_tensor.first.find(prefix_it->first) + prefix_it->first.length() + 1);
auto delimiter = name.find('.');
auto layer_name = name.substr(0, delimiter);
auto suffix = name.substr(delimiter);

auto& adapter = result[prefix_it->second][layer_name];
if(adapter.empty()) {
adapter.push_back(alpha_const);
}
switch(adapter.size()) {
case 1:
adapter.push_back(named_tensor.second);
break;
case 2:
if(suffix.find("lora_down") != std::string::npos) {
adapter.push_back(named_tensor.second);
} else {
adapter.insert(adapter.begin() + 1, named_tensor.second);
}
break;
default:
OPENVINO_THROW("More than two adapter tensors appers for the same layer: ", layer_name, ", started with tensor: ", named_tensor.first);
}
DEBUG_PRINT("Size for tensor layer " << layer_name << ": " << adapter.size());
}
return result;
}
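For orientation, a tiny standalone sketch of the key-splitting logic above, applied to a hypothetical safetensors key (real adapter files may name layers differently depending on the exporter that produced them):

// Illustrative only: the key and prefix below are made up, not taken from a real adapter.
#include <iostream>
#include <string>

int main() {
    const std::string prefix = "lora_unet";  // would be mapped to "unet" by the prefixes argument
    const std::string key = "lora_unet_down_blocks_0_proj.lora_down.weight";
    std::string name = key.substr(key.find(prefix) + prefix.length() + 1);  // "down_blocks_0_proj.lora_down.weight"
    auto delimiter = name.find('.');
    std::string layer_name = name.substr(0, delimiter);  // "down_blocks_0_proj", the AdapterMap key
    std::string suffix = name.substr(delimiter);          // ".lora_down.weight", decides the slot in the Adapter vector
    std::cout << layer_name << " | " << suffix << "\n";
    return 0;
}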


void apply_lora_adapter(std::shared_ptr<ov::Model> model, const AdapterMap& adapter_map) {
ov::pass::Manager pm;
pm.register_pass<ApplyLoRA>(adapter_map);
pm.run_passes(model);
}

std::map<std::string, InsertLoRA::LoRAMap>
read_lora_adapters(const std::string& filename, const float alpha) {
read_safetensors(filename);
std::vector<std::uint8_t> file_buffer = read_file(filename);
void* buffer_ptr = file_buffer.data();

@@ -119,8 +335,10 @@ read_lora_adapters(const std::string& filename, const float alpha) {
const bool tensor_visited = std::find(visited.begin(), visited.end(), tensor_name) != visited.end();
// alpha tensors are overriden by users' alpha
bool alpha_tensor = tensor_name.find(".alpha") != std::string::npos;
if (alpha_tensor || tensor_visited)
if (alpha_tensor || tensor_visited) {
DEBUG_PRINT((alpha_tensor ? "Alpha tensor was ignored: " : "Tensor was visited: ") << tensor_name);
continue;
}

const bool is_text_lora = tensor_name.find("text") != std::string::npos;
const std::string lora_prefix = is_text_lora ? LORA_PREFIX_TEXT_ENCODER : LORA_PREFIX_UNET;
18 changes: 18 additions & 0 deletions image_generation/stable_diffusion_1_5/cpp/src/main.cpp
@@ -147,11 +147,21 @@ StableDiffusionModels compile_models(const std::string& model_path,

core.add_extension(TOKENIZERS_LIBRARY_PATH);

#define NEW_LORA_ADAPTERS 1

// read LoRA weights
#if NEW_LORA_ADAPTERS
std::map<std::string, AdapterMap> lora_adapter;
#else
std::map<std::string, InsertLoRA::LoRAMap> lora_weights;
#endif
if (!lora_path.empty()) {
Timer t("Loading and multiplying LoRA weights");
#if NEW_LORA_ADAPTERS
lora_adapter = load_lora_adapter(lora_path, alpha, {{"lora_te", "text_encoder"}, {"lora_unet", "unet"}});
#else
lora_weights = read_lora_adapters(lora_path, alpha);
#endif
}

// Text encoder
@@ -161,7 +171,11 @@
if (!use_dynamic_shapes) {
reshape_text_encoder(text_encoder_model, batch_size, TOKENIZER_MODEL_MAX_LENGTH);
}
#if NEW_LORA_ADAPTERS
apply_lora_adapter(text_encoder_model, lora_adapter["text_encoder"]);
#else
apply_lora(text_encoder_model, lora_weights["text_encoder"]);
#endif
models.text_encoder = core.compile_model(text_encoder_model, device);
}

@@ -172,7 +186,11 @@
if (!use_dynamic_shapes) {
reshape_unet(unet_model, batch_size, height, width, TOKENIZER_MODEL_MAX_LENGTH);
}
#if NEW_LORA_ADAPTERS
apply_lora_adapter(unet_model, lora_adapter["unet"]);
#else
apply_lora(unet_model, lora_weights["unet"]);
#endif
models.unet = core.compile_model(unet_model, device);
}
