diff --git a/.github/workflows/create-release.yml b/.github/workflows/create-release.yml
index b758aec..6473b08 100644
--- a/.github/workflows/create-release.yml
+++ b/.github/workflows/create-release.yml
@@ -54,7 +54,26 @@ jobs:
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
-
+
+      - name: Verify version match
+        if: startsWith(github.event.ref, 'refs/tags/v')
+        run: |
+          tag_version=$(echo ${{ github.ref }} | sed 's/refs\/tags\/v//')
+          component_version_manifest=$(jq -r '.version' custom_components/llama_conversation/manifest.json)
+          component_version_const=$(cat custom_components/llama_conversation/const.py | grep "INTEGRATION_VERSION" | tr -d ' ' | tr -d '"' | tr -d 'INTEGRATION_VERSION=')
+
+          if [ "$tag_version" != "$component_version_manifest" ]; then
+            echo "The version in the GitHub tag ($tag_version) does not match the version in the Home Assistant custom component manifest ($component_version_manifest)!"
+            exit 1
+          fi
+
+          if [ "$tag_version" != "$component_version_const" ]; then
+            echo "The version in the GitHub tag ($tag_version) does not match the version in const.py ($component_version_const)!"
+            exit 1
+          fi
+
+          echo "All required versions match."
+
       - name: Read llama-cpp-python version
         run: cat custom_components/llama_conversation/const.py | grep "EMBEDDED_LLAMA_CPP_PYTHON_VERSION" | tr -d ' ' | tr -d '"' >> $GITHUB_ENV
 
@@ -109,7 +128,7 @@ jobs:
     name: Create Release
     needs: [ build_wheels ]
    runs-on: ubuntu-latest
-    if: "startsWith(github.event.ref, 'refs/tags/v')" # only create a release if this was run on a tag
+    if: startsWith(github.event.ref, 'refs/tags/v')
 
     steps:
       - name: Download artifacts
diff --git a/README.md b/README.md
index 2875a2d..855739a 100644
--- a/README.md
+++ b/README.md
@@ -150,6 +150,7 @@ In order to facilitate running the project entirely on the system where Home Ass
 ## Version History
 | Version | Description |
 |---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| v0.3.6 | Small llama.cpp backend fixes |
 | v0.3.5 | Fix for llama.cpp backend installation, Fix for Home LLM v1-3 API parameters, add Polish ICL examples |
 | v0.3.4 | Significantly improved language support including full Polish translation, Update bundled llama-cpp-python to support new models, various bug fixes |
 | v0.3.3 | Improvements to the Generic OpenAI Backend, improved area handling, fix issue using RGB colors, remove EOS token from responses, replace requests dependency with aiohttp included with Home Assistant |
diff --git a/custom_components/llama_conversation/const.py b/custom_components/llama_conversation/const.py
index e147990..c2f88f2 100644
--- a/custom_components/llama_conversation/const.py
+++ b/custom_components/llama_conversation/const.py
@@ -383,5 +383,5 @@
     },
 }
 
-INTEGRATION_VERSION = "0.3.4"
+INTEGRATION_VERSION = "0.3.6"
 EMBEDDED_LLAMA_CPP_PYTHON_VERSION = "0.2.88"
\ No newline at end of file
diff --git a/custom_components/llama_conversation/conversation.py b/custom_components/llama_conversation/conversation.py
index 76d19a3..6458287 100644
--- a/custom_components/llama_conversation/conversation.py
+++ b/custom_components/llama_conversation/conversation.py
@@ -732,7 +732,7 @@ def _generate_icl_examples(self, num_examples, entity_names):
 
         return examples
 
-    def _generate_system_prompt(self, prompt_template: str, llm_api: llm.APIInstance) -> str:
+    def _generate_system_prompt(self, prompt_template: str, llm_api: llm.APIInstance | None) -> str:
         """Generate the system prompt with current entity states"""
         entities_to_expose, domains = self._async_get_exposed_entities()
 
@@ -1076,7 +1076,7 @@ async def _async_cache_prompt(self, entity, old_state, new_state):
         refresh_end = time.time()
         _LOGGER.debug(f"cache refresh took {(refresh_end - refresh_start):.2f} sec")
 
-    def _cache_prompt(self, llm_api: llm.API) -> None:
+    def _cache_prompt(self, llm_api: llm.APIInstance | None) -> None:
         # if a refresh is already scheduled then exit
         if self.cache_refresh_after_cooldown:
             return
@@ -1165,6 +1165,11 @@ def _generate(self, conversation: dict) -> str:
         )
 
         context_len = self.entry.options.get(CONF_CONTEXT_LENGTH, DEFAULT_CONTEXT_LENGTH)
+        if len(input_tokens) >= context_len:
+            num_entities = len(self._async_get_exposed_entities()[0])
+            context_size = self.entry.options.get(CONF_CONTEXT_LENGTH, DEFAULT_CONTEXT_LENGTH)
+            self._warn_context_size()
+            raise Exception(f"The model failed to produce a result because too many devices are exposed ({num_entities} devices) for the context size ({context_size} tokens)!")
 
         if len(input_tokens) + max_tokens >= context_len:
             self._warn_context_size()
diff --git a/custom_components/llama_conversation/manifest.json b/custom_components/llama_conversation/manifest.json
index 3b8f0a2..a36be09 100644
--- a/custom_components/llama_conversation/manifest.json
+++ b/custom_components/llama_conversation/manifest.json
@@ -1,7 +1,7 @@
 {
   "domain": "llama_conversation",
   "name": "Local LLM Conversation",
-  "version": "0.3.5",
+  "version": "0.3.6",
   "codeowners": ["@acon96"],
   "config_flow": true,
   "dependencies": ["conversation"],
diff --git a/data/generate_home_assistant_data.py b/data/generate_home_assistant_data.py
index daeec4b..4d3dcc5 100644
--- a/data/generate_home_assistant_data.py
+++ b/data/generate_home_assistant_data.py
@@ -371,6 +371,14 @@ def get_random_state(self, extra_exposed_attributes=[]):
     "spanish": "H:m EEEE, d 'de' MMMM 'de' yyyy"
 }
 
+USER_INSTRUCTION_PROMPT = {
+    "english": "User instruction",
+    "german": "Benutzeranweisung",
+    "french": "Instruction de l'utilisateur ",
+    "spanish": "Instrucción del usuario",
+    "polish": "Instrukcja użytkownika"
+}
+
 class NoResponseAvailableException(Exception):
     pass
 
@@ -827,7 +835,7 @@ def generate_dpo_extra_service_call(template: dict, persona: str, max_devices: i
 def generate_dpo_incorrect_persona(template: dict, persona: str, max_devices: int = 32):
     pass
 
-def format_example_raw_chatml(example, persona, language):
+def format_example_raw_chatml(example, persona, language, use_system_role):
     """Don't use this one anymore"""
     sys_prompt = pile_of_system_prompts[persona]
     services_block = f"{SERVICES_PROMPT[language]}: " + ", ".join(sorted(example["available_services"]))
@@ -835,8 +843,13 @@
     question = example["question"]
     answers = " ".join(example["answers"])
 
-    system_block = "\n".join([ "<|im_start|>system", sys_prompt, services_block, states_block ]) + "<|im_end|>"
-    user_block = "\n".join([ "<|im_start|>user", question]) + "<|im_end|>"
+    if use_system_role:
+        system_block = "\n".join([ "<|im_start|>system", sys_prompt, services_block, states_block ]) + "<|im_end|>"
+        user_block = "\n".join([ "<|im_start|>user", question]) + "<|im_end|>"
+    else:
+        user_instruction_words = USER_INSTRUCTION_PROMPT[language] + ":"
+        system_block = ""
+        user_block = "\n".join([ "<|im_start|>user", sys_prompt, services_block, states_block, user_instruction_words, question]) + "<|im_end|>"
     assistant_block = "<|im_start|>assistant\n" + answers
 
     if len(example["service_calls"]) > 0:
@@ -855,7 +868,7 @@
         result = result.replace("garage_door.", "cover.")
     return { "text": result }
 
-def format_example_sharegpt(example, persona, language):
+def format_example_sharegpt(example, persona, language, use_system_role):
     sys_prompt = pile_of_system_prompts[persona]
     random_datetime = generate_random_datetime()
     translate_datetime = babel.dates.format_datetime(random_datetime, BABEL_FORMAT[language], locale=BABEL_LOCALE[language])
@@ -876,11 +889,18 @@
     states_block = states_block.replace("blinds.", "cover.").replace("garage_door.", "cover.")
     services_block = services_block.replace("blinds.", "cover.").replace("garage_door.", "cover.")
 
-    conversation = [
-        { "from": "system", "value": "\n".join([ sys_prompt, time_block, services_block, states_block ])},
-        { "from": "user", "value": question },
-        { "from": "assistant", "value": assistant_block },
-    ]
+    if use_system_role:
+        conversation = [
+            { "from": "system", "value": "\n".join([ sys_prompt, time_block, services_block, states_block ])},
+            { "from": "user", "value": question },
+            { "from": "assistant", "value": assistant_block },
+        ]
+    else:
+        user_instruction_words = USER_INSTRUCTION_PROMPT[language] + ":"
+        conversation = [
+            { "from": "user", "value": "\n".join([ sys_prompt, time_block, services_block, states_block, user_instruction_words, question ]) },
+            { "from": "assistant", "value": assistant_block },
+        ]
 
     return { "conversations": conversation }
 
@@ -918,7 +938,7 @@
         "rejected": rejected_assistant_block,
     }
 
-def generate_sft_file(filename: str, seed: int, format_func: Callable, personas: list[str], language: str, *, static_factor: int, template_factor: int, status_request_factor: int):
+def generate_sft_file(filename: str, seed: int, format_func: Callable, use_system_role: bool, personas: list[str], language: str, *, static_factor: int, template_factor: int, status_request_factor: int):
     random.seed(seed)
     np.random.seed(seed)
 
@@ -927,10 +947,10 @@ def generate_sft_file(filename: str, seed: int, format_func: Callable, personas:
     def run_factor_times(func, examples, data, persona, factor, language):
         if factor >= 1:
             for i in range(factor):
-                examples.append(format_func(func(data, persona), persona, language))
+                examples.append(format_func(func(data, persona), persona, language, use_system_role))
         else:
             if random.random() < factor:
-                examples.append(format_func(func(data, persona), persona, language))
+                examples.append(format_func(func(data, persona), persona, language, use_system_role))
 
     generated_examples = []
 
@@ -1139,6 +1159,7 @@ def main():
     parser.add_argument("--dpo", action="store_true", help="Set this flag to enable generation of the DPO dataset.")
     parser.add_argument("--merge", help="Set this flag to merge the generated datasets with the specified dataset.")
     parser.add_argument("--language", nargs="+", default=["english"], help="List of languages to generate: english, german, french, spanish, polish")
+    parser.add_argument("--no-system-role", action="store_true", help="Set this flag to disable the system role. It will be combined with the user role")
 
     train_size_group = parser.add_mutually_exclusive_group()
     train_size_group.add_argument('--small', action='store_const', const='small', dest='size')
@@ -1165,26 +1186,28 @@
     elif args.format == "sharegpt":
         format_func = format_example_sharegpt
 
+    use_system_role = not args.no_system_role
+
     for language in args.language:
         load_dataset_piles(language)
         personas = list(pile_of_system_prompts.keys())
 
         suffix = f"_{language}" if len(args.language) > 1 else ""
         if args.sample:
-            generate_sft_file(f"sample{suffix}", 42, format_func, personas, language, static_factor=1, template_factor=1, status_request_factor=1)
+            generate_sft_file(f"sample{suffix}", 42, format_func, use_system_role, personas, language, static_factor=1, template_factor=1, status_request_factor=1)
        if args.train:
             if args.size == "small":
-                generate_sft_file(f"home_assistant_train{suffix}", 42, format_func, personas, language, static_factor=1, template_factor=10, status_request_factor=8)
+                generate_sft_file(f"home_assistant_train{suffix}", 42, format_func, use_system_role, personas, language, static_factor=1, template_factor=10, status_request_factor=8)
             elif args.size == "medium":
-                generate_sft_file(f"home_assistant_train{suffix}", 42, format_func, personas, language, static_factor=5, template_factor=15, status_request_factor=12)
+                generate_sft_file(f"home_assistant_train{suffix}", 42, format_func, use_system_role, personas, language, static_factor=5, template_factor=15, status_request_factor=12)
             elif args.size == "large":
-                generate_sft_file(f"home_assistant_train{suffix}", 42, format_func, personas, language, static_factor=5, template_factor=20, status_request_factor=15)
+                generate_sft_file(f"home_assistant_train{suffix}", 42, format_func, use_system_role, personas, language, static_factor=5, template_factor=20, status_request_factor=15)
             elif args.size == "xl":
-                generate_sft_file(f"home_assistant_train{suffix}", 42, format_func, personas, language, static_factor=7, template_factor=25, status_request_factor=18)
+                generate_sft_file(f"home_assistant_train{suffix}", 42, format_func, use_system_role, personas, language, static_factor=7, template_factor=25, status_request_factor=18)
             else:
                 raise Exception(f"Unrecognized dataset size: {args.size}")
         if args.test:
-            generate_sft_file(f"home_assistant_test{suffix}", 12345, format_func, personas, language, static_factor=0.25, template_factor=1, status_request_factor=2)
+            generate_sft_file(f"home_assistant_test{suffix}", 12345, format_func, use_system_role, personas, language, static_factor=0.25, template_factor=1, status_request_factor=2)
 
     if len(args.language) > 1:
         if args.sample:
diff --git a/find_split.py b/find_split.py
index 419d056..609be6d 100644
--- a/find_split.py
+++ b/find_split.py
@@ -1,29 +1,79 @@
 # this script attempts to figure out the correct prefix_ids and suffix_ids for the given model
 # usage: python3 find_split.py
 from transformers import AutoTokenizer
+from jinja2.exceptions import TemplateError
 import sys
 if len(sys.argv) > 1:
     model = sys.argv[1]
 else:
-    model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+    print(f"Usage: {sys.argv[0]} <model>")
+    exit(-1)
 
 prefix_ids = None
 suffix_ids = None
 
 tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
 
-assistant_prompt = tokenizer.apply_chat_template(
-    conversation=[{"role": "assistant", "content": r"%%%%%%%%%%%%%%%%"}],
+test_prompt = tokenizer.apply_chat_template(
+    conversation=[
+        {"role": "user", "content": r"HA_REQUEST"},
+        {"role": "assistant", "content": r"HA_RESPONSE"}
+    ],
     tokenize=False,
     add_generation_prompt=False,
-).split( r"%%%%%%%%%%%%%%%%")
+)
+
+print("Chat template:")
+print("-" * 100)
+print(test_prompt)
+print("-" * 100)
+
+# Added real example to test the tokenizer
+test_prompt_tokens = tokenizer.apply_chat_template(
+    conversation=[
+        {"role": "system", "content": "this is a system prompt"},
+        {"role": "user", "content": "a user request goes here"},
+        {"role": "assistant", "content": "the response is in here"}
+    ],
+    tokenize=True,
+    add_generation_prompt=False
+)
+
+print("Chat template tokens:")
+print("-" * 100)
+print(test_prompt_tokens)
+print("-" * 100)
+
+try:
+    assistant_prompt = tokenizer.apply_chat_template(
+        conversation=[{"role": "assistant", "content": r"%%%%%%%%%%%%%%%%"}],
+        tokenize=False,
+        add_generation_prompt=False,
+    ).split( r"%%%%%%%%%%%%%%%%")
+except TemplateError:
+    user_prompt = tokenizer.apply_chat_template(
+        conversation=[
+            {"role": "user", "content": r"$$$$$$$$$$$$$$$$"}
+        ],
+        tokenize=False,
+        add_generation_prompt=True,
+    )
+    # some prompt templates require user/assistant alternating
+    assistant_prompt = tokenizer.apply_chat_template(
+        conversation=[
+            {"role": "user", "content": r"$$$$$$$$$$$$$$$$"},
+            {"role": "assistant", "content": r"%%%%%%%%%%%%%%%%"},
+        ],
+        tokenize=False,
+        add_generation_prompt=True,
+    ).split(r"$$$$$$$$$$$$$$$$")[-1].strip().split(r"%%%%%%%%%%%%%%%%")
 
 response_prefix = assistant_prompt[0]
 response_suffix = assistant_prompt[1]
 
 # check for inserted system prompt and remove it
 if tokenizer.eos_token in response_prefix:
-    response_prefix = response_prefix.split(tokenizer.eos_token)[1].lstrip()
+    response_prefix = response_prefix.split(tokenizer.eos_token)[-1].lstrip()
 
 # some chat templates ALWAYS add the bos token
 if tokenizer.bos_token in response_prefix:
@@ -38,20 +88,25 @@
 prefix_ids3 = tokenizer("\n" + response_prefix, add_special_tokens=False)["input_ids"]
 suffix_ids3 = tokenizer("\n" + response_suffix, add_special_tokens=False)["input_ids"]
 
+prefix_ids4 = tokenizer(response_prefix.strip(), add_special_tokens=False)["input_ids"]
+suffix_ids4 = tokenizer(response_suffix.strip(), add_special_tokens=False)["input_ids"]
+
 print(f"Estimated tokens for {model}")
 
 print("response prefix:")
 print(response_prefix)
 print("tokens with no leading whitespace:", prefix_ids)
 print("tokens with leading whitespace:", prefix_ids2)
 print("tokens with leading newline:", prefix_ids3)
+print("tokens with stripped whitespace:", prefix_ids4)
 
-print("---------------")
+print("-" * 100)
 print("response suffix:")
 print(response_suffix)
 print("tokens with no leading whitespace:", suffix_ids)
 print("tokens with leading whitespace:", suffix_ids2)
 print("tokens with leading newline:", suffix_ids3)
+print("tokens with stripped whitespace:", suffix_ids4)
 
 
 def _find_mask_ranges(input_ids, prefix_ids, suffix_ids):
@@ -105,28 +160,52 @@
 
     return inverse_ranges
 
-label = tokenizer.apply_chat_template(
-    conversation=[
-        {"role": "system", "content": "this is a system prompt"},
-        {"role": "user", "content": "a user request goes here"},
-        {"role": "assistant", "content": "the response is in here"}],
-    add_generation_prompt=False,
-)
+try:
+    label = tokenizer.apply_chat_template(
+        conversation=[
+            {"role": "system", "content": "this is a system prompt"},
+            {"role": "user", "content": "a user request goes here"},
+            {"role": "assistant", "content": "the response is in here"}
+        ],
+        add_generation_prompt=False,
+    )
+except TemplateError:
+    # some chat templates don't have a system prompt option
+    label = tokenizer.apply_chat_template(
+        conversation=[
+            {"role": "user", "content": "a user request goes here"},
+            {"role": "assistant", "content": "the response is in here"}
+        ],
+        add_generation_prompt=False,
+    )
 
 
 def check_range(label, name, prefix_ids, suffix_ids):
     label = label[:]
     mask_ranges = _find_mask_ranges(label, prefix_ids, suffix_ids)
+    found = False
     for start, end in mask_ranges:
         if end - start == len(label) - 1:
             print(f"'{name}' did not find the assistant response")
         else:
-            print(f"'{name}' found the assistant response!")
-            print(f"\t--prefix-ids {','.join([str(x) for x in prefix_ids])}")
-            print(f"\t--suffix-ids {','.join([str(x) for x in suffix_ids])}")
-            break
-
-print("---------------")
-check_range(label, "no whitespace", prefix_ids, suffix_ids)
+            found = True
+            # label[start:end] = [-100] * (end - start)
+
+    # assistant_tokens = [x for x in label if x != -100]
+    # decoded_string = tokenizer.decode(assistant_tokens)
+    # expected_decoded_string = "the response is in here" + tokenizer.decode(suffix_ids)
+    # if decoded_string == expected_decoded_string:
+    #     found = True
+
+    if found:
+        print(f"'{name}' found the assistant response!")
+        print(f"\t--prefix_ids {','.join([str(x) for x in prefix_ids])}")
+        print(f"\t--suffix_ids {','.join([str(x) for x in suffix_ids])}")
+    # else:
+    #     print(f"'{decoded_string}' != '{expected_decoded_string}'")
+
+print("-" * 100)
+check_range(label, "no added whitespace", prefix_ids, suffix_ids)
 check_range(label, "leading space", prefix_ids2, suffix_ids2)
 check_range(label, "leading newline", prefix_ids3, suffix_ids3)
+check_range(label, "stripped whitespace", prefix_ids4, suffix_ids4)