Add a flag that will exit the HTTP server after just one query.
This is for some experiments I want to run, to kill the server gracefully
whenever I pull the logits out of it from a Python script.
Noeda committed Mar 21, 2023
1 parent 957a8f9 commit 26f343a
Showing 2 changed files with 16 additions and 0 deletions.
README.md: 3 additions, 0 deletions
@@ -115,6 +115,9 @@ The command line flags for this are:
   calculations should be cached. Default is 50. This speeds up token
   generation for prompts that were already requested before; however, it
   also increases memory use as the cache fills up.
+* `--inference-server-exit-after-one-query` will make the server exit with
+  exit code 0 after it has served one HTTP query. This is used for
+  troubleshooting and experiments.
 
 Prompts and flags related to token sampling are all ignored in inference server
 mode. Instead, they are obtained from each HTTP JSON API request.
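The JSON request format itself is not spelled out in this commit, but the session fields it touches (`req_max_seq_len`, `req_max_new_tokens`, `no_token_sampling`, `stop_at_end_token`) hint at the shape of a request. Below is a hedged sketch of what a request body might deserialize into, assuming serde with the `derive` feature; every field name here is an assumption for illustration, not the confirmed rllama API.

```rust
use serde::{Deserialize, Serialize};

// Hypothetical request body, inferred from the session fields in this
// commit; the names are assumptions, not the documented rllama API.
#[derive(Serialize, Deserialize)]
struct InferenceRequest {
    prompt: String,          // assumed: the text to continue
    max_seq_len: usize,      // assumed: feeds req_max_seq_len
    max_new_tokens: usize,   // assumed: feeds req_max_new_tokens
    no_token_sampling: bool, // assumed: return raw predictions, no sampling
    stop_at_end_token: bool, // assumed: stop at the end-of-text token
}
```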
src/rllama_main.rs: 13 additions, 0 deletions
@@ -74,6 +74,9 @@ struct Cli {
 
     #[arg(long)]
     inference_server_prompt_cache_size: Option<usize>,
+
+    #[arg(long, action)]
+    inference_server_exit_after_one_query: bool,
 }
 
 #[derive(Clone, Serialize, Deserialize)]
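For readers unfamiliar with clap's derive API: a `bool` field marked `#[arg(long)]` becomes a set-true flag whose kebab-case name is derived from the field name, defaulting to `false`; the extra `action` in the diff appears to date from clap's 3.x-to-4 transition and requests the same set-true behavior. A minimal standalone sketch of that behavior, assuming clap 4 with the `derive` feature:

```rust
use clap::Parser;

#[derive(Parser)]
struct Cli {
    /// Exit with code 0 after serving one HTTP query.
    #[arg(long)]
    inference_server_exit_after_one_query: bool,
}

fn main() {
    // Passing --inference-server-exit-after-one-query on the command line
    // sets the field to true; omitting it leaves the default, false.
    let cli = Cli::parse();
    println!("{}", cli.inference_server_exit_after_one_query);
}
```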
@@ -337,6 +340,7 @@ fn server_inference(
         attention_cache_repository: Arc::new(RwLock::new(AttentionCacheRepository::empty(
             inference_server_prompt_cache_size,
         ))),
+        exit_after_one_query: cli.inference_server_exit_after_one_query,
     });
 
     app.launch();
@@ -382,6 +386,7 @@ struct GeneratingSession {
     no_token_sampling: bool,
     stop_at_end_token: bool,
     sent_stuff_last_time: bool,
+    exit_after_one_query: bool,
     result: Vec<u8>, // stores JSONL lines to be returned from read()
 }

@@ -429,9 +434,15 @@ impl Read for GeneratingSession {
             return Ok(bytes_read);
         }
         if self.tokens.len() >= self.req_max_seq_len {
+            if self.exit_after_one_query {
+                std::process::exit(0);
+            }
             return Ok(0);
         }
         if self.new_tokens_generated >= self.req_max_new_tokens {
+            if self.exit_after_one_query {
+                std::process::exit(0);
+            }
             return Ok(0);
         }
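The placement of `std::process::exit(0)` is the heart of the change: returning `Ok(0)` from `read` is how a `Read` implementation signals end-of-stream, so the process exits exactly where the response would otherwise end. A stripped-down sketch of the pattern, using stand-in types rather than rllama's actual ones:

```rust
use std::io::Read;

// A Read source that, once drained, terminates the whole process with
// exit code 0 instead of signalling EOF with Ok(0).
struct OneShotSource {
    data: Vec<u8>,
    pos: usize,
    exit_after_one_query: bool,
}

impl Read for OneShotSource {
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        if self.pos >= self.data.len() {
            if self.exit_after_one_query {
                // Everything has been streamed out; kill the server with a
                // clean exit code so the calling script knows we finished.
                std::process::exit(0);
            }
            return Ok(0); // normal EOF: the server keeps running
        }
        let n = buf.len().min(self.data.len() - self.pos);
        buf[..n].copy_from_slice(&self.data[self.pos..self.pos + n]);
        self.pos += n;
        Ok(n)
    }
}

fn main() {
    let mut src = OneShotSource {
        data: b"{\"token\":\"hello\"}\n".to_vec(),
        pos: 0,
        exit_after_one_query: true,
    };
    let mut out = Vec::new();
    // Never returns: the process exits once src is drained.
    src.read_to_end(&mut out).unwrap();
}
```

Exiting inside `read` drops the connection rather than finishing the HTTP response cleanly, which is presumably acceptable for the troubleshooting use the README and commit message describe: the harness only needs the streamed output and the exit code.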

@@ -575,6 +586,7 @@ struct InferenceServerState {
     max_seq_len: usize,
     concurrent_requests_semaphore: Semaphore,
     attention_cache_repository: Arc<RwLock<AttentionCacheRepository>>,
+    exit_after_one_query: bool,
 }
 
 #[cfg(feature = "server")]
@@ -652,6 +664,7 @@ fn handle_request(
         no_token_sampling: no_token_sampling,
         stop_at_end_token: stop_at_end_token,
         sent_stuff_last_time: false,
+        exit_after_one_query: state.exit_after_one_query,
         result: Vec::new(),
     };
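Taken together, the change threads one boolean through three layers: the CLI struct, the shared `InferenceServerState`, and each per-request `GeneratingSession`. The same pattern in miniature, with hypothetical stand-in types rather than rllama's real ones:

```rust
use std::sync::Arc;

// Stand-in types illustrating how the flag flows; not rllama's real API.
struct Cli {
    exit_after_one_query: bool,
}

struct ServerState {
    exit_after_one_query: bool, // shared, read-only after startup
}

struct Session {
    exit_after_one_query: bool, // copied into each request
}

fn main() {
    let cli = Cli { exit_after_one_query: true };
    // One shared state for the whole server...
    let state = Arc::new(ServerState {
        exit_after_one_query: cli.exit_after_one_query,
    });
    // ...and a plain copy into each request's session, so the Read
    // implementation never needs to reach back into shared state.
    let session = Session {
        exit_after_one_query: state.exit_after_one_query,
    };
    assert!(session.exit_after_one_query);
}
```

Copying the flag into the session, rather than handing the session a reference to the shared state, keeps `GeneratingSession` self-contained; copying a `bool` costs nothing.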

