2 changes: 1 addition & 1 deletion .github/workflows/linux.yml
@@ -782,7 +782,7 @@ jobs:
run: |
source ${{ env.INSTALL_DIR }}/setupvars.sh
chmod +x ${{ env.INSTALL_DIR }}/tests/tests_continuous_batching
${{ env.INSTALL_DIR }}/tests/tests_continuous_batching --gtest_filter="-AddSecondInputTest.*"
${{ env.INSTALL_DIR }}/tests/tests_continuous_batching --gtest_filter="-AddSecondInputTest.*" --gtest_filter="DeepSeekR1ReasoningParserTest.*" --gtest_filter="ParserTest.*"

- name: Test Continuous Batching Tools
if: ${{ fromJSON(needs.smart_ci.outputs.affected_components).continuous_batching }}
2 changes: 1 addition & 1 deletion .github/workflows/mac.yml
@@ -695,7 +695,7 @@ jobs:
run: |
source ${{ env.INSTALL_DIR }}/setupvars.sh
chmod +x ${{ env.INSTALL_DIR }}/tests/tests_continuous_batching
${{ env.INSTALL_DIR }}/tests/tests_continuous_batching --gtest_filter="-AddSecondInputTest.*"
${{ env.INSTALL_DIR }}/tests/tests_continuous_batching --gtest_filter="-AddSecondInputTest.*" --gtest_filter="DeepSeekR1ReasoningParserTest.*" --gtest_filter="ParserTest.*

- name: Test C++ Tools
run: |
2 changes: 1 addition & 1 deletion .github/workflows/windows.yml
@@ -866,7 +866,7 @@ jobs:
- name: gtests unit tests
run: |
. "${{ env.INSTALL_DIR }}/setupvars.ps1"
& "${{ env.INSTALL_DIR }}/tests/tests_continuous_batching.exe" --gtest_filter="-AddSecondInputTest.*"
& "${{ env.INSTALL_DIR }}/tests/tests_continuous_batching.exe" --gtest_filter="-AddSecondInputTest.*" --gtest_filter="DeepSeekR1ReasoningParserTest.*" --gtest_filter="ParserTest.*

- name: Test C++ Tools
run: |
1 change: 1 addition & 0 deletions samples/cpp/text_generation/CMakeLists.txt
@@ -29,6 +29,7 @@ set (SAMPLE_LIST
lora_greedy_causal_lm
multinomial_causal_lm
prompt_lookup_decoding_lm
parsed_output_sample
speculative_decoding_lm)

foreach(sample IN LISTS SAMPLE_LIST)
52 changes: 52 additions & 0 deletions samples/cpp/text_generation/parsed_output_sample.cpp
@@ -0,0 +1,52 @@
// Copyright (C) 2023-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include "openvino/genai/llm_pipeline.hpp"
#include "openvino/genai/parsers.hpp"
#include "openvino/genai/text_streamer.hpp"
Copilot AI commented on Sep 25, 2025:
The ParsingState type is used in a using declaration but is not defined in any of the included headers. This will result in a compilation error.
Suggested change:
#include "openvino/genai/text_streamer.hpp"
#include "openvino/genai/parsing_state.hpp"

class CurrentStreamer : public ov::genai::TextParserStreamer {
public:
    CurrentStreamer(const ov::genai::Tokenizer& tokenizer)
        : ov::genai::TextParserStreamer(tokenizer) {}
    ov::genai::StreamingStatus write(ov::genai::ParsedMessage& message) override {
        std::cout << message["content"].get_string() << std::flush;
        return ov::genai::StreamingStatus::RUNNING;
    }
};


int main(int argc, char* argv[]) try {
    if (argc < 2 || argc > 3) {
        throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> [<DEVICE>]");
    }
    std::string prompt = "<|begin▁of▁sentence|><|User|>Why is the sky blue?<|Assistant|><think>";
    std::string models_path = argv[1];

    // Default device is CPU; it can be overridden by the second argument.
    std::string device = (argc == 3) ? argv[2] : "CPU";  // GPU and NPU can be used as well.
    ov::genai::LLMPipeline pipe(models_path, device);

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 1000;

    auto tok = pipe.get_tokenizer();
    std::shared_ptr<CurrentStreamer> streamer = std::make_shared<CurrentStreamer>(tok);

    pipe.generate(prompt, config, streamer);
} catch (const std::exception& error) {
    try {
        std::cerr << error.what() << '\n';
    } catch (const std::ios_base::failure&) {}
    return EXIT_FAILURE;
} catch (...) {
    try {
        std::cerr << "Non-exception object thrown\n";
    } catch (const std::ios_base::failure&) {}
    return EXIT_FAILURE;
}
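
A possible follow-up in the sample (a sketch, not part of this diff): after generate() returns, the message accumulated during streaming can be read back through get_parsed_message(); the "content" accessor follows the usage in write() above.

    // After pipe.generate(...), read back the accumulated parse result:
    ov::genai::ParsedMessage parsed = streamer->get_parsed_message();
    std::cout << "\nFull content: " << parsed["content"].get_string() << std::endl;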
33 changes: 33 additions & 0 deletions samples/python/text_generation/chat_sample.py
@@ -36,3 +36,36 @@ def main():

if '__main__' == __name__:
    main()

# Design sketch (not executable as-is): illustrates the planned parser API.
pipe = openvino_genai.LLMPipeline(args.model_dir, device)

prompt = "What is the weather in New York today?"
res = pipe.generate(prompt, max_new_tokens=100, streamer=streamer)
print(res.texts[0])

res.parsed['tool_calls']

class LlamaToolCallParser(ParserBase):
    def parse(self, parsed_data: ParsedData) -> ParsedData:
        # Process parsed_data, e.g. extract tool calls or other fields from content.
        return new_parsed_output

llama_parser = LlamaToolCallParser()
# A parser can be passed as an instance or by its registered name.
res = pipe.generate(prompt, parsers=[llama_parser], max_new_tokens=100)  # or parsers=["Llama32PythonicToolParser"]

# At the beginning msg['original_content'] is filled with the full text.
msg = res.texts[i]
for parser in m_parsers:
    msg = parser.parse(msg)

# At the end msg is filled with all parsed fields:
parsed_data = {
    'original_content': '<|system|>You are a helpful assistant... I will call the `get_weather` function with the location… \n\nfunctools[{"name": "get_weather", "arguments": {"location": "New York", "unit": "celsius"}}]<|end|>',
    'content': 'blah blah',
    'reasoning_content': '',
    'tool_calls': '[{"name":"get_weather","arguments":{"location":"New York, NY","unit":"celsius"}}]',
}

res.parsed: ParsedData
4 changes: 4 additions & 0 deletions src/cpp/include/openvino/genai/generation_config.hpp
@@ -13,6 +13,7 @@
#include "openvino/genai/tokenizer.hpp"
#include "openvino/genai/scheduler_config.hpp"
#include "openvino/genai/lora_adapter.hpp"
#include "openvino/genai/parsers.hpp"

namespace ov {
namespace genai {
@@ -348,6 +349,9 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
bool is_prompt_lookup() const;
bool is_structured_output_generation() const;

// Parsers to apply to the generated text: either registered parser names or ParserBase instances.
std::vector<std::variant<std::string, std::shared_ptr<ParserBase>>> parsers;

OPENVINO_DEPRECATED("Please, use `is_assisting_generation()` instead of `is_speculative_decoding()`. This method will be removed in 2026.0.0 release")
bool is_speculative_decoding() const;

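
A minimal usage sketch for the new parsers field (assumptions: the registered name "Llama32PythonicToolParser" from parsers.hpp, a placeholder "model_dir" path, and the JsonContainer accessors used in the sample above — not part of this diff):

    #include <iostream>

    #include "openvino/genai/llm_pipeline.hpp"
    #include "openvino/genai/parsers.hpp"

    int main() {
        ov::genai::LLMPipeline pipe("model_dir", "CPU");

        ov::genai::GenerationConfig config;
        config.max_new_tokens = 100;
        // Parsers can be given by registered name or as ParserBase instances.
        config.parsers = {std::string("Llama32PythonicToolParser")};
        // config.parsers = {std::make_shared<ov::genai::Llama32PythonicToolParser>()};

        auto res = pipe.generate("What is the weather in New York today?", config);
        // Parsed fields are expected in res.parsed (see llm/pipeline.cpp below).
        std::cout << res.parsed[0]["tool_calls"].get_string() << std::endl;
    }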
3 changes: 3 additions & 0 deletions src/cpp/include/openvino/genai/llm_pipeline.hpp
@@ -15,6 +15,7 @@
#include "openvino/genai/perf_metrics.hpp"
#include "openvino/genai/scheduler_config.hpp"
#include "openvino/genai/common_types.hpp"
#include "openvino/genai/json_container.hpp"

namespace ov {
namespace genai {
@@ -68,6 +69,8 @@ class DecodedResults {
std::vector<float> scores;
PerfMetrics perf_metrics;
std::shared_ptr<ExtendedPerfMetrics> extended_perf_metrics;
std::vector<JsonContainer> parsed;

// @brief Convert DecodedResults to a string.
operator std::string() const {
111 changes: 111 additions & 0 deletions src/cpp/include/openvino/genai/parsers.hpp
@@ -0,0 +1,111 @@
// Copyright (C) 2023-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once
#include <string>
#include <memory>
#include <variant>
#include <map>
#include <functional>
#include <optional>
#include <vector>
#include "openvino/genai/json_container.hpp"

namespace ov {
namespace genai {

using ParsedMessage = JsonContainer;

class IncrementalParserBase {
public:
    IncrementalParserBase() = default;
    virtual ~IncrementalParserBase() = default;

    // Returns the filtered text to be appended to the message content.
    virtual std::string parse(
        ParsedMessage& msg,
        const std::string& previous_text,
        std::string& delta_text,
        const std::optional<std::vector<int64_t>>& previous_tokens = std::nullopt,
        const std::optional<std::vector<int64_t>>& delta_tokens = std::nullopt
    ) = 0;

    virtual bool is_active() const = 0;
    static std::shared_ptr<IncrementalParserBase> get_parser(std::string name);
};

// Forward declaration
class ReasoningParserImpl;

class ReasoningParser : public IncrementalParserBase {
private:
    std::shared_ptr<ReasoningParserImpl> m_impl;
public:
    ReasoningParser(bool starts_with_thinking = true,
                    bool keep_original_content = true);

    std::string parse(
        ParsedMessage& msg,
        const std::string& previous_text,
        std::string& delta_text,
        const std::optional<std::vector<int64_t>>& previous_tokens = std::nullopt,
        const std::optional<std::vector<int64_t>>& delta_tokens = std::nullopt
    ) override;
    bool is_active() const override;
};

class DeepSeekR1ReasoningParser : public ReasoningParser {
public:
    DeepSeekR1ReasoningParser(bool starts_with_thinking = true) : ReasoningParser(starts_with_thinking) {}
    static std::string name() { return "DeepSeekR1ReasoningParser"; }
};

class Phi4ReasoningParser : public ReasoningParser {
public:
    Phi4ReasoningParser(bool starts_with_thinking = false) : ReasoningParser(starts_with_thinking) {}
    static std::string name() { return "Phi4ReasoningParser"; }
};

class ParserBase {
public:
    ParserBase() = default;
    virtual ~ParserBase() = default;

    virtual ParsedMessage parse(ParsedMessage& msg) = 0;
    static std::shared_ptr<ParserBase> get_parser(std::string name);
};

using ParserVariant = std::variant<std::shared_ptr<IncrementalParserBase>, std::string>;

// Does not modify the original content; only extracts and adds tool calls.
class Llama32PythonicToolParser : public ParserBase {
public:
    // TODO: Check that vLLM has the same default.
    Llama32PythonicToolParser(bool keep_original_content = true) : m_keep_original_content(keep_original_content) {}

    ParsedMessage parse(ParsedMessage& input) override;
    static std::string name() { return "Llama32PythonicToolParser"; }
private:
    bool m_keep_original_content = true;
};

class BaseReasoningParser : public ParserBase {
public:
    BaseReasoningParser(bool expect_open_tag = true,
                        bool keep_original_content = true,
                        std::string open_tag = "<think>",
                        std::string close_tag = "</think>") :
        m_expect_open_tag(expect_open_tag),
        m_keep_original_content(keep_original_content),
        m_open_tag(open_tag),
        m_close_tag(close_tag) {}

    ParsedMessage parse(ParsedMessage& input) override;

private:
    bool m_expect_open_tag = true;
    bool m_keep_original_content = true;
    std::string m_open_tag = "<think>";
    std::string m_close_tag = "</think>";
};


} // namespace genai
} // namespace ov
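
For illustration, a minimal custom parser against this interface (a sketch; FunctoolsToolParser is hypothetical, and the operator[]/get_string() usage follows the sample above):

    #include <string>

    #include "openvino/genai/parsers.hpp"

    // Hypothetical parser: moves a trailing "functools[...]" payload out of
    // "content" and into "tool_calls", similar in spirit to Llama32PythonicToolParser.
    class FunctoolsToolParser : public ov::genai::ParserBase {
    public:
        ov::genai::ParsedMessage parse(ov::genai::ParsedMessage& input) override {
            std::string content = input["content"].get_string();
            const std::string marker = "functools";
            auto pos = content.find(marker);
            if (pos != std::string::npos) {
                input["tool_calls"] = content.substr(pos + marker.size());
                input["content"] = content.substr(0, pos);
            }
            return input;
        }
        static std::string name() { return "FunctoolsToolParser"; }
    };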
17 changes: 17 additions & 0 deletions src/cpp/include/openvino/genai/text_streamer.hpp
@@ -5,6 +5,7 @@

#include "openvino/genai/streamer_base.hpp"
#include "openvino/genai/tokenizer.hpp"
#include "openvino/genai/parsers.hpp"

namespace ov {
namespace genai {
@@ -46,5 +47,21 @@ class OPENVINO_GENAI_EXPORTS TextStreamer : public StreamerBase {
void compute_decoded_length_for_position(size_t cache_position);
};

class OPENVINO_GENAI_EXPORTS TextParserStreamer : public TextStreamer {
public:
    TextParserStreamer(const Tokenizer& tokenizer, std::vector<ParserVariant> parsers = {});

    virtual StreamingStatus write(ParsedMessage& message) = 0;

    CallbackTypeVariant write(std::string message);

    ParsedMessage get_parsed_message() const { return m_parsed_message; }
    std::vector<std::shared_ptr<IncrementalParserBase>> get_parsers() const { return m_parsers; }
private:
    ParsedMessage m_parsed_message;
    std::string m_text_buffer;
    std::vector<std::shared_ptr<IncrementalParserBase>> m_parsers;
};

} // namespace genai
} // namespace ov
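
A sketch of a streamer wired with an incremental parser (assumes TextParserStreamer runs its parsers over each decoded chunk before dispatching to write(), and that the reasoning parser routes "<think>...</think>" text into "reasoning_content" rather than "content"):

    #include <iostream>

    #include "openvino/genai/text_streamer.hpp"

    // Streams only the final answer; reasoning text is filtered out by the parser.
    class AnswerOnlyStreamer : public ov::genai::TextParserStreamer {
    public:
        explicit AnswerOnlyStreamer(const ov::genai::Tokenizer& tokenizer)
            : ov::genai::TextParserStreamer(
                  tokenizer, {std::make_shared<ov::genai::DeepSeekR1ReasoningParser>()}) {}

        ov::genai::StreamingStatus write(ov::genai::ParsedMessage& message) override {
            std::cout << message["content"].get_string() << std::flush;
            return ov::genai::StreamingStatus::RUNNING;
        }
    };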
4 changes: 2 additions & 2 deletions src/cpp/src/continuous_batching/pipeline.cpp
@@ -58,7 +58,7 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::p

auto model = utils::read_model(models_path, properties);
auto [properties_without_draft_model_without_gguf, enable_save_ov_model] = utils::extract_gguf_properties(properties_without_draft_model);
properties_without_draft_model_without_gguf[ov::cache_model_path.name()] = models_path;
// properties_without_draft_model_without_gguf[ov::cache_model_path.name()] = models_path;
auto tokenizer = ov::genai::Tokenizer(models_path, tokenizer_properties);
auto generation_config = utils::from_config_json_if_exists(models_path);

@@ -98,7 +98,7 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(

auto model = utils::read_model(models_path, properties_without_draft_model);
auto [properties_without_draft_model_without_gguf, enable_save_ov_model] = utils::extract_gguf_properties(properties_without_draft_model);
properties_without_draft_model_without_gguf[ov::cache_model_path.name()] = models_path;
// properties_without_draft_model_without_gguf[ov::cache_model_path.name()] = models_path;

auto generation_config = utils::from_config_json_if_exists(models_path);

1 change: 1 addition & 0 deletions src/cpp/src/generation_config.cpp
@@ -153,6 +153,7 @@ void GenerationConfig::update_generation_config(const ov::AnyMap& properties) {

// Structured output
read_anymap_param(properties, "structured_output_config", structured_output_config);
read_anymap_param(properties, "parsers", parsers);
}


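
Since "parsers" is now read from the properties map, it should also be reachable through the AnyMap generate() overload — a sketch (assumes ov::Any holding the field's exact vector type, and a placeholder "model_dir"):

    #include "openvino/genai/llm_pipeline.hpp"
    #include "openvino/genai/parsers.hpp"

    int main() {
        ov::genai::LLMPipeline pipe("model_dir", "CPU");

        std::vector<std::variant<std::string, std::shared_ptr<ov::genai::ParserBase>>> parsers{
            std::string("Llama32PythonicToolParser")};

        // "parsers" is picked up by GenerationConfig::update_generation_config above.
        auto res = pipe.generate("What is the weather in New York today?",
                                 ov::AnyMap{{"parsers", parsers}});
    }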
48 changes: 47 additions & 1 deletion src/cpp/src/llm/pipeline.cpp
@@ -205,7 +205,53 @@ DecodedResults LLMPipeline::generate(
StringInputs inputs,
OptionalGenerationConfig generation_config,
StreamerVariant streamer) {
return m_pimpl->generate(inputs, generation_config, streamer);
auto res = m_pimpl->generate(inputs, generation_config, streamer);

    // If the streamer is a TextParserStreamer, collect its parsed message.
    if (auto streamer_obj = std::get_if<std::shared_ptr<StreamerBase>>(&streamer)) {
        if (auto parser_streamer = std::dynamic_pointer_cast<TextParserStreamer>(*streamer_obj)) {
            res.parsed.resize(res.texts.size());
            res.parsed[0] = parser_streamer->get_parsed_message();
        }
    }

    if (!generation_config.has_value() || (*generation_config).parsers.empty()) {
        return res;
    }

    // Resolve parser names to registered instances.
    std::vector<std::shared_ptr<ParserBase>> parsers;
    for (auto& parser_variant : (*generation_config).parsers) {
        if (std::holds_alternative<std::string>(parser_variant)) {
            auto parser_name = std::get<std::string>(parser_variant);
            auto parser = ParserBase::get_parser(parser_name);
            if (!parser) {
                OPENVINO_THROW("Parser with name ", parser_name, " is not registered");
            }
            parsers.push_back(parser);
        } else if (std::holds_alternative<std::shared_ptr<ParserBase>>(parser_variant)) {
            parsers.push_back(std::get<std::shared_ptr<ParserBase>>(parser_variant));
        }
    }

    res.parsed.resize(res.texts.size());

    // Apply base parsers sequentially even if an incremental parser has already run.
    for (size_t i = 0; i < res.texts.size(); ++i) {
        ParsedMessage msg;
        msg["content"] = res.texts[i];
        for (auto& parser : parsers) {
            // TODO: check if is_active() is needed here
            // TODO: check the state of the incremental parser and reset if necessary
            msg = parser->parse(msg);
        }
        res.parsed[i] = msg;
    }

    return res;
}

DecodedResults LLMPipeline::generate(StringInputs text, const ov::AnyMap& config_map) {