    "max_output_tokens",
    "max_input_tokens",
    "use_phoenix",
+    "via_streaming",
    "min_delay",  # minimum delay between queries for that model
]
@@ -641,6 +642,7 @@ def query(
    debug=False,
    litellm_debug=None,
    stream=False,
+    via_stream=False,
    recursive_call_info: Optional[Dict[str, any]] = None,
    **kwargs,
) -> Dict[str, any]:
@@ -657,8 +659,10 @@ def query(
        return_response: whether or not the complete reponse should get returned
        debug: if True, emits debug messages to aid development and debugging
        litellm_debug: if True, litellm debug logging is enabled, if False, disabled, if None, use debug setting
-        stream: if True, the returned object containst the stream that can be iterated over. Streaming
+        stream: if True, the returned object contains the stream that can be iterated over. Streaming
            may not work for all models.
+        via_stream: if True, the stream parameter is ignored and the response data is retrieved internally via streaming.
+            This may be useful if the non-streaming response keeps timing out.
        recursive_call_info: internal use only
        kwargs: any additional keyword arguments to pass on to the LLM
@@ -708,7 +712,10 @@ def query(
        fmap = toolnames2funcs(tools)
    else:
        fmap = {}
-    if stream:
+    if via_stream:
+        # TODO: check if model supports streaming
+        completion_kwargs["stream"] = True
+    elif stream:
        # TODO: check if model supports streaming
        # if streaming is enabled, we always return the original response
        return_response = True
@@ -743,7 +750,36 @@
        model=llm["llm"],
        messages=messages,
        **completion_kwargs)
-    if stream:
+    if via_stream:
+        # retrieve the response using streaming, return once we have everything
+        try:
+            answer = ""
+            for chunk in response:
+                choice0 = chunk["choices"][0]
+                if choice0.finish_reason == "stop":
+                    logger.debug(f"DEBUG: streaming got stop. Chunk {chunk['index']}: {chunk['value']}")
+                    break
+                content = choice0["delta"].get("content", "")
+                logger.debug(f"DEBUG: streaming content: {content}")
+                answer += content
+            ret["answer"] = answer
+            if return_response:
+                ret["response"] = response
+            ret["cost"] = None
+            ret["elapsed_time"] = time.time() - start
+            ret["ok"] = True
+            ret["error"] = ""
+            return ret
+        except Exception as e:
+            tb = traceback.extract_tb(e.__traceback__)
+            filename, lineno, funcname, text = tb[-1]
+            ret["error"] = str(e) + f" in {filename}:{lineno} {funcname}"
+            if debug:
+                logger.error(f"Returning error: {e}")
+            ret["answer"] = ""
+            ret["ok"] = False
+            return ret
+    elif stream:
        def chunk_generator(model_generator, retobj):
            try:
                for chunk in model_generator:
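A minimal call sketch of the new flag. Only via_stream and the returned dictionary keys (ok, answer, error) are taken from the diff above; the prompt-style first argument is an assumption, since the full query() signature is not shown here.

# Hypothetical usage sketch (not part of the diff): fall back to internal
# streaming when the plain non-streaming request keeps timing out.
# The first argument is an assumed prompt parameter; only via_stream and
# the returned dict keys come from the change above.
result = query(
    "Summarize the incident report in three bullet points.",
    via_stream=True,  # answer is assembled internally from stream chunks
)
if result["ok"]:
    print(result["answer"])
else:
    print("query failed:", result["error"])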