from litellm import completion, completion_cost, token_counter
from litellm.utils import get_model_info, get_supported_openai_params, supports_response_schema
from litellm.utils import supports_function_calling, supports_parallel_function_calling
+ from litellm._logging import _enable_debugging as litellm_enable_debugging
+ from litellm._logging import _disable_debugging as litellm_disable_debugging
from llms_wrapper.utils import dict_except
from llms_wrapper.model_list import model_list
@@ -642,8 +644,8 @@ def query(
debug=False,
litellm_debug=None,
stream=False,
- via_stream=False,
- recursive_call_info: Optional[Dict[str, any]] = None,
+ via_streaming=False,
+ recursive_call_info: Optional[Dict[str, any]] = None,
**kwargs,
) -> Dict[str, any]:
"""
@@ -661,7 +663,7 @@ def query(
litellm_debug: if True, litellm debug logging is enabled; if False, disabled; if None, use the debug setting
stream: if True, the returned object contains the stream that can be iterated over. Streaming
    may not work for all models.
- via_stream: if True, ignores the stream parameters, the response data is retrieved internally via streaming.
+ via_streaming: if True, ignores the stream parameter; the response data is retrieved internally via streaming.
    This may be useful if the non-streaming response keeps timing out.
recursive_call_info: internal use only
kwargs: any additional keyword arguments to pass on to the LLM
@@ -672,12 +674,30 @@ def query(
    otherwise answer contains the response and error is the empty string.
    The boolean key "ok" is True if there is no error, False otherwise.
"""
+ def cleaned_args(args: dict):
+     """If there is an API key in the dict, censor it"""
+     args = args.copy()
+     if "api_key" in args:
+         args["api_key"] = "***"
+     return args
if self.debug:
    debug = True
if litellm_debug is None and debug or litellm_debug:
    # litellm.set_verbose = True  ## deprecated!
    os.environ['LITELLM_LOG'] = 'DEBUG'
+     litellm_enable_debugging()
+     litellm._turn_on_debug()
+ else:
+     # make sure we turn off debugging if it is still on from a previous call
+     litellm_disable_debugging()
+     os.environ['LITELLM_LOG'] = 'INFO'
llm = self.llms[llmalias].config
+ logger.debug(f"llm config: {cleaned_args(llm)}")
+ # allow specifying via_streaming and stream in the llm config as well; a value in the config overrides the call
+ if "via_streaming" in llm and llm["via_streaming"]:
+     via_streaming = True
+ if "stream" in llm and llm["stream"]:
+     stream = True
if not messages:
    raise ValueError(f"Error: No messages to send to the LLM: {llmalias}, messages: {messages}")
if debug:
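For illustration, here is a minimal usage sketch of the query() API documented above, including the new via_streaming option. The wrapper class name LLMS, its constructor arguments, and the alias are assumptions for the sketch; the returned dict fields (ok, answer, error, n_chunks) are taken from this diff.

    # Hypothetical usage sketch; class name, constructor, and alias are assumptions.
    from llms_wrapper.llms import LLMS

    llms = LLMS(dict(llms=[dict(llm="openai/gpt-4o", alias="gpt4")]))
    ret = llms.query(
        "gpt4",
        messages=[dict(role="user", content="Say hello")],
        via_streaming=True,  # retrieve the answer internally via streaming
    )
    if ret["ok"]:
        print(ret["answer"], "retrieved in", ret["n_chunks"], "chunks")
    else:
        print("Error:", ret["error"])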
@@ -688,6 +708,8 @@ def query(
    KNOWN_LLM_CONFIG_FIELDS,
    ignore_underscored=True,
)
+ logger.debug(f"Options: via_streaming: {via_streaming}, stream: {stream}")
+ logger.debug(f"Initial completion kwargs: {cleaned_args(completion_kwargs)}")
if recursive_call_info is None:
    recursive_call_info = {}
if llm.get("api_key"):
@@ -712,22 +734,24 @@ def query(
    fmap = toolnames2funcs(tools)
else:
    fmap = {}
- if via_stream:
+ if via_streaming:
    # TODO: check if model supports streaming
    completion_kwargs["stream"] = True
+     logger.debug(f"completion kwargs after detecting via_streaming: {cleaned_args(completion_kwargs)}")
elif stream:
    # TODO: check if model supports streaming
    # if streaming is enabled, we always return the original response
    return_response = True
    completion_kwargs["stream"] = True
+     logger.debug(f"completion kwargs after detecting stream: {cleaned_args(completion_kwargs)}")
ret = {}
# before adding the kwargs, save the recursive_call_info and remove it from kwargs
if debug:
-     print(f"DEBUG: Received recursive call info: {recursive_call_info}")
+     logger.debug(f"Received recursive call info: {recursive_call_info}")
if kwargs:
    completion_kwargs.update(dict_except(kwargs, KNOWN_LLM_CONFIG_FIELDS, ignore_underscored=True))
if debug:
-     print(f"DEBUG: Calling completion with kwargs {completion_kwargs}")
+     logger.debug(f"Calling query with completion kwargs: {cleaned_args(completion_kwargs)}")
# if we have min_delay set, we look at the _last_request_time for the LLM and calculate the time
# to wait until we can send the next request and then just wait
min_delay = llm.get("min_delay", kwargs.get("min_delay", 0.0))
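The min_delay throttling described in the comment above can be read as the following standalone sketch. The helper function itself is hypothetical, but the _last_request_time bookkeeping mirrors the names used in the diff.

    import time

    def wait_for_min_delay(llm: dict, min_delay: float) -> None:
        # Hypothetical helper: if the previous request was sent less than
        # min_delay seconds ago, sleep for the remaining time, then record
        # the time of the new request in the llm config dict.
        last = llm.get("_last_request_time")
        if last is not None and min_delay > 0.0:
            remaining = min_delay - (time.time() - last)
            if remaining > 0:
                time.sleep(remaining)
        llm["_last_request_time"] = time.time()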
@@ -749,23 +773,29 @@ def query(
response = litellm.completion(
    model=llm["llm"],
    messages=messages,
-     drop_params=True,
+     drop_params=False,  # we do not drop, so typos in the query call are easier to detect!
    **completion_kwargs)
- if via_stream:
+ logger.debug("Received response from litellm")
+ if via_streaming:
    # retrieve the response using streaming, return once we have everything
    try:
        answer = ""
+         logger.debug("Retrieving chunks ...")
+         n_chunks = 0
        for chunk in response:
            choice0 = chunk["choices"][0]
            if choice0.finish_reason == "stop":
-                 logger.debug(f"DEBUG: streaming got stop. Chunk {chunk['index']}: {chunk['value']}")
+                 logger.debug(f"Streaming got stop. Chunk {chunk}")
                break
+             n_chunks += 1
            content = choice0["delta"].get("content", "")
-             logger.debug(f"DEBUG: streaming content: {content}")
+             logger.debug(f"Got streaming content: {content}")
            answer += content
        if return_response:
            ret["response"] = response
+         ret["answer"] = answer
+         ret["n_chunks"] = n_chunks
        ret["cost"] = None
        ret["elapsed_time"] = time.time() - start
        ret["ok"] = True
@@ -808,6 +838,7 @@ def chunk_generator(model_generator, retobj):
logger.debug(f"Full Response: {response}")
llm["_elapsed_time"] += elapsed
ret["elapsed_time"] = elapsed
+ ret["n_chunks"] = 1
if return_response:
    ret["response"] = response
# prevent the api key from leaking out
@@ -825,7 +856,7 @@ def chunk_generator(model_generator, retobj):
    messages=messages,
)
if debug:
-     print(f"DEBUG: cost for this call {ret['cost']}")
+     logger.debug(f"Cost for this call {ret['cost']}")
except Exception as e:
    logger.debug(f"Error in completion_cost for model {llm['llm']}: {e}")
    ret["cost"] = 0.0
@@ -839,7 +870,7 @@ def chunk_generator(model_generator, retobj):
if recursive_call_info.get("cost") is not None:
    ret["cost"] += recursive_call_info["cost"]
if debug:
-     print(f"DEBUG: cost for this and previous calls {ret['cost']}")
+     logger.debug(f"Cost for this and previous calls {ret['cost']}")
if recursive_call_info.get("n_completion_tokens") is not None:
    ret["n_completion_tokens"] += recursive_call_info["n_completion_tokens"]
if recursive_call_info.get("n_prompt_tokens") is not None:
@@ -861,7 +892,7 @@ def chunk_generator(model_generator, retobj):
# TODO: if feasible, handle all tool calling here or in a separate method which does
# all the tool calling steps (up to a specified maximum).
if debug:
-     print(f"DEBUG: checking for tool_calls: {response_message}, have tools: {tools is not None}")
+     logger.debug(f"Checking for tool_calls: {response_message}, have tools: {tools is not None}")
if tools is not None:
    # TODO: if streaming is enabled we need to gather the complete response before
    # we can process the tool calls
@@ -872,17 +903,17 @@ def chunk_generator(model_generator, retobj):
if stream:
    raise ValueError("Error: streaming is not supported for tool calls yet")
if debug:
-     print(f"DEBUG: got {len(tool_calls)} tool calls:")
+     logger.debug(f"Got {len(tool_calls)} tool calls:")
    for tool_call in tool_calls:
-         print(f"DEBUG: {tool_call}")
+         logger.debug(f"Tool call: {tool_call}")
if len(tool_calls) > 0:  # not an empty list
    if debug:
-         print(f"DEBUG: appending response message: {response_message}")
+         logger.debug(f"Appending response message: {response_message}")
    messages.append(response_message)
    for tool_call in tool_calls:
        function_name = tool_call.function.name
        if debug:
-             print(f"DEBUG: tool call {function_name}")
+             logger.debug(f"Tool call {function_name}")
        fun2call = fmap.get(function_name)
        if fun2call is None:
            ret["error"] = f"Unknown tooling function name: {function_name}"
@@ -892,15 +923,15 @@ def chunk_generator(model_generator, retobj):
function_args = json.loads(tool_call.function.arguments)
try:
    if debug:
-         print(f"DEBUG: calling {function_name} with args {function_args}")
+         logger.debug(f"Calling {function_name} with args {function_args}")
    function_response = fun2call(**function_args)
    if debug:
-         print(f"DEBUG: got response {function_response}")
+         logger.debug(f"Got response {function_response}")
except Exception as e:
    tb = traceback.extract_tb(e.__traceback__)
    filename, lineno, funcname, text = tb[-1]
    if debug:
-         print(f"DEBUG: function call got error {e}")
+         logger.debug(f"Function call got error {e}")
    ret["error"] = f"Error executing tool function {function_name}: {str(e)} in {filename}:{lineno} {funcname}"
    if debug:
        logger.error(f"Returning error: {e}")
@@ -914,10 +945,10 @@ def chunk_generator(model_generator, retobj):
    content=json.dumps(function_response)))
# recursively call query
if debug:
-     print(f"DEBUG: recursively calling query with messages:")
+     logger.debug("Recursively calling query with messages:")
    for idx, msg in enumerate(messages):
-         print(f"DEBUG: Message {idx}: {msg}")
-     print(f"DEBUG: recursively_call_info is {recursive_call_info}")
+         logger.debug(f"Message {idx}: {msg}")
+     logger.debug(f"recursive_call_info is {recursive_call_info}")
return self.query(
    llmalias,
    messages,
@@ -929,6 +960,7 @@ def chunk_generator(model_generator, retobj):
    recursive_call_info=recursive_call_info,
    **kwargs)
except Exception as e:
+     logger.debug(f"Exception in query from litellm: {e}")
    tb = traceback.extract_tb(e.__traceback__)
    filename, lineno, funcname, text = tb[-1]
    ret["error"] = str(e) + f" in {filename}:{lineno} {funcname}"