
    rh[                       d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlmZmZ d dlmZmZ d dlmZ d dl
mZ d dlmZmZmZmZ d dlmZ d dlmZ d d	lmZ d dlZd d
l m!Z!m"Z" d dl#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+m,Z,m-Z-m.Z. ddl/m0Z0m1Z1 ddl2m3Z3m4Z4 ddl)m5Z5  e3       rd dl6Z6d dlm7Z7m8Z8m9Z9m:Z:  e%       rd dl;Z; e'       xr  e$       xr  e(       xr  e&       Z<e<rd dl=Z=d dl>m?Z?m@Z@ d dlAmBZB d dlCmDZDmEZE d dlFmGZG d dlHmIZI d dlJmKZK d dlLmMZMmNZNmOZOmPZPmQZQ d dlRmSZS d dlTmUZUmVZVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZc d dldmeZe d dlfmgZgmhZhmiZi  G d deed !      Zj G d" d#eSd !      Zk G d$ d%eId !      Zl ehej      Zm ehek      Zn ehel      Zoh d&Zph d'Zqh d(Zr e4j                  et      Zud)d*d+d,iZv ewevj                               Zy G d- d.ej                        Z{d/efd0Z|d1e}d2d3d4d3fd5Z~ G d6 d7      Z G d8 d9      Ze G d: d;             Z G d< d=e5      Zetd>k(  r e       Zej	                          yy)?    N)ArgumentParser	Namespace)	dataclassfield)BytesIO)Thread)	GeneratorIterableOptionalUnion)
model_info)HF_HUB_OFFLINE)Image)!MODEL_FOR_CAUSAL_LM_MAPPING_NAMES*MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES)is_fastapi_availableis_librosa_availableis_openai_availableis_pydantic_availableis_uvicorn_available   )
AutoConfigLogitsProcessorListPreTrainedTokenizerFastProcessorMixinTextIteratorStreamer)ContinuousBatchingManagerRequestStatus)is_torch_availablelogging   )BaseTransformersCLICommand)AutoProcessorBitsAndBytesConfigGenerationConfigPreTrainedModel)FastAPIHTTPException)CORSMiddleware)JSONResponseStreamingResponse)Transcription)TranscriptionCreateParamsBase)ChatCompletionMessageParam)ChatCompletionChunkChoiceChoiceDeltaChoiceDeltaToolCallChoiceDeltaToolCallFunction)CompletionCreateParamsStreaming)ResponseResponseCompletedEventResponseContentPartAddedEventResponseContentPartDoneEventResponseCreatedEventResponseErrorResponseErrorEventResponseFailedEventResponseInProgressEventResponseOutputItemAddedEventResponseOutputItemDoneEventResponseOutputMessageResponseOutputTextResponseTextDeltaEventResponseTextDoneEvent)ResponseCreateParamsStreaming)	BaseModelTypeAdapterValidationErrorc                       e Zd ZU dZeed<   y))TransformersResponseCreateParamsStreamingz
        OpenAI's ResponseCreateParamsStreaming with an additional field for the generation config (as a json string).
        generation_configN__name__
__module____qualname____doc__str__annotations__     p/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/commands/serving.pyrI   rI   r       	 rS   rI   F)totalc                       e Zd ZU dZeed<   y)+TransformersCompletionCreateParamsStreamingz
        OpenAI's CompletionCreateParamsStreaming with an additional field for the generation config (as a json string).
        rJ   NrK   rR   rS   rT   rX   rX   y   rU   rS   rX   c                   :    e Zd ZU dZeed<   eed<   dZee	   ed<   y)%TransformersTranscriptionCreateParamsz
        OpenAI's TranscriptionCreateParamsBase with an additional field for the generation config (as a json string).
        filerJ   FstreamN)
rL   rM   rN   rO   bytesrQ   rP   r\   r   boolrR   rS   rT   rZ   rZ      s"    	 !&&rS   rZ   >   textuserstorepromptinclude	reasoning
background
truncationtool_choiceservice_tiertop_logprobsmax_tool_callsprevious_response_id>   nstopr`   audiora   logprobsmetadata	functions
modalities
predictionrg   rh   ri   function_callstream_optionsresponse_formatpresence_penaltyreasoning_effortweb_search_optionsparallel_tool_callsmax_completion_tokens>   rb   rc   languagerv   chunking_strategytimestamp_granularitiesqwenz<tool_call>z</tool_call>)startendc                       e Zd ZdZdZdZdZy)ModalityLLMVLMSTTTTSN)rL   rM   rN   r   r   r   r   rR   rS   rT   r   r      s    
C
C
C
CrS   r   argsc                     t        |       S )z~
    Factory function used to instantiate serving server from provided command line arguments.

    Returns: ServeCommand
    )ServeCommand)r   s    rT   serve_command_factoryr      s     rS   reqmodel_generation_configr%   returnc                 `   | j                  d      "t        di t        j                  | d         }nt	        j
                  |      } |j                  di |}|j                         D ]  \  }}|	t        |||        | j                  d      t        | d         |_
        | j                  d      t        | d         |_
        | j                  d      t        | d         |_        | j                  d      
| d   |_        | j                  d      
| d   |_        | j                  d      +t        | d         |_        t        | d         dk(  rd	|_        | j                  d
      t        | d
         |_        | j                  d      t%        j&                  | d          |S )a  
    Creates a generation config from the parameters of the request. If a generation config is passed in the request,
    it will be used as a baseline for parameterization. Otherwise, we will use the model's default generation config.
    Other parameters in the request will be applied on top of the baseline.

    Args:
        req (`dict`):
            The request which may optionally contain generation parameters.
        model_generation_config (`GenerationConfig`):
            The model's default generation config.
        kwargs (`dict`):
            Additional parameters to set in the generation config.

    Returns:
        The prepared `GenerationConfig` object.
    rJ   max_output_tokens
max_tokensfrequency_penalty
logit_biasrm   temperatureg        Ftop_pseedrR   )getr%   jsonloadscopydeepcopyupdateitemssetattrintmax_new_tokensfloatrepetition_penaltysequence_biasstop_stringsr   	do_sampler   torchmanual_seed)r   r   kwargsrJ   non_standard_kwargskvs          rT   !create_generation_config_from_reqr      s   . ww"#/,Ttzz#>Q:R/ST MM*AB2+22<V<#))+ -1=%q!,-
 ww"#/+.s3F/G+H( ww|(+.s</@+A(
ww"#//4S9L5M/N,
ww|(*-l*;'
wwv"),V&
ww})(-c-.@(A%]#$+*/'
www#"'G"5
wwv"#f+&rS   c                       e Zd ZdZd Zd Zy)	ToolStatez7Lightweight class to keep track of the tool call state.c                 $    | j                          y N)resetselfs    rT   __init__zToolState.__init__  s    

rS   c                 <    d| _         d| _        d| _        d| _        y)z>Reset the tool call state (assumes we're outside a tool call).Fr    N)inside_tool_callhas_tool_name_definedarg_nesting_levelbufferr   s    rT   r   zToolState.reset  s!     %%*"!"rS   N)rL   rM   rN   rO   r   r   rR   rS   rT   r   r     s    ArS   r   c            	       F    e Zd ZdZ	 ddddedeed      fdZd	 Zd
 Z	d Z
y)
TimedModelz
    A class that holds a PreTrainedModel instance and its associated processor.
    Automatically deletes the instances after a specified timeout.
    Nmodelr&   timeout_seconds	processor)r   r   c                     || _         t        |j                        | _        || _        || _        t        j                  | j
                  | j                        | _	        | j                  j                          y r   )r   rP   name_or_path_name_or_pathr   r   	threadingTimer_delete_model_timerr   )r   r   r   r   s       rT   r   zTimedModel.__init__-  s[     
 !3!34".ood&:&:D<N<NOrS   c                     | j                   j                          t        j                  | j                  | j
                        | _         | j                   j                          y)z2Reset the timer for the deletion of the instances.N)r   cancelr   r   r   r   r   r   s    rT   reset_timerzTimedModel.reset_timer:  s@    ood&:&:D<N<NOrS   c                 Z   t        | d      r| j                  | `| `d| _        d| _        t        j                          t
        j                  j                         rt
        j                  j                          t        j                  | j                   d| j                   d       yyy)z>Delete the wrapped model and processor and clean up resources.r   Nz was removed from memory after z seconds of inactivity)hasattrr   r   gccollectr   cudais_availableempty_cacheloggerinfor   r   r   s    rT   r   zTimedModel._delete_model@  s    4!djj&<
DJ!DNJJL zz&&(

&&(KK%%&&EdFZFZE[[qr '=!rS   c                 <    t        | d       xs | j                  du S )z)Check if the instances have been deleted.r   N)r   r   r   s    rT   
is_deletedzTimedModel.is_deletedQ  s     4))?TZZ4-??rS   r   )rL   rM   rN   rO   r   r   r   r   r   r   r   rR   rS   rT   r   r   '  sJ     SW	   E"MNO	"@rS   r   c                   V   e Zd ZU dZ edddi      Zeed<    eddg dd	      Ze	e   ed
<    edddi      Z
eed<    edddi      Ze	e   ed<    edddi      Zeed<    edddi      Zeed<    eddddgd	      Zeed<    edddi      Zeed<    edddi      Zeed<    edddi      Zeed <    ed!dd"i      Zeed#<    ed$dd%i      Zeed&<    eddd'i      Ze	e   ed(<    eddd)i      Zeed*<    eddd+i      Zeed,<    eddd-i      Ze	e   ed.<   y)/ServeArgumentsz
    Arguments for the serve CLI.

    See the metadata arg for each argument's description -- the metadata will be printed with
    `transformers serve --help`
    autohelpzfDevice to use for inference; will default to `auto` andplace the model on an accelerator if available.)defaultrp   devicezOverride the default `torch.dtype` and load the model under this dtype. If `'auto'` is passed, the dtype will be automatically derived from the model's weights.)r   bfloat16float16float32)r   choicestorch_dtypeFz2Whether to trust remote code when loading a model.trust_remote_codeNzWhich attention implementation to use; you can run --attn_implementation=flash_attention_2, in which case you must install this manually by running `pip install flash-attn --no-build-isolation`.attn_implementationzIWhether to use 8 bit precision for the base model - works only with LoRA.load_in_8bitzIWhether to use 4 bit precision for the base model - works only with LoRA.load_in_4bitnf4zQuantization type.fp4bnb_4bit_quant_typez#Whether to use nested quantization.use_bnb_nested_quant	localhostz$Interface the server will listen to.hosti@  zPort the server will listen to.porti,  z@Time in seconds after which a model will be removed from memory.model_timeoutr   z8Logging level as a string. Example: 'info' or 'warning'.	log_levelz1The default seed for torch, should be an integer.default_seedztWhether to enable CORS. Some apps that make requests from external domains (e.g. Cursor) require CORS to be enabled.enable_corsz+Whether to turn on strict input validation.input_validationzName of the model to be forced on all requests. This is useful for testing Apps that don't allow changing models in the request.force_model)rL   rM   rN   rO   r   r   rP   rQ   r   r   r   r^   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rR   rS   rT   r   r   V  s
     >
FC  "'PA
"K#  $)] ^t  */ r
*#  efL$  efL$   %UFZhmotgu=vww!&uHm?n!o$o kV=c4deD#edf6W-XYD#Y\]M3  &*d!eIs  #(([\#L(3-  &
K  #B
d  "'2
"K# rS   r   c                   D   e Zd Zedefd       ZdefdZdedddd	d
e	fdZ
defdZdefdZdefdZ	 	 	 	 	 	 d0dee   dee   dee   dee   dee   deed      defdZdddefdZd Z ej*                  d      deeeef      fd       Zdedeeddf   fd Zededefd!       Zed"efd#       Zdedeeddf   fd$Zdedeeddf   fd%Zdedeeddf   fd&Z dede!fd'Z"ededed(   fd)       Z#d*edefd+Z$d,efd-Z%d,ede&ee'f   fd.Z(d,ede&ee)f   fd/Z*y)1r   parserc                 d    t         f}| j                  d|      }|j                  t               y)z
        Register this command to argparse so it's available for the transformer-cli

        Args:
            parser: Root parser to register command-specific arguments
        serve)dataclass_types)funcN)r   
add_parserset_defaultsr   )r   r   serve_parsers      rT   register_subcommandz ServeCommand.register_subcommand  s3     *+((/(R!!'<!=rS   r   c                    t         st        d      || _        | j                  j                  dk(  | _        | j                  j
                  | _        | j                  j                  )t        j                  | j                  j                         t        j                  d      }|j                  t        j                  | j                  j                  j                                   t        j                  d      }|j                  t        j                  | j                  j                  j                                   i | _        d | _        d | _        d | _        d | _        y )NzaMissing dependencies for the serving CLI. Please install with `pip install transformers[serving]`
sdpa_pagedtransformersz+transformers.generation.continuous_batching)serve_dependencies_availableImportErrorr   r   use_continuous_batchingr   r   r   r   r    
get_loggersetLevel
log_levelsr   lowerloaded_models#running_continuous_batching_managerlast_messageslast_kv_cache
last_model)r   r   transformers_logger	cb_loggers       rT   r   zServeCommand.__init__  s   +s 
 	'+yy'D'D'T$990099!!-dii445 &00@$$W%7%7		8K8K8Q8Q8S%TU&&'TU	7--dii.A.A.G.G.IJK 57X\0 "!rS   requestschema_TypedDictMeta	validatorrF   unused_fieldsc                 0   t         j                  d|        t        |j                               }|j                  }||z
  }|r(t         j                  d|        t        dd|       | j                  j                  rB	 |j                  |       ||z  }	|	r(t         j                  d|	        t        dd|	       yy# t        $ rF}t         j                  d|j                                 t        d|j                               d}~ww xY w)a  
        Validates the request against the schema, and checks for unexpected keys.

        Args:
            request (`dict`):
                The request to validate.
            schema (`_TypedDictMeta`):
                The schema of the request to validate. It is a `TypedDict` definition.
            validator (`TypeAdapter`):
                The validator to use to validate the request. Built from `schema`.
            unused_fields (`set`):
                Fields accepted by `schema`, but not used in `transformers serve`.

        Raises:
            HTTPException: If the request is invalid or contains unexpected or unused fields.
        zValidating request: z Unexpected keys in the request: i  )status_codedetailzValidation error: NzUnused fields in the request: )r   debugsetkeys__mutable_keys__errorr(   r   r   validate_pythonrG   errors)
r   r  r  r  r  
input_keyspossible_keysunexpected_keyseunused_fields_in_requests
             rT   _validate_requestzServeCommand._validate_request  s   . 	+G956 (
//$}4LL;O;LMNC:Z[jZk8lmm99%%H))'2 (2M'A$'=>V=WXY# #.LMeLf,g  ( & # H1!((*>?#AHHJGGHs   C 	DADDc                 F    | j                  |t        t        t               y N)r  r  r  r  )r   rI   response_validatorUNUSED_RESPONSE_FIELDSr   r  s     rT   validate_response_requestz&ServeCommand.validate_response_request  s!    <(0	 	 	
rS   c                 F    | j                  |t        t        t               y r"  )r   rX   completion_validatorUNUSED_CHAT_COMPLETION_FIELDSr%  s     rT    validate_chat_completion_requestz-ServeCommand.validate_chat_completion_request  s!    >*7	 	 	
rS   c                 F    | j                  |t        t        t               y r"  )r   rZ   transcription_validatorUNUSED_TRANSCRIPTION_FIELDSr%  s     rT   validate_transcription_requestz+ServeCommand.validate_transcription_request  s!    8-5	 	 	
rS   N
request_idcontentr   rolefinish_reason
tool_callsr2   r   c                     t        |t        t        j                               |t        t	        |||      d|      gdd      }d|j                  d	       d
S )a  
        Builds a chunk of a streaming OpenAI Chat Completion response.

        IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps,
        like Cursor, assume that when the field exists, it has data.

        Args:
            request_id (`str`):
                The request ID.
            content (`str`, *optional*):
                Content of the response from the model.
            model (`str`, *optional*):
                The model that generated the content.
            role (`str`, *optional*):
                The role of the next content, until a new role is defined.
            finish_reason (`str`, *optional*):
                The reason the generation by the model has finished.
            tool_calls (`list[ChoiceDeltaToolCall]`, *optional*):
                Data about the tool calls, when they are triggered.

        Returns:
            `str`: The built chunk, a string containing a JSON string with the payload.
        )r0  r1  r3  r   )deltaindexr2  r   zchat.completion.chunk)idcreatedr   r   system_fingerprintobjectdata: Texclude_none

)r/   r   timer0   r1   model_dump_json)r   r/  r0  r   r1  r2  r3  chunks           rT   build_chat_completion_chunkz(ServeCommand.build_chat_completion_chunk'  sq    @ $		$% '!#-
 "/
  "*!
$ --4-@AFFrS   responserE   c                 .    d|j                  d       dS )a  
        Builds a event of a streaming OpenAI Response response.

        IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps,
        like Cursor, assume that when the field exists, it has data.

        Args:
            response (`BaseModel`):
                The response to build an event from. One of the multiple OpenAI Response output types

        Returns:
            `str`: The built chunk, a string containing a JSON string with the payload.
        r;  Tr<  r>  )r@  )r   rC  s     rT   build_response_eventz!ServeCommand.build_response_event[  s"     00d0CDDIIrS   c                 4    t               } j                  r|j                  t        dgddgdg       |j	                  d      dt
        f fd       }|j	                  d      dt
        f fd       }d	d
lm} |j	                  d      d|f fd       }|j                  d      |j                  d       fd              }t        j                  | j                  j                   j                  j                   j                  j                         y )N*T)allow_originsallow_credentialsallow_methodsallow_headersz/v1/chat/completionsr  c                     j                  |        j                  rj                  |       }nj                  |       }t	        |d      S Nr  text/event-stream
media_type)r*  r   #continuous_batching_chat_completiongenerate_chat_completionr+   r  outputr   s     rT   chat_completionz)ServeCommand.run.<locals>.chat_completiony  sK    11'1B++AA'J66w?$V8KLLrS   z/v1/responsesc                 d    j                  |        j                  |       }t        |d      S rM  )r&  generate_responser+   rT  s     rT   	responsesz#ServeCommand.run.<locals>.responses  s2    **7*;++G4F$V8KLLrS   r   )Requestz/v1/audio/transcriptionsc           
        K   | j                         4 d {   }t        |d   j                          d {   |d         }t        j	                  d|d   j
                   d|d   j                   d|d   j                  dz  dd	       d d d       d {    j                  
       j                  |      }t        |d      S 7 7 7 8# 1 d {  7  sw Y   HxY ww)Nr[   r   )r[   r   zReceived file: z; MIME type: z; size:    z.2fz KiBrN  rO  rP  )formrZ   readr   r  filenamecontent_typesizer.  generate_transcriptionr+   )r  r]  parsed_requestrU  r   s       rT   audio_transcriptionsz.ServeCommand.run.<locals>.audio_transcriptions  s      ||~ 	 	!F#F|0022w-"
 %d6l&;&;%<M$v,JcJcId e!&\..5c:$@	 	 ///G00@F$V8KLL	2	 	 	 	sU   C+CC+CC
ACC+C5C+CC+C(CC($C+z
/v1/modelsc                  <    t        d j                         d      S )Nlist)r:  data)r*   get_gen_modelsr   s   rT   get_all_modelsz(ServeCommand.run.<locals>.get_all_models  s      64;N;N;P QRRrS   )r   r   r   )r'   r   add_middlewarer)   postdictfastapirZ  optionsr   uvicornrunr   r   r   r   )r   apprV  rY  rZ  rd  ri  s   `      rT   rp  zServeCommand.runk  s   i "e"&"e"e   
(	)	MT 	M 
*	M 
/	"	Mt 	M 
#	M 	$	,	-	M 	M 
.	M" 
\	"			S 
 
#	S 	Cdiinn499>>TYYM`M`arS   )maxsizec           	         g d}t         rQ|D cg c]E  }|dt        j                  j                         j                         |j	                  d      d   dG c}S |D cg c]  }t        |       }}|D cg c]5  }|j                  d|j                  j                         |j                  d7 c}S c c}w c c}w c c}w )a.  
        This is by no means a limit to which models may be instantiated with `transformers serve`: any chat-based
        model working with generate can work.

        This is a limited list of models to ensure we have a discoverable /v1/models endpoint for third-party
        integrations.
        )zMenlo/Jan-nanozMenlo/Jan-nano-128kzQwen/Qwen2.5-0.5B-InstructzQwen/Qwen2.5-3B-InstructzQwen/Qwen2.5-7B-InstructzQwen/Qwen2.5-14B-Instructz meta-llama/Llama-3.1-8B-Instructz meta-llama/Llama-3.2-1B-Instructz!meta-llama/Llama-3.3-70B-InstructzHuggingFaceTB/SmolVLM-Instructz!ibm-granite/granite-vision-3.2-2bzQwen/Qwen2.5-VL-7B-Instructr   /r   )r7  r:  r8  owned_by)	r   datetimenow	timestampsplitr   r7  
created_atauthor)r   modelsr   model_infoss       rT   rh  zServeCommand.get_gen_models  s    
  $   %'00446@@B %C 0 3	  ;AA:e,AKA )   ((%$//99; %	  Bs   A
B5 B:8:B?r   c                    	  j                  d         		 j                  k7  }	 _        |r0 j                  $ j                  j                  dd       d _         j	                  	      \  }}t        |d      r|j                  n|}t        |j                  |j                  |j                  ddd	dd
d
       j                  K|j                  d       _        t                j                  _         j                  j                          |j                  d   dd      j!                  |j"                        }	 fd} ||d         S )a'  
        Generates an OpenAI Chat Completion using continuous batching.

        Args:
            req (`dict`): The request to generate an OpenAI Chat Completion for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks.
        r   NTr   )blocktimeout	tokenizerFr!   r\  
   fifo)	r   eos_token_idpad_token_id	use_cache
num_blocks
block_sizer   max_batch_tokens	scheduler)rJ   	streamingmessagespt)return_tensorsadd_generation_promptc              3     K   	 	j                   j                  | j                  d      j                        }d}	j	                  |d       	j                   D ]  }|j
                  |k7  rj                  d      "|s |j                  t        j                  k(  rDd}|j                  t        j                  k(  rdnd }|j                  t        j                  k(  r	j	                  ||        y 	j	                  ||j                  	        y # t        $ r9}t        j                  t        |             d
t        |       d Y d }~y d }~ww xY ww)Nr/  )r/  r   F	assistantr1  r   Trm   r2  r   )r/  r0  r   data: {"error": ""})r  add_requestr   r   rB  r/  statusr   FINISHED
next_token	Exceptionr   r  rP   )
_inputsr/  queue_is_flushedresultr2  r  rJ   model_id_and_revisionr   r   s
         rT   stream_chat_completionzPServeCommand.continuous_batching_chat_completion.<locals>.stream_chat_completion  s_    !7!EEQQ(=N_NnNn R 
 $)  66z[p6qq"FF F((J6 ww|,8AQ!==M,B,BB$/3,.4mm}?U?U.UF[_M}}(>(>>">>&mK` ?   ">>'16;L;LTi ?  !(  7SV$*3q6(#667s;   E$C6D ;E$<"D E$	E!(/EE$E!!E$r   )process_model_namer	  r  rm   load_model_and_processorr   r  r   rJ   r  r  init_continuous_batchingr   logit_processorr   apply_chat_templatetor   )
r   r   must_discard_cacher   r   r  inputsr  rJ   r  s
   ``      @@rT   rR  z0ServeCommand.continuous_batching_chat_completion  sc    !% 7 7G E2dooE/77C88==DRS=T;?8889NOy+29k+JI''PY	=$)$;$;"//"//
 33;7<7U7U"3t 8V 8D4 H[G\D44D44::< ..s:tko.pssLL
"	7H &fQi00rS   c                     | j                   j                  }|t        j                         v rt        j
                  }|S |t        j                         v rt        j                  }|S t        d|       )NzUnknown modality: )		__class__rL   r   valuesr   r   r   r   
ValueError)r   model_classnamemodalitys      rT   get_model_modalityzServeCommand.get_model_modality,  sm    //22HOOQQ||H   A H H JJ||H  1/1BCDDrS   r  c           	         g }| D ]~  }|d   g d}|t         j                  k(  r't        |d   t              r|d   n|d   d   }||d<   n(|t         j                  k(  rt        |d   t              r|d   j                  d|d   d       n|d   D ]  }|d   dk(  r|d   j                  |        |d   dk(  s)d|d   d	   v rt        j                  d
d|d   d	         }t        j                  t        t        j                  |                  }t        j                  dd      }|j                  }	|j!                  |j                         n|d   d	   }	|d   j                  d|	d        |j                  |        |S )Nr1  r1  r0  r0  r_   )typer_   r  	image_urlbase64urlz^   z.pngF)suffixdeleteimage)r  r  )r   r   
isinstancerP   r   appendresubr   openr   r  	b64decodetempfileNamedTemporaryFilenamesave)
r  r  processor_inputsmessageparsed_messager0  
image_datar  r[   r  s
             rT   *get_processor_inputs_from_inbound_messagesz7ServeCommand.get_processor_inputs_from_inbound_messages8  s    	4G&-fo"EN8<<'0:79;Ms0S'),Y`ajYklrYs,3y)X\\) gi0#6"9-44fgV_N`5ab#*9#5 \"6?f4*95<<WE$V_;'7;+?+FF-/VV4LbRYZeRfglRm-n
(-

76;K;KJ;W3X(Y'/'B'B&Y^'_&*ii %

499 5&-k&:5&A*95<<gVY=Z[\  ##N3=	4>  rS   c                      j                   j                   j                   j                  |d<   |d   }|d   d   dk(  ry j                  |d          j                  k7  } _         j	                        \  } j                        } j                  ||      }dt        D ]/  }|j                  j                  d   j                         v s-| n |j                  |d|j                  d	d      d
dd      }|j                  j                        }|j                  dd      d}	dj                  j                  d   j                         v rd}	t        ||	d      }
t!        |j"                        }d} j%                  |      r|s j&                  }i ||
|d|d fd} ||
      S )a  
        Generates an OpenAI Chat Completion using `generate`.

        Args:
            req (`dict`): The request to generate an OpenAI Chat Completion for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks.
        Nr   r  r1  r  r   Ttoolsr  )r  r  r  return_dicttokenizer/  req_0gptossFskip_special_tokensskip_promptr   )streamerrJ   return_dict_in_generatepast_key_valuesc              3     K   d}d }dj                   j                  d   j                         v rd}d}fd}t        |      }d}	 |j	                          t               }j                  d	
       | D ]2  }dj                   j                  d   j                         v r |j                  d      r|d t        d        }||z  }|r||v rd}[\|j                         t           d   k(  rd|_        |j                         t           d   k(  r(|j                          j                  |d d       |j                  r@|xj                  |z  c_        |j                  sYt        j                   d|j                        }	|	|	j#                  d      }	d|_        t%        t'        |	      dd|dz         }
n|dk(  rWd|j                  vrg|xj(                  |j+                  d      z  c_        |xj(                  |j+                  d      z  c_        |j(                  dk  r&dj-                  |j/                  d      d d       dz   }t%        t'        |      dd      }
j                  |d |
g       |dk7  sj                  ||       5 j                  |d       |j-                          |j-                          y # t0        $ r9}t2        j5                  t7        |             d t7        |       d! Y d }~Nd }~ww xY w# |j-                          w xY ww)"NFr  r   T<|channel|>final<|message|>c                  L     j                   di | }|j                  _        y NrR   generater  r  r   generate_outputr   r   s     rT   generate_with_cachezbServeCommand.generate_chat_completion.<locals>.stream_chat_completion.<locals>.generate_with_cache  $    "0%..":6":%4%D%D"rS   targetr   r   r  r  
<|return|>r   r   r3  )r/  r1  r2  r   z\"name\": \"(.*?)\"r!   )r  function
_tool_call)r  r6  r  r7  z"arguments": {{})	arguments)r  r6  r  )r/  r1  r3  r   )r0  r   rm   r  r  r  )configarchitecturesr  r   r   r   rB  endswithlenstrip_TOOL_CALL_TOKENSr   r   r   r   r  searchgroupr2   r3   r   countjoinry  r  r   r  rP   )r  _request_id
filter_cotcot_trace_endr  threadresults
tool_stater  	tool_nametoolr  generation_kwargsr   r  r/  r   tool_model_familys               rT   r  zEServeCommand.generate_chat_completion.<locals>.stream_chat_completion  s_     J M5<<55a8>>@@!
 =E #6?PQFGh&[
 66z[p6qq& WF5<<#=#=a#@#F#F#HH!??<8%+,@s</@.@%AFv%G "(G3).J$$ )4!<<>->?P-QRY-ZZ:>J7$ "<<>->?P-QRW-XX&,,."&"B"B+6%).:&;	 #C #  %%66&--7- $.#C#C,.II6LjN_N_,`	#,#4$,090BICG
 @':-Hi-X*+)3'2\'A	(" $*R<$, $4:;L;L#L$, !+ < <S@Q Q < * < <S@Q Q <#-#?#?!#C-/WWV\\#5Fs5K-Ls-RF':-HSY-Z*+)3(" #'"B"B+6Ttf\q #C #  % |">>'?T ?  kWp 66{RX`u6vv   7SV$*3q6(#667
 sC   AMIK* ?K* M*	L,3/L'"L/ 'L,,L/ /MM)r   r   r  r	  r  r  r  _MODELS_WITH_TOOL_SUPPORTr  r  r  r  r   r  r   r   r   rJ   is_continuationr  )r   r   r  r  r   r  r  supported_model_familiesr  r  generation_streamerrJ   r  r  r  r   r  r/  r  s   `             @@@@@rT   rS  z%ServeCommand.generate_chat_completion]  s    99  ,9900CL9<Z B<;. $ 7 7G E2dooE/889NOy**51JJ8U]^ !(A 	$'5<<+E+Ea+H+N+N+PP$<!	 .."&'''4( / 
 5<<(WW\73
 #u||11!4::<<"'2 3

 >c[`[r[rs$-? ..M

+!2'+,
y	 y	v &&9:FFrS   c                    
  j                  d          j                  k7  } _         j                        \  }t        d   t              r'dv r	dd   dgng }|j                  dd   d       nt        d   t              r8dv r.d   d   d   dk7  rdd   dgd   }nYd   }d   |d   d	<   nHd   }nBt        d   t              r$dv r	dd   dgng }|j                  d          nt        d
      |j                  |dd      }|j                  j                        }j                  dd      d}dj                  j                  d   j                         v rd}t!        ||d      }t#        j$                        }d} j'                        r|s j(                  }|t+        j,                  |      ||d|d

 fd}	 |	|      S )a	  
        Generates an OpenAI Response using `generate`.

        Args:
            req (`dict`): The request to generate an OpenAI Response for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Response events.
        r   inputinstructionssystemr  r`   r   r1  r0  z%inputs should be a list, dict, or strTr  )r  r  rk   r  r  Fr  r  N)r  attention_maskr  rJ   r  r  c              3   L	  K   d}d }dj                   j                  d   j                         v rd}d}fd}t        |      }d}d}d}	 |j	                          t        j
                         }	t        d|t        d	 |	d
j                  d      dddiidg g j                  dd      dj                  d                  }
|dz  }j                  |
       t        d|t        d	 |	dj                  d      dddiidg g j                  dd      dj                  d                  }|dz  }j                  |       t        d||t        d dddg             }|dz  }j                  |       t        dd |||t        dd g !      "      }|dz  }j                  |       d }| D ]  }dj                   j                  d   j                         v r |j                  d#      r|d t!        d#        }||z  }|r
||v rd}d }\]t#        d$d ||||d d%d&g'      }|dz  }j                  |        t%        d(d ||d|d d%d&g)      }|dz  }j                  |       t'        d*d |||t        d|j(                  g !      "      }|dz  }|dz  }j                  |       t+        d+||t        d dd,d|j,                  gg -            }|dz  }|dz  }j                  |       t/        d.|t        d	 |	d,j                  d      dddii|j0                  gdg j                  dd      dj                  d      /            }|dz  }j                  |       |j3                          |j3                          y # t4        $ r}t6        j9                  d0t;        |              t=        d1|t;        |      2      }|dz  }j                  |       t?        d3|t        d	 	d4j                  d      dddiig dg ddj                  d      tA        d5t;        |      6      7            }|dz  }j                  |       Y d }~d }~ww xY w# |j3                          w xY ww)8NFr  r   Tr  c                  L     j                   di | }|j                  _        y r  r  r  s     rT   r  zTServeCommand.generate_response.<locals>.stream_response.<locals>.generate_with_cachen  r  rS   r  zresponse.createdresp_queuedr  formatr  r_   rC  rz   r   rp   )r7  rz  r  r   r  r_   r:  r  rU  rz   rg   rp   )r  sequence_numberrC  r!   zresponse.in_progressin_progresszresponse.output_item.addedmsg_r  r  )r7  r  r  r1  r0  )r  r
  output_indexitemzresponse.content_part.addedoutput_textr   )r  r_   annotations)r  item_idr
  r  content_indexpartr  zresponse.output_text.deltagX@)tokenlogprob)r  r  r
  r  r  r5  ro   zresponse.output_text.done)r  r  r
  r  r  r_   ro   zresponse.content_part.donezresponse.output_item.done	completed)r7  r  r  r1  r0  r  zresponse.completed)r7  rz  r  r   r  r_   rU  r:  r  rz   rg   rp   z"Exception in response generation: r  )r  r
  r  zresponse.failedfailedserver_error)coder  )r7  rz  r  r   r  r_   rU  r:  r  rz   rg   rp   r  )!r  r  r  r   r   r?  r9   r5   r   rE  r=   r>   r@   r7   rA   r  r  rB   rC   r8   r_   r?   r  r6   r  r  r  r   r  rP   r;   r<   r:   )r  r  r  r  r  r  r
  r  r  rz  response_createdresponse_in_progressresponse_output_item_addedresponse_content_part_addedr  r  response_output_text_deltaresponse_output_text_doneresponse_content_part_doneresponse_output_item_doneresponse_completedr  error_eventresponse_failedr  r   r  r   r/  r   s                           rT   stream_responsez7ServeCommand.generate_response.<locals>.stream_responsed  s     J M5<<55a8>>@@!
 =E #6?PQFOLMN!YY[
 $8+$3%":,/#-'3%(WW^%<&(89) !,/GG4I5,Q$*!$!4$ $  1$//0@AA'>/$3%":,/#-,3%(WW^%<&(89) !,/GG4I5,Q$*!$!4($$  1$//0DEE .J5$3!-.!*.Y}[fpr	.*  1$//0JKK /L6":,/$3!-"/+RUWX/+  1$//0KLL & PF5<<#=#=a#@#F#F#HH!??<8%+,@s</@.@%AFv%G "(G3).J&(G$$1G9"&zl 3(7%1&3$,.4"@!A2. $q(O334NOO5P: -B4":,/$3!-"# (*t<=-)  1$//0IJJ .J5":,/$3!-"/+E^EcEcqst.*  1$"//0JKK -H4$3!-.!*.&*(!;!@!@ A$&	-)  1$!//0IJJ &<-$3%":,/#-*3%(WW^%<&(89 9 > >?) ,/GG4I5,Q$*!$!4&"$  1$//0BCCJ I  !AA#a&JK0 $3F
  1$//<<"5*$3%":,/#-'3%(WW^%<&(89!) ,1$*!$!4+!/$'F#,  1$//@@C!AH s>   AR$MN4 #R$4	R=CRR RR R!!R$)r  r	  r  r  rP   r  rf  rl  r  r  r  r   r   r  r  r  r   r   rJ   r  r  r   	ones_like)r   r   r  r   r  r  r  rJ   r  r%  r  r   r  r/  s   ``        @@@@rT   rX  zServeCommand.generate_response%  s1    !% 7 7G E2dooE/889NOyc'lC(M[_bMbxC4GHIhjFMM6c'lCDGd+$w<?6*h6'/C<OP`SVW^S_`F \F+.~+>F1Ii(WGd+M[_bMbxC4GHIhjFMM#g,'DEE..vTbf.g5<<(WW3W=
 #u||11!4::<<"'2 3

 >c[`[r[rs$-? ..M #oof5+!2'+,
a	 a	F 2J??rS   c                 (  
 t               st        d      | j                  |d         }| j                  |      \  t	        j
                  dd      }t        |j                        }j                  j                  }t        j                  |d         }t        j                  ||d      \  }} ||d	      j                  j                        

d
   j                  j                         
d
<   ||dd
fd}	 |	       S )a  
        Generates an OpenAI Transcription using the audio file.

        Args:
            req (`dict`): The request containing the audio file and model information.

        Returns:
            `Generator[str, None, None]`: A generator that yields the transcription result.
        z]Missing librosa dependency for audio transcription. Please install with `pip install librosa`r   Tr  r  r[   )srmonor  )sampling_rater  input_features)r  rJ   r  c               3      K    j                   di } j                  | j                  d      d   }t        |      }|j	                  d        y w)NT)r  r   )r_   r<  rR   )r  batch_decode	sequencesr,   r@  )generated_idstranscription_texttranscriptionaudio_inputsaudio_modelaudio_processorr  s      rT   _generate_transcriptionzDServeCommand.generate_transcription.<locals>._generate_transcriptionq  sg     0K00U<UCTUM!0!=!=m>U>Uko!=!pqr!s)/ABM"222EFGs   AA)r   r   r  load_audio_model_and_processorr   r  r   rJ   feature_extractorr*  ior   librosaloadr  r   dtype)r   r   r  r  rJ   model_sampling_rateaudio_bytesaudio_array_r5  r2  r3  r4  r  s             @@@@rT   rb  z#ServeCommand.generate_transcriptionI  s"    $%o  !% 7 7G E'+'J'JK`'a$_2%%4T
 >)F)F

 .??MMjjV- k6IPTUQ&{BUfjknn
 *66F)G)J)J;K\K\)]%& ,!2'+
	H '((rS   c                 >   |j                  d      xs |j                  d      }d}| j                  d}n`t        | j                        t        |      k\  rd}n<t        t        | j                              D ]  }| j                  |   ||   k7  sd} n || _        |S )aD  
        Determines whether the current request is a continuation of the last request. In other words, if it is the
        same chat session.

        Args:
            req (`dict`): The request to check.

        Returns:
            `True` if the request is a continuation of the last request, `False` otherwise.
        r  r  TF)r   r  r  range)r   r   r  req_continues_last_messagesis        rT   r  zServeCommand.is_continuationy  s     77:&:#'''*:&*# %*/'##$H5*/' 3t1123 %%a(HQK727/
 &**rS   r$   c                     | j                   r:t        d| j                  | j                  | j                  | j                        }|S | j
                  rt        d      }|S d}|S )a  
        Returns the quantization config for the given CLI arguments.

        Args:
            args (`ServeArguments`): The serve arguments. May contain quantization settings, device, etc.

        Returns:
            `Optional[BitsAndBytesConfig]`: The quantization config.
        T)r   bnb_4bit_compute_dtyper   bnb_4bit_use_double_quantbnb_4bit_quant_storage)r   N)r   r$   r   r   r   r   )r   quantization_configs     rT   get_quantization_configz$ServeCommand.get_quantization_config  sv     "4!'+'7'7$($<$<*.*C*C'+'7'7# #" "4!# #" #'""rS   model_idc                 p    | j                   j                  | j                   j                  }d|v r|S | dS )aR  
        Applies the `force_model` CLI argument and canonicalizes the model name to the format "model_id@revision".
        If the model_id DOESN'T contain an @, it defaults to "model_id@main".

        Args:
            model_id (`str`): The model ID.

        Returns:
            `str`: The canonicalized model name to be used
        @z@main)r   r   )r   rJ  s     rT   r  zServeCommand.process_model_name  s<     99  ,yy,,H(?O5!!rS   r  c                    | j                   }t        j                  d|        d|v r|j                  dd      \  }}n|d}}t	        j
                  |||j                        }|j                  dv r|j                  nt        t        |j                        }| j                  |      }||j                  |d|j                  d}|||d
<   t        j
                  |fi |}	t        t        |	j                  d         }
 |
j
                  |fi |}t        |dd	      |j                  |j                         }|j"                  j$                  d	u xr |j"                  j&                  dk(  }|j"                  j$                  d	uxr |j"                  j$                  dk  }|s|rd|j"                  _        t        j                  d|        ||fS )a  
        Generic method to load a model and a data processor from a model ID and revision, making use of the serve CLI
        arguments.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.
            model_cls (`type[PreTrainedModel]`):
                The model class to load.

        Returns:
            `tuple[PreTrainedModel, Union[ProcessorMixin, PreTrainedTokenizerFast]]`: The loaded model and
            data processor (tokenizer, audio processor, etc.).
        zLoading rL  r!   main)revisionr   )r   Nr   )rO  r   r   
device_mapr   NrH  r   hf_device_map   r\  zLoaded model )r   r   r   ry  r#   from_pretrainedr   r   getattrr   rI  r   r   r   r  r  r   rJ   r   
max_length)r   r  r   rJ  rO  data_processorr   rH  model_kwargsr  architecturer   has_default_max_lengthhas_short_max_new_tokenss                 rT   _load_model_and_data_processorz+ServeCommand._load_model_and_data_processor  s    yyh4567''!6!<!<S!!DHh!6hH&66"44
 +/*:*:n*Ld&&RYZ_aeaqaqRr"::4@ !#'#;#;& !%!7!7
 *2EL./++HEE|V-A-A!-DE,,,XFF5/408HHT[[)E ##22d:gu?V?V?a?aeg?g 	 ##22$>p5CZCZCiCilpCp 	! "%=59E##2m$9#:;<n$$rS   c                    || j                   vs| j                   |   j                         rG| j                  |      \  }}t        || j                  j
                  |      | j                   |<   ||fS | j                   |   j                          | j                   |   j                  }| j                   |   j                  }||fS )a\  
        Loads the text model and processor from the given model ID and revision into the ServeCommand instance.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.

        Returns:
            `tuple[PreTrainedModel, PreTrainedTokenizerFast]`: The loaded text model and processor.
        r   r   	r  r   r[  r   r   r   r   r   r   )r   r  r   r   s       rT   r  z%ServeCommand.load_model_and_processor  s     !(:(::d>P>PQf>g>r>r>t#BBCXYE98B $		 7 7#9D45 i	 45AAC&&'<=CCE**+@AKKIirS   c                    || j                   vs| j                   |   j                         rG| j                  |      \  }}t        || j                  j
                  |      | j                   |<   ||fS | j                   |   j                          | j                   |   j                  }| j                   |   j                  }||fS )aU  
        Loads the audio model and processor from the given model ID and revision into the ServeCommand instance.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.

        Returns:
            `tuple[PreTrainedModel, ProcessorMixin]`: The loaded audio model and processor.
        r]  r^  )r   r  r3  r4  s       rT   r6  z+ServeCommand.load_audio_model_and_processor  s     !(:(::d>P>PQf>g>r>r>t+/+N+NOd+e(K8B $		 7 7)9D45 O++	 45AAC,,-BCIIK"001FGQQOO++rS   )r   NNNNN)+rL   rM   rN   staticmethodr   r   r   r   rl  r  r   r&  r*  r.  r   rP   rf  rB  rE  rp  	functools	lru_cacheanyrh  r	   rR  r&   r   r  r  rS  rX  rb  r^   r  rI  r  r[  tupler   r  r   r6  rR   rS   rT   r   r     s   	>N 	> 	>^ @// !/ !	/
 /b
 

 

d 
 %'!%#"'+<@2GSM2G #2G }	2G
 sm2G  }2G T"7892G 
2GhJ[ JS J 8bt Y&+T#s(^ 4 + '+ZW1t W1	#tUY/@Z W1r 	/ 	h 	 	 " x "  " HFGD FGYsD$5O FGPb@T b@iT4.H b@H	.)$ .)9S$_3M .)`+4 +D +< #n #BV9W # #8"3 "3 "";%C ;%z c  eO]tLtFu  2,C ,ERacqRqLr ,rS   r   __main__)r  r   rv  enumra  r   r8  r   r  r  r   r?  argparser   r   dataclassesr   r   r   r   typingr	   r
   r   r   huggingface_hubr   huggingface_hub.constantsr   PILr   r   &transformers.models.auto.modeling_autor   r   transformers.utils.import_utilsr   r   r   r   r   r   r   r   r   r   r   generation.continuous_batchingr   r   utilsr   r    r"   r   r#   r$   r%   r&   r9  r   ro  rm  r'   r(   fastapi.middleware.corsr)   fastapi.responsesr*   r+    openai.types.audio.transcriptionr,   .openai.types.audio.transcription_create_paramsr-   openai.types.chatr.   'openai.types.chat.chat_completion_chunkr/   r0   r1   r2   r3   *openai.types.chat.completion_create_paramsr4   openai.types.responsesr5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   -openai.types.responses.response_create_paramsrD   pydanticrE   rF   rG   rI   rX   rZ   r#  r(  r,  r$  r)  r-  r  rL   r   r  rf  r  r  Enumr   r   rl  r   r   r   r   r   r   rp  rR   rS   rT   <module>r|     sU        	 	  	    . (   7 7 & 4     V / (    k 4 6k;O;QkViVk   .6A>\<  [    " \@@4QY^ 6U]b '0MUZ ' %%NO&'RS)*OP %!.# 
		H	%
   !!2!7!7!9: tyy 	 8	8/8 	8v ,@ ,@^ W W WtA,- A,H$ zNE	IIK rS   