
    rh	\                     N   d dl mZ d dlmZmZ d dlZd dlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ  ed       G d dej@                               Z! G d dej@                        Z" G d dej@                        Z#e ed       G d de                    Z$e ed       G d d e                    Z%e G d! d"e             Z& ed#       G d$ d%e&             Z' ed&       G d' d(e&e             Z(g d)Z)y)*    )	dataclass)OptionalUnionN)nn   )ACT2FN)Cache)GenerationMixin)use_kernel_forward_from_hub)FlashAttentionKwargs)BaseModelOutputWithPastModelOutput)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple   )	AutoModel   )Mistral3ConfigRMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )Mistral3RMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z>
        Mistral3RMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/mistral3/modeling_mistral3.pyr   zMistral3RMSNorm.__init__+   s1     	ll5::k#:; #    c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )Nr   T)keepdim)	dtypetor   float32powmeanrsqrtr"   r!   )r#   hidden_statesinput_dtypevariances       r'   forwardzMistral3RMSNorm.forward3   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r(   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler!   shaper"   r#   s    r'   
extra_reprzMistral3RMSNorm.extra_repr:   s*    ))*+6$2G2G1HIIr(   )gư>)__name__
__module____qualname__r   r5   r:   __classcell__r&   s   @r'   r   r   )   s    $;Jr(   r   c                   x     e Zd ZdZdef fdZdej                  dej                  dej                  fdZ xZ	S )Mistral3PatchMergerz<
    Learned merging of spatial_merge_size ** 2 patches
    configc                 "   t         |           || _        |j                  j                  }|j
                  | _        | j                  j                  j                  | _        t        j                  || j
                  dz  z  |d      | _	        y )Nr   Fbias)
r   r   rB   vision_configr$   spatial_merge_size
patch_sizer   Linearmerging_layer)r#   rB   r$   r&   s      r'   r   zMistral3PatchMerger.__init__C   sr    **66"(";";++33>>YY{T5L5La5O'OQ\chir(   image_featuresimage_sizesreturnc                    |D cg c]&  }|d   | j                   z  |d   | j                   z  f( }}|D cg c]
  \  }}||z   }}}|j                  d   }g }t        |j                  |            D ]  \  }	}
||	   \  }}|
j	                  |||      j                  ddd      j                  d      }t        j                  j                  j                  || j                  | j                        }|j	                  || j                  dz  z  d      j                         }|j                  |        t        j                  |d      }| j                  |      }|S c c}w c c}}w )Nr   r   r*   r   )kernel_sizestridedim)rH   r8   	enumeratesplitviewpermute	unsqueezer   r   
functionalunfoldrG   tappendcatrJ   )r#   rK   rL   
image_sizehwtokens_per_imagedpermuted_tensorimage_indeximage_tokens
image_gridgrids                r'   r5   zMistral3PatchMerger.forwardL   sl   cn
U_Z]doo-z!}/OP
 
 /::daAE::  $)2>3G3GHX3Y)Z 	)%K{+DAq%**1a3;;Aq!DNNqQJ88&&--(?(?H_H_ . D 99Q!8!8!!;;R@BBDD""4(	) ?:++N;)
 ;s
   +E"E')
r;   r<   r=   __doc__r   r   r   Tensorr5   r>   r?   s   @r'   rA   rA   >   s?    j~ jell  RWR^R^ r(   rA   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )Mistral3MultiModalProjectorrB   c                    t         |           t        |j                  j                  |j
                  j                        | _        t        |      | _	        t        |j                  t              rdnt        |j                        }t        j                  |j                  j                  |z  |j
                  j                  |j                         | _        t$        |j&                     | _        t        j                  |j
                  j                  |j
                  j                  |j                         | _        y )N)r%   r   rD   )r   r   r   rF   r$   text_configrms_norm_epsnormrA   patch_merger
isinstancevision_feature_layerintlenr   rI   multimodal_projector_biaslinear_1r   projector_hidden_actactlinear_2)r#   rB   num_feature_layersr&   s      r'   r   z$Mistral3MultiModalProjector.__init__e   s    #F$8$8$D$D&J\J\JiJij	/7",V-H-H#"NQTWX^XsXsTt		  ,,/AA**11

 &556		**F,>,>,J,JQWQqQq
r(   rK   rL   c                     | j                  |      }| j                  ||      }| j                  |      }| j                  |      }| j	                  |      }|S N)rn   ro   ru   rw   rx   )r#   rK   rL   r2   s       r'   r5   z#Mistral3MultiModalProjector.forwardu   sR    >2**>;Gn5/m4r(   )	r;   r<   r=   r   r   r   rh   r5   r>   r?   s   @r'   rj   rj   d   s*    
~ 
 ell  r(   rj   zT
    Base class for Mistral3 causal language model (or autoregressive) outputs.
    )custom_introc                      e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                        ed<   dZeeej                        ed<   dZeeej                        ed<   dZeej                     ed<   y)	Mistral3CausalLMOutputWithPasta]  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nlosslogitspast_key_valuesr2   
attentionsimage_hidden_states)r;   r<   r=   rg   r   r   r   FloatTensor__annotations__r   r   listr2   r7   r   r    r(   r'   r~   r~   ~   s      )-D(5$$
%,*.FHU&&'.9=OXd5#4#456=8<M8E%"3"345<59Ju001297;%"3"34;r(   r~   zM
    Base class for Mistral3 outputs, with hidden states and attentions.
    c                   :    e Zd ZU dZdZeej                     ed<   y)Mistral3ModelOutputWithPasta  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nr   )	r;   r<   r=   rg   r   r   r   r   r   r   r(   r'   r   r      s    
 8<%"3"34;r(   r   c                   8    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZdZy)Mistral3PreTrainedModelrB    Tr   N)r;   r<   r=   r   r   base_model_prefixsupports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph_supports_flex_attn_supports_attention_backendr   r(   r'   r   r      s7    &*#"3N!"&r(   r   zx
    The Mistral3 model which consists of a vision backbone and a language model, without a language modeling head.
    c            #       F    e Zd ZddiZdef fdZd Zd Zd Zd Z		 dd	e
j                  d
e
j                  deeeee   f      fdZde
j$                  de
j                  de
j                  fdZee	 	 	 	 	 	 	 	 	 	 	 	 	 dde
j$                  d	e
j                  dee
j                     dee
j$                     dee   dee
j                     deeeee   f      dee   dee   dee   dee   dee
j$                     d
e
j                  dee   deeef   fd              Z xZS )Mistral3Modelzlanguage_model.modellanguage_modelrB   c                     t         |   |       t        j                  |j                        | _        t        |      | _        t        j                  |j                        | _	        | j                          y r{   )r   r   r   from_configrF   vision_towerrj   multi_modal_projectorrl   r   	post_initr#   rB   r&   s     r'   r   zMistral3Model.__init__   sY     %11&2F2FG%@%H"'33F4F4FGr(   c                 6    | j                   j                         S r{   )r   get_input_embeddingsr9   s    r'   r   z"Mistral3Model.get_input_embeddings   s    ""7799r(   c                 :    | j                   j                  |       y r{   )r   set_input_embeddingsr#   values     r'   r   z"Mistral3Model.set_input_embeddings   s    007r(   c                     || _         y r{   r   r#   decoders     r'   set_decoderzMistral3Model.set_decoder   s
    %r(   c                     | j                   S r{   r   r9   s    r'   get_decoderzMistral3Model.get_decoder   s    """r(   pixel_valuesrL   rq   c                    ||n| j                   j                  }|j                         D ci c]  \  }}|	|| }}} | j                  |f|dd|}t	        |t
              r|j                  |   }n3|D 	cg c]  }	|j                  |	    }
}	t        j                  |
d      }| j                  |j                  d      |      }| j                  j                  | j                   j                  z  }|D cg c]  \  }}||z  ||z  z   }}}t        j                  |j                  d      |      }|S c c}}w c c}	w c c}}w )aU  
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`):
               The tensors corresponding to the input images.
            vision_feature_layer (`Union[int, list[int]]`, *optional*):
                The index of the layer to select the vision feature. If multiple indices are provided,
                the vision feature of the corresponding indices will be concatenated to form the
                vision features.
            image_sizes (`torch.Tensor`, *optional*):
                Tensor containing the image sizes as returned by the processor.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        T)rL   output_hidden_statesr*   rQ   r   )rB   rq   itemsr   rp   rr   r2   r   r\   r   squeezerH   rG   rT   )r#   r   rL   rq   kwargskvimage_outputsselected_image_feature	layer_idxhs_poolrK   downsample_ratioheightwidthsplit_sizess                   r'   get_image_featuresz Mistral3Model.get_image_features   sR   . %9$D $++JjJj 	 $*<<>C41aQ]!Q$CC))),uKfjuntu *C0%2%@%@AU%V"Ocd)}229=dGd%*YYwB%?"334J4R4RST4UWbc,,77$++:X:XXgrsVcV\^c"22u@P7PQss^%;%;A%>L D e
 ts   
D<D<;E=E	input_idsinputs_embedsrK   c                 P   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d      j                  |      j                  |j                        }|j                  d   |j                  d   z  }||   j                         |j                         k7  rt        d| d|       |S )z
        Obtains multimodal placeholdr mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        )r,   devicer*   r   r   z6Image features and image tokens do not match: tokens: z, features )r   r   tensorrB   image_token_idlongr   allsumrW   	expand_asr-   r8   numel
ValueError)r#   r   r   rK   special_image_maskn_image_tokensn_image_featuress          r'   get_placeholder_maskz"Mistral3Model.get_placeholder_mask	  s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H+//1/99"=GGVYYZgZnZno)//2^5I5I!5LL+,2248L8L8NNHHXXcdtcuv  "!r(   attention_maskposition_idsr   	use_cacheoutput_attentionsr   return_dictcache_positionr   rM   c                    |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }|d u |d uz  rt        d      | | j                         |      }|u| j                  |||      }t        j                  |d      j                  |j                  |j                        }| j                  |||      }|j                  ||      } | j                  d	||||||	|
d|d	|}t!        |j"                  |j$                  |j&                  |j(                  |      S d       S )
Nz:You must specify exactly one of input_ids or inputs_embeds)r   rq   rL   r   rQ   )r   rK   T)	r   r   r   r   r   r   r   r   r   )last_hidden_stater   r2   r   r   r   )rB   r   r   use_return_dictrq   r   r   r   r   r\   r-   r   r,   r   masked_scatterr   r   r   r   r2   r   )r#   r   r   r   r   r   r   rq   r   r   r   r   r   rL   r   rK   r   outputss                     r'   r5   zMistral3Model.forward!  s   & 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$8$D $++JjJj 	 -t";<YZZ 7D557	BM#!44)%9' 5 N
 #YY~1=@@AUAUWdWjWjkN!%!:!:~ "; " *889K^\M%$%% 
)%+'/!5)
 
 +%77#33!//))2>2J
 	

 QU
 	
r(   r{   )NNNNNNNNNNNNN)r;   r<   r=   _checkpoint_conversion_mappingr   r   r   r   r   r   r   r   rh   r   r   rr   r   r   
LongTensorr   r   r   r	   boolr   r   r7   r   r5   r>   r?   s   @r'   r   r      s    '=>N%O"~ :8&# AE	)'') \\) 'uS$s)^'<=	)V"))":?:K:K"]b]n]n"0  '+*.1537+/59@D$(,0/3&*59$(?
##?
 ''?
 !.	?

 u//0?
 "%?
   1 12?
 'uS$s)^'<=?
 D>?
 $D>?
 'tn?
 d^?
 !!1!12?
 \\?
 -.?
  
u11	2!?
  ?
r(   r   zV
    The MISTRAL3 model which consists of a vision backbone and a language model.
    c            %           e Zd ZdddddZdgZdef fdZd	 Zd
 Zde	j                  fdZd Zd Z	 d%dej                  dej                   deeeee   f      fdZed        Zed        Zed        Zee	 	 	 	 	 	 	 	 	 	 	 	 	 	 d&dej8                  dej                  deej                      deej8                     dee   deej                     deej8                     dee   dee   dee   dee   d eej8                     d!eeej                   f   deej                      d"ee    dee!e"f   f d#              Z#	 	 	 	 	 	 d' fd$	Z$ xZ%S )( Mistral3ForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorlm_head)z^language_model.modelz^vision_towerz^multi_modal_projectorz^language_model.lm_headzlm_head.weightrB   c                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y )NFrD   )r   r   r   modelr   rI   rl   r$   
vocab_sizer   r   r   s     r'   r   z)Mistral3ForConditionalGeneration.__init__s  sS     "6*
yy!3!3!?!?ASASA^A^ejkr(   c                 6    | j                   j                         S r{   )r   r   r9   s    r'   r   z5Mistral3ForConditionalGeneration.get_input_embeddingsy  s    zz..00r(   c                 :    | j                   j                  |       y r{   )r   r   r   s     r'   r   z5Mistral3ForConditionalGeneration.set_input_embeddings|  s    

''.r(   rM   c                     | j                   S r{   )r   r9   s    r'   get_output_embeddingsz6Mistral3ForConditionalGeneration.get_output_embeddings  s    ||r(   c                 :    | j                   j                  |       y r{   )r   r   r   s     r'   r   z,Mistral3ForConditionalGeneration.set_decoder  s    

w'r(   c                 6    | j                   j                         S r{   )r   r   r9   s    r'   r   z,Mistral3ForConditionalGeneration.get_decoder  s    zz%%''r(   r   rL   rq   c                 B     | j                   j                  d|||d|S )N)r   rL   rq   r   )r   r   )r#   r   rL   rq   r   s        r'   r   z3Mistral3ForConditionalGeneration.get_image_features  s5     -tzz,, 
%#!5
 	
 	
r(   c                 .    | j                   j                  S r{   )r   r   r9   s    r'   r   z/Mistral3ForConditionalGeneration.language_model  s    zz(((r(   c                 .    | j                   j                  S r{   )r   r   r9   s    r'   r   z-Mistral3ForConditionalGeneration.vision_tower  s    zz&&&r(   c                 .    | j                   j                  S r{   )r   r   r9   s    r'   r   z6Mistral3ForConditionalGeneration.multi_modal_projector  s    zz///r(   r   r   r   r   r   labelsr   r   r   r   r   logits_to_keepr   c                 <   |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  } | j                  d||||||||	|
d||d|}|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|4 | j                  d||| j                   j                  j                  d|}t        |||j                  |j                  |j                  |j                         S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Mistral3ForConditionalGeneration

        >>> model = Mistral3ForConditionalGeneration.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")
        >>> processor = AutoProcessor.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")

        >>> prompt = "<s>[INST][IMG]What is the image?[/INST]"
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is the image?The image depicts two cats lying on a pink blanket."
        ```NT)r   r   r   r   r   r   r   r   r   r   r   rL   r   )r   r   r   )r   r   r   r2   r   r   r   )rB   r   r   r   r   rp   rr   slicer   loss_functionrl   r   r~   r   r2   r   r   )r#   r   r   r   r   r   r   r   r   r   r   r   r   r   rL   r   r   r2   slice_indicesr   r   s                        r'   r5   z(Mistral3ForConditionalGeneration.forward  sP   ^ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$** 
%)%+'/!5)#
 
   
8B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD .#33!//)) ' ; ;
 	
r(   c           	      N    t        
|   |f|||||d|}	|d   dk(  r||	d<   |	S )N)r   r   r   r   r   r   r   )r   prepare_inputs_for_generation)r#   r   r   r   r   r   r   r   r   model_inputsr&   s             r'   r   z>Mistral3ForConditionalGeneration.prepare_inputs_for_generation  sV     w<
+')))
 
 !! ,8L(r(   r{   )NNNNNNNNNNNNr   N)NNNNNN)&r;   r<   r=   r   _tied_weights_keysr   r   r   r   r   Moduler   r   r   r   r   rh   r   r   rr   r   r   propertyr   r   r   r   r   r   r	   r   r   r   r7   r~   r5   r   r>   r?   s   @r'   r   r   e  sh    "8-"?#,	&" ++~ 1/ryy (( AE	
''
 \\
 'uS$s)^'<=	
 ) ) ' ' 0 0  '+*.1537+/59-1$(,0/3&*5934.2U
##U
 ''U
 !.	U

 u//0U
 "%U
   1 12U
 ))*U
 D>U
 $D>U
 'tnU
 d^U
 !!1!12U
 c5<</0U
 ell+U
  +,!U
" 
u44	5#U
  U
t  r(   r   )r   r   r   )*dataclassesr   typingr   r   r   r   activationsr   cache_utilsr	   
generationr
   integrationsr   modeling_flash_attention_utilsr   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   autor   configuration_mistral3r   r   r   rA   rj   r~   r   r   r   r   __all__r   r(   r'   <module>r     sS  , " "   !   ) 7 B D - & I I  2 Y'Jbii J (J(#")) #L")) 4 
<[ < <2 
<"9 < <  'o ' ' 
[
+ [

[
| 
n'> n
nb [r(   