
from dataclasses import dataclass
from typing import Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, can_return_tuple
from ..auto import AutoModel
from .configuration_vipllava import VipLlavaConfig


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for VipLlava outputs, with hidden states and attentions.
    """
)
class VipLlavaModelOutputWithPast(BaseModelOutputWithPast):
    r"""
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)	__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__     /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/vipllava/modeling_vipllava.pyr   r   &   s    
 8<%"3"34;r    r   zT
    Base class for VipLlava causal language model (or autoregressive) outputs.
    c                      e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                        ed<   dZeeej                        ed<   dZeeej                        ed<   dZeej                     ed<   y)	VipLlavaCausalLMOutputWithPasta]  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr   )r   r   r   r   r$   r   r   r   r   r%   r&   listr'   tupler(   r   r   r    r!   r#   r#   <   s      )-D(5$$
%,*.FHU&&'.9=OXd5#4#456=8<M8E%"3"345<59Ju001297;%"3"34;r    r#   c                   *     e Zd Zdef fdZd Z xZS )VipLlavaMultiModalProjectorconfigc                 H   t         |           t        |j                  t              rdnt        |j                        }t        j                  ||j                  j                  z  |j                        | _        t        j                  ||j                  j                  z  |j                  j                  d      | _        t        |j                      | _        t        j                  |j                  j                  |j                  j                  d      | _        y )Nr   )epsTbias)super__init__
isinstancevision_feature_layersintlenr   	LayerNormvision_confighidden_sizeprojector_layernorm_epsprojector_layernormLineartext_configlinear_1r   projector_hidden_actactlinear_2)selfr-   num_feature_layers	__class__s      r!   r3   z$VipLlavaMultiModalProjector.__init__\   s    ",V-I-I3"OQUXY_YuYuUv#%<<!5!5!A!AAvGeGe$
  		!5!5!A!AA**

 &556		&"4"4"@"@&BTBTB`B`gklr    c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S N)r<   r?   rA   rB   )rC   r'   s     r!   forwardz#VipLlavaMultiModalProjector.forwardk   sB    00?m4/m4r    )r   r   r   r   r3   rH   __classcell__rE   s   @r!   r,   r,   [   s    m~ mr    r,   c                   8    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZdZy)VipLlavaPreTrainedModelr-    Tr&   N)r   r   r   r   r   base_model_prefixsupports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph_supports_flex_attn_supports_attention_backendr   r    r!   rL   rL   s   s7    &*#"3N!"&r    rL   zx
@auto_docstring(
    custom_intro="""
    The VipLlava model which consists of a vision backbone and a language model, without a language modeling head.
    """
)
class VipLlavaModel(VipLlavaPreTrainedModel):
    _checkpoint_conversion_mapping = {"language_model.model": "language_model"}

    def __init__(self, config: VipLlavaConfig):
        super().__init__(config)
        self.vision_tower = AutoModel.from_config(config.vision_config)
        self.multi_modal_projector = VipLlavaMultiModalProjector(config)
        self.language_model = AutoModel.from_config(config.text_config)
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def set_decoder(self, decoder):
        self.language_model = decoder

    def get_decoder(self):
        return self.language_model

    def get_image_features(
        self, pixel_values: torch.FloatTensor, vision_feature_layers: Optional[Union[int, list[int]]] = None
    ):
        """
        Obtains image last hidden states from the vision tower and applies the multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
                The tensors corresponding to the input images.
            vision_feature_layers (`Union[int, list[int]]`):
                The vision feature layer, or the list of indexes of the layers to select
                the vision feature.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
        """
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )
        image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)

        # A single index selects one hidden state; a list of indexes selects several, which
        # are concatenated on the feature dimension. The CLS token (position 0) is dropped.
        if isinstance(vision_feature_layers, int):
            image_features = image_outputs.hidden_states[vision_feature_layers][:, 1:]
        else:
            image_features = [image_outputs.hidden_states[index][:, 1:] for index in vision_feature_layers]
            image_features = torch.cat(image_features, dim=-1)
        image_features = self.multi_modal_projector(image_features)
        return image_features

    def get_placeholder_mask(
        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
    ):
        """
        Obtains the multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder
        token count equals the length of the multimodal features. If the lengths differ, an error is raised.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.image_token_id

        n_image_tokens = special_image_mask.sum()
        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        n_image_features = image_features.shape[0] * image_features.shape[1]
        if inputs_embeds[special_image_mask].numel() != image_features.numel():
            raise ValueError(
                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
            )
        return special_image_mask

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layers: Optional[Union[int, list[int]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **lm_kwargs,
    ) -> Union[tuple, VipLlavaModelOutputWithPast]:
        r"""
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(
                pixel_values=pixel_values, vision_feature_layers=vision_feature_layers
            )
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            special_image_mask = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
            )
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **lm_kwargs,
        )

        output = VipLlavaModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )
        return output if return_dict else output.to_tuple()
@auto_docstring(
    custom_intro="""
    The VIPLLAVA model which consists of a vision backbone and a language model.
    """
)
class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel, GenerationMixin):
    _checkpoint_conversion_mapping = {
        "^language_model.model": "model.language_model",
        "^vision_tower": "model.vision_tower",
        "^multi_modal_projector": "model.multi_modal_projector",
        "^language_model.lm_head": "lm_head",
    }
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: VipLlavaConfig):
        super().__init__(config)
        self.model = VipLlavaModel(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def set_decoder(self, decoder):
        self.model.set_decoder(decoder)

    def get_decoder(self):
        return self.model.get_decoder()

    def get_image_features(
        self, pixel_values: torch.FloatTensor, vision_feature_layers: Optional[Union[int, list[int]]] = None
    ):
        return self.model.get_image_features(pixel_values=pixel_values, vision_feature_layers=vision_feature_layers)

    @property
    def language_model(self):
        return self.model.language_model

    @property
    def vision_tower(self):
        return self.model.vision_tower

    @property
    def multi_modal_projector(self):
        return self.model.multi_modal_projector

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layers: Optional[Union[int, list[int]]] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **lm_kwargs,
    ) -> Union[tuple, VipLlavaCausalLMOutputWithPast]:
        r"""
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import torch
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration

        >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf", device_map="auto", torch_dtype=torch.float16)
        >>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

        >>> prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{}###Assistant:"
        >>> question = "Can you please describe this image?"
        >>> prompt = prompt.format(question)
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(text=prompt, images=image, return_tensors="pt").to(0, torch.float16)

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=20)
        >>> processor.decode(generate_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
        The image features a brown and white cat sitting on a green surface, with a red ball in its
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )

        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            vision_feature_layers=vision_feature_layers,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **lm_kwargs,
        )

        hidden_states = outputs[0]
        # Only compute the necessary logits; `logits_to_keep` limits the slice to the last tokens.
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)

        return VipLlavaCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        pixel_values=None,
        attention_mask=None,
        cache_position=None,
        logits_to_keep=None,
        **kwargs,
    ):
        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )

        if cache_position[0] == 0:
            # If we're in the cached decoding stage, pixel values should be None because the input ids
            # no longer contain the special image token. Otherwise pixel values must be passed to the model.
            model_inputs["pixel_values"] = pixel_values

        return model_inputs


__all__ = ["VipLlavaModel", "VipLlavaForConditionalGeneration", "VipLlavaPreTrainedModel"]