
    rh0                     <   d dl mZmZ d dlZd dlmZ d dlmZmZmZm	Z	m
Z
 ddlmZ ddlmZ ddlmZmZ d	d
lmZ  ej(                  e      Z G d de	      Z G d de      Z G d dej2                        Z G d de
      Z G d de      Z G d de      Zg dZy)    )OptionalUnionN)nn)LlavaCausalLMOutputWithPastLlavaForConditionalGeneration
LlavaModelLlavaModelOutputWithPastLlavaPreTrainedModel   )ACT2FN)Cache)auto_docstringlogging   )VipLlavaConfigc                       e Zd Zy)VipLlavaModelOutputWithPastN__name__
__module____qualname__     /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/vipllava/modular_vipllava.pyr   r   &       r   r   c                       e Zd Zy)VipLlavaCausalLMOutputWithPastNr   r   r   r   r   r   *   r   r   r   c                   *     e Zd Zdef fdZd Z xZS )VipLlavaMultiModalProjectorconfigc                 H   t         |           t        |j                  t              rdnt        |j                        }t        j                  ||j                  j                  z  |j                        | _        t        j                  ||j                  j                  z  |j                  j                  d      | _        t        |j                      | _        t        j                  |j                  j                  |j                  j                  d      | _        y )Nr   )epsT)bias)super__init__
isinstancevision_feature_layersintlenr   	LayerNormvision_confighidden_sizeprojector_layernorm_epsprojector_layernormLineartext_configlinear_1r   projector_hidden_actactlinear_2)selfr    num_feature_layers	__class__s      r   r%   z$VipLlavaMultiModalProjector.__init__/   s    ",V-I-I3"OQUXY_YuYuUv#%<<!5!5!A!AAvGeGe$
  		!5!5!A!AA**

 &556		&"4"4"@"@&BTBTB`B`gklr   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S N)r.   r1   r3   r4   )r5   hidden_statess     r   forwardz#VipLlavaMultiModalProjector.forward>   sB    00?m4/m4r   )r   r   r   r   r%   r;   __classcell__)r7   s   @r   r   r   .   s    m~ mr   r   c                       e Zd Zy)VipLlavaPreTrainedModelNr   r   r   r   r>   r>   F   r   r   r>   c                   x   e Zd Z	 ddej                  deeeee   f      fdZ	e
	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dej                  deej                     deej                     dee   d	eej                     deeeee   f      d
ee   dee   dee   dee   deej                     deeef   fd       Zy)VipLlavaModelNpixel_valuesr'   c                 V   ||n| j                   j                  }| j                  |d      }t        |t              r|j
                  |   ddddf   }n<|D cg c]  }|j
                  |   ddddf    }}t        j                  |d      }| j                  |      }|S c c}w )aW  
        Obtains image last hidden states from the vision tower and applies multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
                The tensors corresponding to the input images.
            vision_feature_layers (`Union[int, list[int]]`):
                The vision feature layer, or the list of indexes of the layers to select
                the vision feature.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
        """
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )

        image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)

        # If a single layer index is given, select that hidden state; otherwise
        # gather every requested layer and concatenate along the feature dim.
        # Position 0 holds the CLS token, which is dropped either way.
        if isinstance(vision_feature_layers, int):
            image_features = image_outputs.hidden_states[vision_feature_layers][:, 1:]
        else:
            image_features = [image_outputs.hidden_states[index][:, 1:] for index in vision_feature_layers]
            image_features = torch.cat(image_features, dim=-1)
        image_features = self.multi_modal_projector(image_features)
        return image_features

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layers: Optional[Union[int, list[int]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **lm_kwargs,
    ) -> Union[tuple, VipLlavaModelOutputWithPast]:
        r"""
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(
                pixel_values=pixel_values, vision_feature_layers=vision_feature_layers
            )
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            special_image_mask = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
            )
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **lm_kwargs,
        )

        output = VipLlavaModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )
        return output if return_dict else output.to_tuple()
eej                     dee   dee   dee   dee   deej                     deeej                  f   deeef   fdZy) VipLlavaForConditionalGenerationNrA   r'   c                 <    | j                   j                  ||      S )NrY   )modelrM   )r5   rA   r'   s      r   rM   z3VipLlavaForConditionalGeneration.get_image_features   s     zz,,,^s,ttr   rN   rO   rP   rQ   rR   labelsrS   rT   rC   rU   rV   logits_to_keeprW   c                 l   |
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }||n| j                   j                  } | j
                  d|||||||	||
|d|d|}|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|2| j                  ||| j                   j                  j                        }t        |||j                  |j                  |j                   |j"                        S )a  
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import torch
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration

        >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf", device_map="auto", torch_dtype=torch.float16)
        >>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

        >>> prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{}###Assistant:"
        >>> question = "Can you please describe this image?"
        >>> prompt = prompt.format(question)
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(text=prompt, images=image, return_tensors="pt").to(0, torch.float16)

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=20)
        >>> processor.decode(generate_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
        The image features a brown and white cat sitting on a green surface, with a red ball in its
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )

        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            vision_feature_layers=vision_feature_layers,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **lm_kwargs,
        )

        hidden_states = outputs[0]
        # Only compute the necessary logits, and do not upcast them to float if
        # the loss is not being computed. An int `logits_to_keep` keeps the last
        # `logits_to_keep` positions (0 keeps all); a tensor selects explicit indices.
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)

        return VipLlavaCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )

__all__ = ["VipLlavaModel", "VipLlavaForConditionalGeneration", "VipLlavaPreTrainedModel"]