"""PyTorch Fuyu model."""

from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...modeling_outputs import CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...models.auto.modeling_auto import AutoModel
from ...utils import auto_docstring, can_return_tuple, logging
from .configuration_fuyu import FuyuConfig


logger = logging.get_logger(__name__)


@auto_docstring
class FuyuPreTrainedModel(PreTrainedModel):
    config: FuyuConfig
    base_model_prefix = "fuyu"
    supports_gradient_checkpointing = True
    _supports_attention_backend = True
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _no_split_modules = []
    _skip_keys_device_placement = "past_key_values"

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
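

# A rough sizing note for the vision projection in `FuyuModel` below (assuming the fuyu-8b
# defaults of patch_size=30, num_channels=3, hidden_size=4096): each flattened image patch is a
# 30 * 30 * 3 = 2700-dimensional vector that `vision_embed_tokens` maps to a 4096-dimensional
# embedding, so image patches enter the decoder exactly like word embeddings.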


@auto_docstring(
    custom_intro="""
    The Fuyu model which consists of a vision backbone and a language model, without a language modeling head.
    """
)
class FuyuModel(FuyuPreTrainedModel):
    _checkpoint_conversion_mapping = {"language_model.model": "language_model"}

    def __init__(self, config: FuyuConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.text_config.vocab_size
        self.language_model = AutoModel.from_config(config.text_config)

        self.vision_embed_tokens = nn.Linear(
            config.patch_size * config.patch_size * config.num_channels, config.hidden_size
        )

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def set_decoder(self, decoder):
        self.language_model = decoder

    def get_decoder(self):
        return self.language_model
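
    # A small worked example of the indexing contract implemented below (hypothetical values,
    # for illustration only): with image_patch_input_indices = [[-1, 0, 1, -1]], sequence
    # positions 1 and 2 are image positions and receive rows 0 and 1 of
    # continuous_embeddings[0], while the -1 positions keep their original word embeddings.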
    def gather_continuous_embeddings(
        self,
        word_embeddings: torch.Tensor,
        continuous_embeddings: list[torch.Tensor],
        image_patch_input_indices: torch.Tensor,
    ) -> torch.Tensor:
        """This function places the continuous_embeddings into the word_embeddings at the locations
        indicated by image_patch_input_indices. Different batch elements can have different numbers of continuous
        embeddings.

        Args:
            word_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Tensor of word embeddings.
            continuous_embeddings (`list[torch.FloatTensor]` of length `batch_size`):
                List of continuous embeddings. The length of the list is the batch size. Each entry is of shape
                `(num_image_embeddings, hidden)`, and `num_image_embeddings` needs to match the number of non-negative
                indices in `image_patch_input_indices` for that batch element.
            image_patch_input_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Tensor of indices of the image patches in the input_ids tensor.
        """
        if not (word_embeddings.shape[0] == len(continuous_embeddings)):
            raise ValueError(
                f"Batch sizes must match! Got {len(continuous_embeddings)=} and {word_embeddings.shape[0]=}"
            )

        output_embeddings = word_embeddings.clone()
        for batch_idx in range(word_embeddings.shape[0]):
            # Find the positions of all non-negative values in image_patch_input_indices; those are the
            # positions in word_embeddings that we want to replace with content from continuous_embeddings.
            dst_indices = torch.nonzero(image_patch_input_indices[batch_idx] >= 0, as_tuple=True)[0]
            # Look those positions up in image_patch_input_indices to find the rows of
            # continuous_embeddings that we need to use to populate them.
            src_indices = image_patch_input_indices[batch_idx][dst_indices]
            # Check if we have more indices than embeddings. Note that we could have fewer indices if the
            # image got truncated.
            if src_indices.shape[0] > continuous_embeddings[batch_idx].shape[0]:
                raise ValueError(
                    f"Number of continuous embeddings {continuous_embeddings[batch_idx].shape=} does not match "
                    f"number of continuous token ids {src_indices.shape=} in batch element {batch_idx}."
                )
            output_embeddings[batch_idx, dst_indices] = continuous_embeddings[batch_idx][src_indices].to(
                output_embeddings.device
            )
        return output_embeddings

    def get_image_features(self, pixel_values: torch.FloatTensor, **kwargs):
        """
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
        """
        patch_embeddings = [
            self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype)).squeeze(0)
            for patch in pixel_values
        ]
        return patch_embeddings

    def get_placeholder_mask(
        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
    ):
        """
        Obtains the multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder
        token count is equal to the length of the multimodal features. If the lengths are different, an error is
        raised.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.image_token_id

        n_image_tokens = special_image_mask.sum()
        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        n_image_features = image_features.shape[0] * image_features.shape[1]
        if inputs_embeds[special_image_mask].numel() != image_features.numel():
            raise ValueError(
                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
            )
        return special_image_mask

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        image_patches: torch.Tensor = None,
        image_patches_indices: torch.Tensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[tuple, CausalLMOutputWithPast]:
        r"""
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if position_ids is None:
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
            position_ids = torch.arange(
                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
            )
            position_ids = position_ids.unsqueeze(0)

        if inputs_embeds is None:
            inputs_embeds = self.language_model.get_input_embeddings()(input_ids)

        if image_patches is not None:
            patch_embeddings = self.get_image_features(image_patches)
            patch_embeddings = torch.cat(patch_embeddings, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
            special_image_mask = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=patch_embeddings
            )
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, patch_embeddings)

        outputs = self.language_model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            use_cache=use_cache,
            return_dict=return_dict,
            **kwargs,
        )
        return outputs


@auto_docstring(
    custom_intro="""
    Fuyu Model with a language modeling head on top for causal language model conditioned on image patches and text.
    """
)
class FuyuForCausalLM(FuyuPreTrainedModel, GenerationMixin):
    _checkpoint_conversion_mapping = {
        "^language_model.model": "model.language_model",
        "^vision_embed_tokens": "model.vision_embed_tokens",
        "^language_model.lm_head": "lm_head",
    }
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: FuyuConfig):
        super().__init__(config)
        self.model = FuyuModel(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def set_decoder(self, decoder):
        self.model.set_decoder(decoder)

    def get_decoder(self):
        return self.model.get_decoder()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        image_patches: torch.Tensor = None,
        image_patches_indices: torch.Tensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs,
    ) -> Union[tuple, CausalLMOutputWithPast]:
        r"""
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100`
            are ignored (masked); the loss is only computed for the tokens with labels in
            `[0, ..., config.text_config.vocab_size]`.

        Examples:

        ```python
        >>> from transformers import FuyuProcessor, FuyuForCausalLM
        >>> from PIL import Image
        >>> import requests

        >>> processor = FuyuProcessor.from_pretrained("adept/fuyu-8b")
        >>> model = FuyuForCausalLM.from_pretrained("adept/fuyu-8b")

        >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> prompt = "Generate a coco-style caption.\n"

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=7)
        >>> generation_text = processor.batch_decode(generated_ids[:, -7:], skip_special_tokens=True)
        >>> print(generation_text[0])
        A blue bus parked on the side of a road.
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids=input_ids,
            image_patches=image_patches,
            image_patches_indices=image_patches_indices,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            use_cache=use_cache,
            return_dict=True,
        )

        hidden_states = outputs[0]
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        image_patches=None,
        image_patches_indices=None,
        **kwargs,
    ):
        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            image_patches=image_patches,
            image_patches_indices=image_patches_indices,
            **kwargs,
        )

        if past_key_values is not None:
            model_inputs["image_patches_indices"] = None
            model_inputs["image_patches"] = None

        return model_inputs


__all__ = ["FuyuForCausalLM", "FuyuPreTrainedModel", "FuyuModel"]