
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...cache_utils import Cache, DynamicCache
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...processing_utils import Unpack
from ...utils import auto_docstring, can_return_tuple, logging
from ..idefics3.configuration_idefics3 import Idefics3Config, Idefics3VisionConfig
from ..idefics3.image_processing_idefics3 import Idefics3ImageProcessor
from ..idefics3.image_processing_idefics3_fast import Idefics3ImageProcessorFast
from ..idefics3.modeling_idefics3 import (
    Idefics3BaseModelOutputWithPast,
    Idefics3ForConditionalGeneration,
    Idefics3Model,
    Idefics3PreTrainedModel,
    Idefics3VisionTransformer,
)


logger = logging.get_logger(__name__)


class SmolVLMVisionConfig(Idefics3VisionConfig):
    r"""
    This is the configuration class to store the configuration of a [`SmolVLMVisionModel`]. It is used to instantiate a
    SmolVLM vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the SigLIP checkpoint
    [google/siglip-so400m-patch14-384](https://huggingface.co/google/siglip-so400m-patch14-384) used in SmolVLM
    [HuggingFaceTB/SmolVLM2-2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1152):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input images.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 32):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

    Example:

    ```python
    >>> from transformers.models.smolvlm.modeling_smolvlm import SmolVLMVisionTransformer
    >>> from transformers.models.smolvlm.configuration_smolvlm import SmolVLMVisionConfig

    >>> # Initializing a SmolVLMVisionConfig with google/siglip-so400m-patch14-384 style configuration
    >>> configuration = SmolVLMVisionConfig()

    >>> # Initializing a SmolVLMVisionTransformer (with random weights) from the google/siglip-so400m-patch14-384 style configuration
    >>> model = SmolVLMVisionTransformer(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "smolvlm_vision"


class SmolVLMPreTrainedModel(Idefics3PreTrainedModel):
    pass


class SmolVLMVisionTransformer(Idefics3VisionTransformer):
    pass


class SmolVLMConfig(Idefics3Config):
    r"""
    This is the configuration class to store the configuration of a [`SmolVLMModel`]. It is used to instantiate a
    SmolVLM model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the model of the SmolVLM
    [HuggingFaceTB/SmolVLM2-2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should cache the key/value pairs of the attention mechanism. Only
            relevant if `config.is_decoder=True`.
        image_token_id (`int`, *optional*, defaults to 128257):
            The id of the "image" token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether or not to tie the word embeddings with the token embeddings.
        vision_config (`IdeficsVisionConfig` or `dict`, *optional*, defaults to `IdeficsVisionConfig`):
            Custom vision config or dict for the vision tower
        text_config (`PretrainedConfig` or `dict`, *optional*, defaults to `LlamaConfig`):
            Custom text config or dict for the text model
        scale_factor (`int`, *optional*, defaults to 2):
            The scale factor for the image encoder.
        pad_token_id (`int`, *optional*, defaults to 128002):
            The id of the padding token.

    Example:
    ```python
    >>> from transformers import SmolVLMModel, SmolVLMConfig
    >>> # Initializing configuration
    >>> configuration = SmolVLMConfig()
    >>> # Initializing a model from the configuration
    >>> model = SmolVLMModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "smolvlm"


class SmolVLMImageProcessor(Idefics3ImageProcessor):
    pass


class SmolVLMImageProcessorFast(Idefics3ImageProcessorFast):
    pass


class SmolVLMBaseModelOutputWithPast(Idefics3BaseModelOutputWithPast):
    pass


class SmolVLMModel(Idefics3Model):
    """
    A subclass of Idefics3Model. We do *not* remove or block the call to inputs_merger
    in forward. Instead, we override inputs_merger here with custom logic.
    """

    def inputs_merger(
        self, input_ids: torch.LongTensor, inputs_embeds: torch.Tensor, image_hidden_states: torch.Tensor
    ):
        _, patch_size, _ = image_hidden_states.shape

        if input_ids is None:
            image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            image_mask = image_mask[..., 0]  # slice off the hidden dim
        else:
            image_mask = input_ids == self.config.image_token_id

        num_image_tokens = image_mask.sum(dim=1)
        if not torch.all(num_image_tokens % patch_size == 0):
            raise ValueError("At least one sample has <image> tokens not divisible by patch_size.")

        blocks_per_sample = num_image_tokens // patch_size

        offsets = torch.nn.functional.pad(blocks_per_sample.cumsum(dim=0), (1, 0), value=0)
        block_offset = offsets[:-1]
        row_cum = image_mask.cumsum(dim=-1)
        chunk_idx = (row_cum - 1) // patch_size
        local_idx = (row_cum - 1) % patch_size
        block_idx = block_offset.unsqueeze(1) + chunk_idx

        image_embeds = torch.zeros_like(inputs_embeds)
        image_embeds[image_mask] = image_hidden_states[block_idx[image_mask], local_idx[image_mask], :]

        merged_embeds = torch.where(image_mask.unsqueeze(-1), image_embeds, inputs_embeds)
        return merged_embeds

    def get_image_features(self, pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor = None):
        """
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            pixel_attention_mask (`torch.LongTensor`, *optional*):
                The attention mask indicating padded regions in the image.
        """
        batch_size, num_images, num_channels, height, width = pixel_values.shape
        pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:])

        # Remove padding images - padding images are full 0.
        nb_values_per_image = pixel_values.shape[1:].numel()
        real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image
        if not any(real_images_inds):
            # no images, leave one empty image.
            real_images_inds[0] = True

        pixel_values = pixel_values[real_images_inds].contiguous()

        # Handle the vision attention mask
        if pixel_attention_mask is None:
            pixel_attention_mask = torch.ones(
                size=[pixel_values.shape[i] for i in (0, 2, 3)],
                dtype=torch.bool,
                device=pixel_values.device,
            )
        else:
            # Remove padding images from the mask
            pixel_attention_mask = pixel_attention_mask.view(batch_size * num_images, *pixel_attention_mask.shape[2:])
            pixel_attention_mask = pixel_attention_mask[real_images_inds].contiguous()

        patch_size = self.config.vision_config.patch_size
        patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size)
        patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size)
        patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()

        # Get sequence from the vision encoder
        image_hidden_states = self.vision_model(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
        image_hidden_states = image_hidden_states.last_hidden_state

        # Modality projection & resampling
        image_hidden_states = self.connector(image_hidden_states)
        return image_hidden_states

    @can_return_tuple
    @auto_docstring(
        custom_intro="""
        Inputs fed to the model can have an arbitrary number of images. To account for this, pixel_values fed to
        the model have image padding -> (batch_size, max_num_images, 3, max_heights, max_widths) where
        max_num_images is the maximum number of images among the batch_size samples in the batch.
        Padding images are not needed beyond padding the pixel_values at the entrance of the model.
        For efficiency, we only pass through the vision_model's forward the real images by
        discarding the padding images i.e. pixel_values of size (image_batch_size, 3, height, width) where
        image_batch_size would be 7 when num_images_per_sample=[1, 3, 1, 2] and max_num_images would be 3.
        """
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_attention_mask: Optional[torch.BoolTensor] = None,
        image_hidden_states: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, SmolVLMBaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.training and self.text_model.gradient_checkpointing and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if inputs_embeds is None:
            inputs_embeds = self.text_model.get_input_embeddings()(input_ids).to(input_ids.device)

        # START VISUAL INPUTS INTEGRATION
        if pixel_values is not None and image_hidden_states is not None:
            raise ValueError("You cannot specify both pixel_values and image_hidden_states at the same time")
        elif pixel_values is not None:
            image_hidden_states = self.get_image_features(pixel_values, pixel_attention_mask).to(inputs_embeds.device)
        elif image_hidden_states is not None:
            image_hidden_states = image_hidden_states.to(dtype=self.dtype, device=input_ids.device)

        if image_hidden_states is not None:
            # Merge the image embeddings into the text embeddings at the <image> token positions.
            inputs_embeds = self.inputs_merger(
                input_ids=input_ids, inputs_embeds=inputs_embeds, image_hidden_states=image_hidden_states
            )

        outputs = self.text_model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **kwargs,
        )

        return SmolVLMBaseModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_hidden_states,
        )


class SmolVLMForConditionalGeneration(Idefics3ForConditionalGeneration):
    def __init__(self, config):
        super().__init__(config)
        self.model = SmolVLMModel(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        self.post_init()

    def forward(self, **super_kwargs):
        r"""
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The hidden states of the image encoder after modality projection.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or `model.image_token_id`. Tokens with indices set to `model.image_token_id` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import requests
        >>> import torch
        >>> from PIL import Image
        >>> from io import BytesIO

        >>> from transformers import AutoProcessor, AutoModelForImageTextToText
        >>> from transformers.image_utils import load_image

        >>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
        >>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
        >>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
        >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")

        >>> processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
        >>> model = AutoModelForImageTextToText.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct", torch_dtype=torch.bfloat16, device_map="auto")

        >>> # Create inputs
        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {"type": "video", "path": "path/to/video"},
        ...             {"type": "text", "text": "What is happening in this video?"},
        ...         ]
        ...     }
        ... ]

        >>> inputs = processor.apply_chat_template([messages], add_generation_prompt=True)

        >>> # Generate
        >>> generated_ids = model.generate(**inputs, max_new_tokens=256)
        >>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

        >>> print(generated_texts)
        ```"""
        super().forward(**super_kwargs)


__all__ = [
    "SmolVLMVisionConfig",
    "SmolVLMConfig",
    "SmolVLMImageProcessor",
    "SmolVLMImageProcessorFast",
    "SmolVLMForConditionalGeneration",
    "SmolVLMPreTrainedModel",
    "SmolVLMModel",
    "SmolVLMVisionTransformer",
]