
from typing import Optional, Union

import torch
import torch.utils.checkpoint

from transformers.models.instructblip.configuration_instructblip import (
    InstructBlipQFormerConfig,
    InstructBlipVisionConfig,
)
from transformers.models.instructblip.modeling_instructblip import (
    InstructBlipForConditionalGeneration,
    InstructBlipForConditionalGenerationModelOutput,
    InstructBlipModel,
    InstructBlipPreTrainedModel,
    InstructBlipQFormerModel,
    InstructBlipVisionModel,
    TransformersKwargs,
)

from ...configuration_utils import PretrainedConfig
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
from ...processing_utils import Unpack
from ...utils import logging
from ..auto import CONFIG_MAPPING, AutoConfig


logger = logging.get_logger(__name__)


class InstructBlipVideoVisionConfig(InstructBlipVisionConfig):
    pass


class InstructBlipVideoQFormerConfig(InstructBlipQFormerConfig):
    pass


class InstructBlipVideoConfig(PretrainedConfig):
    r"""
    [`InstructBlipVideoConfig`] is the configuration class to store the configuration of a
    [`InstructBlipVideoForConditionalGeneration`]. It is used to instantiate an InstructBlipVideo model according to the specified
    arguments, defining the vision model, Q-Former model and language model configs. Instantiating a configuration with
    the defaults will yield a similar configuration to that of the InstructBlipVideo
    [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`InstructBlipVideoVisionConfig`].
        qformer_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`InstructBlipVideoQFormerConfig`].
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize any [`PretrainedConfig`].
        num_query_tokens (`int`, *optional*, defaults to 32):
            The number of query tokens passed through the Transformer.

        video_token_index (`int`, *optional*):
            Token index of special video token.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import (
    ...     InstructBlipVideoVisionConfig,
    ...     InstructBlipVideoQFormerConfig,
    ...     OPTConfig,
    ...     InstructBlipVideoConfig,
    ...     InstructBlipVideoForConditionalGeneration,
    ... )

    >>> # Initializing an InstructBlipVideoConfig with Salesforce/instruct-blip-flan-t5 style configuration
    >>> configuration = InstructBlipVideoConfig()

    >>> # Initializing an InstructBlipVideoForConditionalGeneration (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration
    >>> model = InstructBlipVideoForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize an InstructBlipVideoConfig from an InstructBlipVideoVisionConfig, an InstructBlipVideoQFormerConfig and any PretrainedConfig

    >>> # Initializing InstructBlipVideo vision, InstructBlipVideo Q-Former and language model configurations
    >>> vision_config = InstructBlipVideoVisionConfig()
    >>> qformer_config = InstructBlipVideoQFormerConfig()
    >>> text_config = OPTConfig()

    >>> config = InstructBlipVideoConfig.from_vision_qformer_text_configs(vision_config, qformer_config, text_config)
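
    >>> # (Illustrative) like any `PretrainedConfig`, the composed config can be saved and reloaded;
    >>> # the directory name below is a placeholder
    >>> config.save_pretrained("./instructblipvideo-flan-t5")
    >>> reloaded_config = InstructBlipVideoConfig.from_pretrained("./instructblipvideo-flan-t5")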
    ```"""

    model_type = "instructblipvideo"
    attribute_map = {"video_token_id": "video_token_index"}
    sub_configs = {
        "text_config": AutoConfig,
        "qformer_config": InstructBlipVideoQFormerConfig,
        "vision_config": InstructBlipVideoVisionConfig,
    }

    def __init__(
        self,
        vision_config=None,
        qformer_config=None,
        text_config=None,
        num_query_tokens=32,
        video_token_index=None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        if vision_config is None:
            vision_config = {}
            logger.info("vision_config is None. Initializing the InstructBlipVideoVisionConfig with default values.")

        if qformer_config is None:
            qformer_config = {}
            logger.info("qformer_config is None. Initializing the InstructBlipVideoQFormerConfig with default values.")

        if text_config is None:
            text_config = {}
            logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).")

        self.vision_config = InstructBlipVideoVisionConfig(**vision_config)
        self.qformer_config = InstructBlipVideoQFormerConfig(**qformer_config)
        text_model_type = text_config.get("model_type", "opt")
        self.text_config = CONFIG_MAPPING[text_model_type](**text_config)

        self.num_query_tokens = num_query_tokens
        self.video_token_index = video_token_index
        self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
        self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
        self.initializer_factor = 1.0
        self.initializer_range = 0.02

    @classmethod
    def from_vision_qformer_text_configs(
        cls,
        vision_config: InstructBlipVideoVisionConfig,
        qformer_config: InstructBlipVideoQFormerConfig,
        text_config: PretrainedConfig,
        **kwargs,
    ):
        r"""
        Instantiate an [`InstructBlipVideoConfig`] (or a derived class) from an InstructBlipVideo vision model, Q-Former and
        language model configurations.

        Returns:
            [`InstructBlipVideoConfig`]: An instance of a configuration object
        """

        return cls(
            vision_config=vision_config.to_dict(),
            qformer_config=qformer_config.to_dict(),
            text_config=text_config.to_dict(),
            **kwargs,
        )


class InstructBlipVideoPreTrainedModel(InstructBlipPreTrainedModel):
    pass


class InstructBlipVideoVisionModel(InstructBlipVisionModel):
    pass


class InstructBlipVideoQFormerModel(InstructBlipQFormerModel):
    pass


class InstructBlipVideoForConditionalGenerationModelOutput(InstructBlipForConditionalGenerationModelOutput):
    pass


class InstructBlipVideoModel(InstructBlipModel):
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.FloatTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        input_ids: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        use_cache: Optional[bool] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, InstructBlipVideoForConditionalGenerationModelOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # step 1: flatten the video frames into the batch dimension and forward them through the vision encoder
        batch_size, frames, channel, height, width = pixel_values.shape
        pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width)

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )
        image_embeds = vision_outputs[0]

        # step 2: forward the query tokens through the Q-Former, using the image embeddings for cross-attention
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
        if qformer_attention_mask is None:
            qformer_attention_mask = torch.ones_like(qformer_input_ids)

        qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0)
        qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0)
        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
        query_outputs = self.qformer(
            input_ids=qformer_input_ids,
            attention_mask=qformer_attention_mask,
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        query_output = query_outputs[0][:, : query_tokens.size(1), :]

        # step 3: project the Q-Former output and unbatch it, so each video gets `num_query_tokens * frames` tokens
        language_model_inputs = self.language_projection(query_output)
        language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)

        if inputs_embeds is None:
            inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
            special_image_mask = input_ids == self.config.video_token_id
            if attention_mask is None:
                attention_mask = torch.ones_like(input_ids)
        else:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)

        # scatter the projected video features into the positions of the video placeholder tokens
        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)

        if self.config.use_decoder_only_language_model:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                use_cache=use_cache,
                **kwargs,
            )
        else:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                use_cache=use_cache,
                **kwargs,
            )

        return InstructBlipVideoForConditionalGenerationModelOutput(
            vision_outputs=vision_outputs,
            qformer_outputs=query_outputs,
            language_model_outputs=outputs,
        )


class InstructBlipVideoForConditionalGeneration(InstructBlipForConditionalGeneration):
    def get_video_features(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.LongTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: bool = False,
    ):
        """
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
        """
        # flatten the video frames into the batch dimension and forward them through the vision encoder
        batch_size, frames, channel, height, width = pixel_values.shape
        pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width)

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
        )
        image_embeds = vision_outputs[0]

        # forward the query tokens through the Q-Former, using the image embeddings for cross-attention
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
        if qformer_attention_mask is None:
            qformer_attention_mask = torch.ones_like(qformer_input_ids)

        qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0)
        qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0)
        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
        query_outputs = self.qformer(
            input_ids=qformer_input_ids,
            attention_mask=qformer_attention_mask,
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            return_dict=True,
        )
        query_output = query_outputs[0][:, : query_tokens.size(1), :]

        # project the Q-Former output and unbatch it, so each video gets `num_query_tokens * frames` tokens
        language_model_inputs = self.language_projection(query_output)
        language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)

        if return_dict:
            return language_model_inputs, vision_outputs, query_outputs
        return language_model_inputs

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.LongTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: bool = False,
    ):
        pass

    def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor):
        """
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.video_token_id

        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        return special_image_mask

    def forward(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.FloatTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        input_ids: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        use_cache: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, InstructBlipVideoForConditionalGenerationModelOutput]:
        r"""
        qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
            The sequence used as a prompt to be fed to the Q-Former module.
        qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
            Mask to avoid performing attention on padding token indices.

        Examples:

        ```python
        >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
        >>> import torch
        >>> from huggingface_hub import hf_hub_download
        >>> import av
        >>> import numpy as np

        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])

        >>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
        >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

        >>> file_path = hf_hub_download(
        ...       repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample uniformly 4 frames from the video
        >>> total_frames = container.streams.video[0].frames
        >>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
        >>> clip = read_video_pyav(container, indices)
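
        >>> # (Illustrative) `read_video_pyav` returns a (num_frames, height, width, 3) array; 4 frames were sampled here
        >>> clip.shape[0]
        4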

        >>> prompt = "What is happening in the video?"
        >>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)
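
        >>> # (Illustrative) a plain `forward` call: passing `labels` (here simply the prompt ids, for demonstration)
        >>> # returns a language-modeling loss alongside the logits
        >>> loss_outputs = model(**inputs, labels=inputs["input_ids"])
        >>> loss_outputs.loss is not None
        True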

        >>> outputs = model.generate(
        ...     **inputs,
        ...     do_sample=False,
        ...     num_beams=5,
        ...     max_length=256,
        ...     repetition_penalty=1.5,
        ...     length_penalty=1.0,
        ... )
        >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
        >>> print(generated_text)
        "A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front"
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        language_model_inputs, vision_outputs, query_outputs = self.get_video_features(
            pixel_values,
            qformer_input_ids=qformer_input_ids,
            qformer_attention_mask=qformer_attention_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
        )
        vision_outputs = vision_outputs.to_tuple() if not return_dict else vision_outputs
        query_outputs = query_outputs.to_tuple() if not return_dict else query_outputs

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        # scatter the projected video features into the positions of the video placeholder tokens
        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
        special_image_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)

        if self.config.use_decoder_only_language_model:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                use_cache=use_cache,
                **kwargs,
            )
            logits = outputs.logits if return_dict else outputs[0]
            loss = None
            if labels is not None:
                loss = self.loss_function(
                    logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
                )
        else:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                labels=labels,
                use_cache=use_cache,
            )
            loss = outputs.loss if return_dict else outputs[0]
            logits = outputs.logits if return_dict else outputs[1]

        return InstructBlipVideoForConditionalGenerationModelOutput(
            loss=loss,
            logits=logits,
            vision_outputs=vision_outputs,
            qformer_outputs=query_outputs,
            language_model_outputs=outputs,
        )

    @torch.no_grad()
    def generate(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: Optional[torch.LongTensor] = None,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        interpolate_pos_encoding: bool = False,
        **generate_kwargs,
    ) -> torch.LongTensor:
        r"""
        Overrides `generate` function to be able to use the model as a conditional generator.

        Args:
            pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or
                (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed.
            qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt to be fed to the Q-Former module.
            qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be float, not int tokens.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the positional encoding of the image embeddings.

        Returns:
            captions (list): A list of strings of length batch_size * num_captions.
        """
        if hasattr(self, "hf_device_map"):
            # preprocess for `accelerate`
            self._preprocess_accelerate()

        batch_size = pixel_values.shape[0]
        language_model_inputs, vision_outputs, query_outputs = self.get_video_features(
            pixel_values,
            qformer_input_ids=qformer_input_ids,
            qformer_attention_mask=qformer_attention_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
        )

        if input_ids is None:
            # fall back to a prompt made of video placeholder tokens followed by the BOS token
            video_tokens = [self.config.video_token_index] * self.config.num_query_tokens * 4
            start_tokens = video_tokens + [self.config.text_config.bos_token_id]
            input_ids = torch.tensor([start_tokens], dtype=torch.long, device=pixel_values.device)
            input_ids = input_ids.repeat(batch_size, 1)

        inputs_embeds = self.get_input_embeddings()(input_ids)
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        # scatter the projected video features into the positions of the video placeholder tokens
        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
        special_image_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)

        inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}
        if not self.language_model.config.is_encoder_decoder:
            inputs["input_ids"] = input_ids

        outputs = self.language_model.generate(**inputs, **generate_kwargs)

        return outputs


__all__ = [
    "InstructBlipVideoConfig",
    "InstructBlipVideoQFormerConfig",
    "InstructBlipVideoVisionConfig",
    "InstructBlipVideoVisionModel",
    "InstructBlipVideoPreTrainedModel",
    "InstructBlipVideoQFormerModel",
    "InstructBlipVideoModel",
    "InstructBlipVideoForConditionalGeneration",
]