"""

Generic interface to various configurations of the Perceiver Resampler, that simply takes in a series of (potentially
time-indexed) contextual embeddings, and "resamples" (compresses) them down to a pre-specified number of latents! Note
that the Perceiver in general resamples based solely on the *long-range* context; there's a nice opportunity here to
prime the Perceiver Resampler with, say, a single layer's worth of language embeddings (the target domain), and use that
to softly "retrieve & compress" what we need --> this would be a novel contribution we should explore.

References:
    - DeepMind's Flamingo: https://www.deepmind.com/blog/tackling-multiple-tasks-with-a-single-visual-language-model
    - Code borrowed w/ love from: https://github.com/lucidrains/flamingo-pytorch
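
Example (an illustrative sketch, not from the original module -- the dims below are hypothetical and
should be read off your checkpoint's config; it also assumes `config.vision_config.embed_dim` matches
`embed_dim`, which the MLP blocks' residual connections require):

    config = IdeficsConfig()  # assumes defaults populate `perceiver_config` & `vision_config`
    resampler = TFIdeficsPerceiverResampler(
        config, embed_dim=1024, depth=3, n_heads=16, head_dim=96, n_latents=64
    )
    frames = tf.random.normal((2, 257, 1024))  # [bsz, seq, embed_dim] vision features
    summary = resampler(frames)                # [2, 64, 1024] -- fixed size, regardless of seq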

    )OptionalN   )
shape_list   )IdeficsConfigc                   ~     e Zd Zdededededededdf fd	Z fd
Zdej                  dej                  fdZ	 xZ
S )TFIdeficsPerceiverResamplerconfig	embed_dimdepthn_headshead_dim	n_latentsreturnNc                    t        	|   di | ||||f\  | _        | _        | _        | _        |j                  j                  | _        t        |j                  d      s| j                  dz  n|j                  j                  dz  | _        g | _        t        |      D ]s  }| j                  j                  t        | j                  | j                  | j                  | j                  d| d      t!        | j                  |d| d      g       u t"        j$                  j&                  j)                  dd	      | _        y
)ao  
        Instantiates a Perceiver Resampler that operates over a sequence of embeddings (say from a ResNet or ViT or
        MAE) of a given dimension, performs `depth` blocks of cross-attention with a fixed `n_latents` inputs, then
        returns a Tensor of shape [bsz, n_latents, embed_dim]. `embed_dim` is the dimensionality of the embeddings
        being fed to the Perceiver Resampler (and also of the latent embeddings it *returns*); it could be e.g. the
        ViT embed_dim, the ResNet pool dim, and so on.

        Args:
            config (`IdeficsConfig`): config object
            embed_dim (`int`): The size of each embedding vector
            depth (`int`): Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
            n_heads (`int`): Number of heads in each Transformer block (for multi-headed self-attention).
            head_dim (`int`): Dimensionality of each head projection in the Transformer block.
            n_latents (`int`):
                Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).

        """
        super().__init__(**kwargs)
        self.embed_dim, self.n_heads, self.head_dim, self.n_latents = embed_dim, n_heads, head_dim, n_latents
        self.qk_layer_norms = config.perceiver_config.qk_layer_norms_perceiver

        # Feed-forward width: fall back to 4 * embed_dim when the vision config does not specify its own embed_dim
        self.intermediate_dim = (
            self.embed_dim * 4
            if not hasattr(config.vision_config, "embed_dim")
            else config.vision_config.embed_dim * 4
        )
        # `depth` pairs of (cross-attention, feed-forward) blocks, each applied with a residual connection in `call`
        self.blocks = []
        for i in range(depth):
            self.blocks.append(
                [
                    TFIdeficsPerceiverAttention(
                        self.embed_dim, self.n_heads, self.head_dim, self.qk_layer_norms, name=f"blocks.{i}.0"
                    ),
                    TFIdeficsMLP(self.intermediate_dim, config, name=f"blocks.{i}.1"),
                ]
            )

        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")

    def build(self, input_shape):
        # Learned latent queries, created here so weight creation follows the standard Keras build lifecycle
        self.latents = self.add_weight(
            shape=(self.n_latents, self.embed_dim), initializer="random_normal", trainable=True, name="latents"
        )
        super().build(input_shape)

    def call(self, context: tf.Tensor) -> tf.Tensor:
        """Resample arbitrary length context & *compress* down to self.n_latents latent embeddings"""
        # Broadcast the learned latents across the batch: [n_latents, embed_dim] -> [bsz, n_latents, embed_dim]
        latents = tf.expand_dims(self.latents, axis=0)
        latents = tf.tile(latents, [tf.shape(context)[0], 1, 1])
        # Alternate cross-attention (latents attend to context) and feed-forward blocks, with residual connections
        for attn, ff in self.blocks:
            latents = attn(context, latents) + latents
            latents = ff(latents) + latents
        return self.layer_norm(latents)


class TFIdeficsPerceiverAttention(tf.keras.layers.Layer):
    def __init__(self, embed_dim: int, n_heads: int, head_dim: int, qk_layer_norms: bool, **kwargs) -> None:
        """Perceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`"""
        super().__init__(**kwargs)
        self.embed_dim, self.n_heads, self.head_dim = embed_dim, n_heads, head_dim
        self.qk_layer_norms = qk_layer_norms
        # Normalization & scaling
        self.context_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="context_layer_norm")
        self.latents_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="latents_layer_norm")
        if self.qk_layer_norms:
            self.q_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="q_layer_norm")
            self.k_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="k_layer_norm")

        self.qk_scale = self.head_dim**-0.5

        # Q, K, V projections without bias, following the Perceiver/Flamingo papers
        self.q_proj = tf.keras.layers.Dense(self.n_heads * self.head_dim, use_bias=False, name="q_proj")
        self.k_proj = tf.keras.layers.Dense(self.n_heads * self.head_dim, use_bias=False, name="k_proj")
        self.v_proj = tf.keras.layers.Dense(self.n_heads * self.head_dim, use_bias=False, name="v_proj")

        self.output_proj = tf.keras.layers.Dense(embed_dim, use_bias=False, name="output_proj")

    def call(self, context: tf.Tensor, latents: tf.Tensor) -> tf.Tensor:
        """
        Runs Perceiver Self-Attention, with special (context, latents) appended along the `seq` dimension!

        Args:
            context (`tf.Tensor`):
                Tensor of shape `[bsz, seq, embed_dim]` representing long-form context to resample.
            latents (`tf.Tensor`):
                Tensor of shape `[bsz, n_latents, embed_dim]` representing fixed length latents to compress to.

        Returns:
            `tf.Tensor`: Tensor of shape `[bsz, n_latents, embed_dim]` representing attention over latents w/ cross
            from context.
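
        Shape sketch (an illustrative addition, not in the original docstring): queries are projected
        from the latents only, while keys/values are projected from `concat([context, latents])`, so
        the per-head attention matrix grows linearly (not quadratically) with `seq`:

            q:      [bsz, n_heads, n_latents, head_dim]
            k, v:   [bsz, n_heads, seq + n_latents, head_dim]
            scores: [bsz, n_heads, n_latents, seq + n_latents]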
        """
        context = self.context_layer_norm(context)
        latents = self.latents_layer_norm(latents)
        batch_size, seq_length, embed_dim = shape_list(context)

        # Queries come from the latents only; keys/values attend over the concatenation of context and latents
        q = self.q_proj(latents)
        k = self.k_proj(tf.concat([context, latents], axis=-2))
        v = self.v_proj(tf.concat([context, latents], axis=-2))

        # Split heads: [bsz, seq, n_heads * head_dim] -> [bsz, n_heads, seq, head_dim]
        q, k, v = [
            tf.transpose(tf.reshape(x, (batch_size, x.shape[1], self.n_heads, self.head_dim)), perm=[0, 2, 1, 3])
            for x in (q, k, v)
        ]

        if self.qk_layer_norms:
            q = self.q_layer_norm(q)
            k = self.k_layer_norm(k)

        # Scaled dot-product attention, with the per-row max subtracted before softmax for numerical stability
        scores = tf.einsum("... i d, ... j d -> ... i j", q * self.qk_scale, k)
        stabilized_scores = scores - tf.reduce_max(scores, axis=-1, keepdims=True)
        attn = tf.nn.softmax(stabilized_scores, axis=-1)

        # Attend to the values, merge heads back, and project to embed_dim
        resampled = tf.einsum("... i j, ... j d -> ... i d", attn, v)
        return self.output_proj(
            tf.reshape(tf.transpose(resampled, perm=[0, 2, 1, 3]), (batch_size, -1, self.n_heads * self.head_dim))
        )


class TFIdeficsMLP(tf.keras.layers.Layer):
    def __init__(self, intermediate_size, config: IdeficsConfig, **kwargs):
        """Simple MLP block with intermediate_size and embedding size"""
        super().__init__(**kwargs)
        self.embed_dim = config.vision_config.embed_dim
        self.ln = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="ln")
        self.fc = tf.keras.layers.Dense(intermediate_size, use_bias=False, name="fc")
        self.act = tf.keras.layers.ReLU(name="act")
        self.c_proj = tf.keras.layers.Dense(self.embed_dim, use_bias=False, name="c_proj")

    def call(self, hidden_states: Optional[tuple[tf.Tensor]]) -> tf.Tensor:
        hidden_states = self.ln(hidden_states)
        hidden_states = self.fc(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.c_proj(hidden_states)
        return hidden_states