
    rh*                    |   d dl Z d dlZd dlmZ d dlmZmZmZ d dlZd dlm	Z	m
Z
 ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZmZmZmZm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1 ddl2m3Z3 ddl4m5Z5m6Z6  e       rddlm7Z7  e1jp                  e9      Z:d Z; G d de
jx                        Z=d Z>d Z? G d de
jx                        Z@ G d de
jx                        ZA G d d e
jx                        ZB G d! d"eA      ZCeAeCd#ZD G d$ d%e
jx                        ZEd& ZF G d' d(e
jx                        ZG G d) d*e
jx                        ZH G d+ d,e      ZI G d- d.e
jx                        ZJ G d/ d0e
jx                        ZKe/ G d1 d2e'             ZL G d3 d4eL      ZM G d5 d6e
jx                        ZN G d7 d8e
jx                        ZO G d9 d:e
jx                        ZPee/ G d; d<e                     ZQ G d= d>e
jx                        ZR G d? d@e
jx                        ZS edA       G dB dCe
jx                               ZT G dD dEe
jx                        ZU G dF dGe
jx                        ZVdH ZWdbdIZXdJej                  dKeYdLej                  fdMZZ	 dcdNe
jx                  dOej                  dPej                  dQej                  dReej                     dSe[dTe[dUe,e.   fdVZ\ G dW dXe
jx                        Z] G dY dZe      Z^e/ G d[ d\e'             Z_ G d] d^e_      Z` G d_ d`e_e      Zag daZby)d    N)	dataclass)CallableOptionalUnion)Tensornn   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_mask)!flash_attn_supports_top_left_maskis_flash_attn_available)GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentionsBaseModelOutputWithPast,BaseModelOutputWithPoolingAndCrossAttentionsCausalLMOutputWithPastModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSModuleUtilsMixinPreTrainedModel find_pruneable_heads_and_indicesget_parameter_dtypeprune_linear_layer)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)check_model_inputs   )EvollaConfigSaProtConfig)_flash_attention_forwardc                     | j                  |      j                         }t        j                  |d      j	                  |      |z  }|j                         |z   S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r&   dim)neinttorchcumsumtype_aslong)	input_idspadding_idxmaskincremental_indicess       }/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/evolla/modeling_evolla.py"create_position_ids_from_input_idsr8   B   sP     <<$((*D,,t3;;DADH##%33    c                   8     e Zd ZdZ fdZ	 	 	 	 ddZd Z xZS )EvollaSaProtEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                    t         |           t        j                  |j                  |j
                  |j                        | _        |j                  r1t        j                  |j
                  |j                        | _        nd | _        t        j                  |j                        | _        t        |dd      | _        | j#                  dt%        j&                  |j(                        j+                  d      d       |j                  | _        | j                   dk(  r;t        j                  |j(                  |j
                  | j,                        | _        |j0                  | _        |j2                  | _        d | _        y )	N)r4   epsposition_embedding_typeabsoluteposition_ids)r&   F
persistent)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsemb_layer_norm_before	LayerNormlayer_norm_eps
layer_normDropouthidden_dropout_probdropoutgetattrr?   register_bufferr/   arangemax_position_embeddingsexpandr4   position_embeddingstoken_dropoutmask_token_idrA   selfconfig	__class__s     r7   rF   zEvollaSaProtEmbeddings.__init__W   s2   !||F,=,=v?Q?Q_e_r_rs'' ll6+=+=6CXCXYDO"DOzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 "..'':5')||..0B0BPTP`P`(D$ $11#11 r9   c                    |*|t        || j                        }n| j                  |      }|| j                  |      }|}| j                  r|j                  || j                  k(  j                  d      d      }d}|j                  d      }|| j                  k(  j                  d      j                         |z  }|d|z
  z  d|z
  d d d d f   z  j                  |j                        }| j                  dk(  r| j                  |      }	||	z   }| j                  | j                  |      }|-||j                  d      z  j                  |j                        }|S )NrB           gQ?r&   r@   )r8   r4   &create_position_ids_from_inputs_embedsrK   rY   masked_fillrZ   	unsqueezesumfloattodtyper?   rX   rO   )
r\   r3   attention_maskrA   inputs_embeds
embeddingsmask_ratio_trainsrc_lengthsmask_ratio_observedrX   s
             r7   forwardzEvollaSaProtEmbeddings.forwardp   s{    $A)TM]M]^#JJ=Y  00;M #
 #//d>P>P1P0[0[\^0_adeJ)(,,R0K#,0B0B#B"G"G"K"Q"Q"SVa"a$,<(<=EXAXZ[]acgZg@hhll  J '':5"&":":<"H#&99J??&4J%$~'?'?'CCGG
HXHXYJ r9   c                    |j                         dd }|d   }t        j                  | j                  dz   || j                  z   dz   t        j                  |j
                        }|j                  d      j                  |      S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        NrB   r&   rg   devicer   )sizer/   rU   r4   r2   rq   rc   rW   )r\   ri   input_shapesequence_lengthrA   s        r7   ra   z=EvollaSaProtEmbeddings.create_position_ids_from_inputs_embeds   s     $((*3B/%a.||q /D4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<r9   )NNNN)__name__
__module____qualname____doc__rF   rn   ra   __classcell__r^   s   @r7   r;   r;   R   s&    !6 /b=r9   r;   c                 b    | j                  dd      \  }}t        j                  | |fd      S )N   rB   r+   )chunkr/   catxx1x2s      r7   rotate_half_esmr      s/    WWQBWFB99rc2YB''r9   c                     |d d d d d | j                   d   d d f   }|d d d d d | j                   d   d d f   }| |z  t        |       |z  z   S )N)shaper   )r   cossins      r7   apply_rotary_pos_emb_esmr      sY    
aMaggbkM1$
%C
aMaggbkM1$
%CG*S011r9   c                        e Zd ZdZdef fdZd	dZdej                  dej                  de	ej                  ej                  f   fdZ
 xZS )
EvollaSaProtRotaryEmbeddingz
    Rotary position embeddings based on those in
    [RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer). Query and keys are transformed by rotation
    matrices which depend on their relative positions.
    r,   c                     t         |           ddt        j                  d|dt        j                        j                         |z  z  z  }|}| j                  d|       d | _        d | _        d | _	        y )N      ?i'  r   r|   rg   inv_freq)
rE   rF   r/   rU   int64re   rT   _seq_len_cached_cos_cached_sin_cached)r\   r,   r   r^   s      r7   rF   z$EvollaSaProtRotaryEmbedding.__init__   sl    %ELLC%++$N$T$T$VY\$\]^Z2#r9   c                 t   |j                   |   }|| j                  k7  s#| j                  j                  |j                  k7  r|| _        t	        j
                  |j                   |   |j                        j                  | j                        }t	        j                  || j                        }t	        j                  ||fd      j                  |j                        }|j                         d d d d d d f   | _        |j                         d d d d d d f   | _        | j                  | j                  fS )Nrq   rB   r+   )r   r   r   rq   r/   rU   r1   r   outerr~   rf   r   r   r   )r\   r   seq_dimensionseq_lentfreqsembs          r7   _update_cos_sin_tablesz2EvollaSaProtRotaryEmbedding._update_cos_sin_tables   s    ''-( d***d.>.>.E.E.Q#*D QWW]3AHHEMMdmm\AKK4==1E))UEN366qxx@C"wwytQ)9:D"wwytQ)9:D!1!111r9   qkreturnc                     | j                  |d      \  | _        | _        t        || j                  | j                        t        || j                  | j                        fS )Nr   )r   )r   r   r   r   )r\   r   r   s      r7   rn   z#EvollaSaProtRotaryEmbedding.forward   s_    -1-H-HZ\-H-]*$* %Q(8(8$:J:JK$Q(8(8$:J:JK
 	
r9   )r|   )ru   rv   rw   rx   r.   rF   r   r/   r   tuplern   ry   rz   s   @r7   r   r      sM    	 C 	 2 
 
%,, 
5u||A[;\ 
r9   r   c                        e Zd Zd
 fd	Z	 	 	 	 	 ddej
                  deej                     deej                     deej                     deej                     dee   de	ej
                     fd	Z
 xZS )EvollaSaProtSelfAttentionc                 b   t         |           || _        |j                  |j                  z  dk7  r2t        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                         | _        |xs t%        |dd      | _        d | _        | j&                  dk(  s| j&                  d	k(  rG|j*                  | _        t        j,                  d
|j*                  z  dz
  | j                        | _        n*| j&                  dk(  rt1        | j                        | _        |j2                  | _        || _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r?   r@   relative_keyrelative_key_queryr|   r&   rotaryr+   )rE   rF   r]   rI   num_attention_headshasattr
ValueErrorr.   attention_head_sizeall_head_sizer   LinearquerykeyvaluerP   attention_probs_dropout_probrR   rS   r?   rotary_embeddingsrV   rG   distance_embeddingr   
is_decoder	layer_idxr\   r]   r?   r   r^   s       r7   rF   z"EvollaSaProtSelfAttention.__init__   s    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ "&''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD#))X5%@TE]E]%^D" ++"r9   hidden_statesrh   	head_maskencoder_hidden_statesencoder_attention_maskoutput_attentionsr   c                    |j                   d   d| j                  | j                  f}| j                  |      j	                  |      j                  dd      }|d u}	|	rc| j                  |      j	                  |      j                  dd      }
| j                  |      j	                  |      j                  dd      }|}n`| j                  |      j	                  |      j                  dd      }
| j                  |      j	                  |      j                  dd      }|| j                  dz  z  }| j                  dk(  r| j                  ||
      \  }}
t        j                  ||
j                  dd            }| j                  dk(  s| j                  d	k(  rF|j                         d   }t        j                  |t        j                  |j                  
      j	                  dd      }t        j                  |t        j                  |j                  
      j	                  dd      }||z
  }| j!                  || j"                  z   dz
        }|j%                  |j&                        }| j                  dk(  rt        j(                  d||      }||z   }nE| j                  d	k(  r6t        j(                  d||      }t        j(                  d|
|      }||z   |z   }|||z   }t*        j,                  j/                  |d      }| j1                  |      }|||z  }t        j                  |j%                  |j&                        |      }|j3                  dddd      j5                         }|j                         d d | j6                  fz   }|j	                  |      }|r||fn|f}| j8                  r|dz   }|S )Nr   rB   r&   r|         r   r   r   r   rp   r   zbhld,lrd->bhlrzbhrd,lrd->bhlrr+   r	   N)r   r   r   r   view	transposer   r   r?   r   r/   matmulrr   rU   r2   rq   r   rV   rf   rg   einsumr   
functionalsoftmaxrR   permute
contiguousr   r   )r\   r   rh   r   r   r   r   hidden_shapequery_layeris_cross_attention	key_layervalue_layerattention_scores
seq_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                            r7   rn   z!EvollaSaProtSelfAttention.forward  s    &++A.D4L4LdNfNfgjj/44\BLLQPQR
 3$>!67<<\JTTUVXYZI**%:;@@NXXYZ\]^K3N/44\BLLQPQRI**]388FPPQRTUVK "D$<$<d$BB''83%)%;%;K%S"K !<<Y5H5HR5PQ''>9T=Y=Y]q=q&++-a0J"\\*EJJ}OcOcdiijlnopN"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s %/.@ --//0@b/I ,,7  -	9O_%7%78I8I%JKX%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2mM]??'Gr9   NNNNNNF)ru   rv   rw   rF   r/   r   r   FloatTensorboolr   rn   ry   rz   s   @r7   r   r      s    #F 7;15=A>B,1O||O !!2!23O E--.	O
  ((9(9:O !)):): ;O $D>O 
u||	Or9   r   c                   $     e Zd Z fdZd Z xZS )EvollaSaProtSelfOutputc                     t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        y r   )	rE   rF   r   r   rI   denserP   rQ   rR   r[   s     r7   rF   zEvollaSaProtSelfOutput.__init__^  sB    YYv1163E3EF
zz&"<"<=r9   c                 T    | j                  |      }| j                  |      }||z   }|S r   r   rR   r\   r   input_tensors      r7   rn   zEvollaSaProtSelfOutput.forwardc  .    

=1]3%4r9   ru   rv   rw   rF   rn   ry   rz   s   @r7   r   r   ]      >
r9   r   c                        e Zd ZdZd fd	Z	 	 	 	 	 ddej                  deej                     deej                     deej                     deej                     dee	   d	e
ej                     f fd
Z xZS )EvollaSaProtFlashAttention2aZ  
    EVOLLA_SA_PROT flash attention module. This module inherits from `EvollaSaProtSelfAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    c                 j    t         |   |||       t               | _        |j                  | _        y )N)r?   r   )rE   rF   r   _flash_attn_uses_top_left_maskr   dropout_probr   s       r7   rF   z$EvollaSaProtFlashAttention2.__init__q  s5    9P\ef
 /P.Q+"??r9   r   rh   r   r   r   r   r   c                    |s||)t         j                  d       t        |   ||||||      S |j	                         \  }}}	| j                  | j                  |            }
| j                  | j                  |            }| j                  | j                  |            }|
j                  }|
j                  j                  dk7  r|
j                  j                  nd}|t        j                  k(  rt        j                         r:t        t        d      rt        j                   |      nt        j"                         }nMt        | j$                  d      r| j$                  j&                  }n | j                  j(                  j                  }t         j                  d| d       |
j+                  |      }
|j+                  |      }|j+                  |      }|
| j,                  dz  z  }
| j.                  d	k(  r| j1                  |
|      \  }
}n7| j.                  d
k(  s| j.                  dk(  rt3        d| j.                   d      t5        |
j7                  dddd      |j7                  dddd      |j7                  dddd      ||| j8                  d| j:                  r| j<                  nd| j>                  	      }|jA                  ||d      }|d f}| j8                  r|dz   }|S )NzEvollaSaProtFlashAttention2 does not support output_attentions, head_mask, or cross_attention. Falling back to the manual attention implementation. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.mpscpuget_autocast_dtype_pre_quantization_dtypezThe input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in .r   r   r   r   z%ESM flash attention does not support z embeddingsr   r|   r&   r	   r   r`   )query_length	is_causalsoftmax_scalerR   use_top_left_maskrB   r   )!loggerwarning_oncerE   rn   rr   transpose_for_scoresr   r   r   rg   rq   typer/   float32is_autocast_enabledr   r   get_autocast_gpu_dtyper]   r   weightrf   r   r?   r   r   r)   r   r   trainingr   r   reshape)r\   r   rh   r   r   r   r   bszq_len_r   r   r   input_dtypedevice_typetarget_dtypeattn_outputr   r^   s                     r7   rn   z#EvollaSaProtFlashAttention2.forwardz  s    	 59N9ZU
 7?%&!  &**,UA//

=0IJ--dhh}.EF	//

=0IJ "''1<1C1C1H1HE1Qk((--W\%--'((* u&:; ,,[9557  &?@#{{BB#zz0066 >$ &..6K!\2I%..6K "D$<$<d$BB''83%)%;%;K%S"K))^;t?[?[_s?sDTEaEaDbbmnoo
 /1a+aAq)1a+oo)-D%%C"AA

 "))#ub9%??'Gr9   r   r   )ru   rv   rw   rx   rF   r/   r   r   r   r   r   rn   ry   rz   s   @r7   r   r   j  s    @ 7;15=A>B,1]||] !!2!23] E--.	]
  ((9(9:] !)):): ;] $D>] 
u||	] ]r9   r   )eagerflash_attention_2c                   :     e Zd Zd fd	Zd Z	 	 	 	 	 	 ddZ xZS )EvollaSaProtAttentionc                     t         |           t        |j                     ||      | _        t        |      | _        t               | _        t        j                  |j                  |j                        | _
        y )N)r   r=   )rE   rF    EVOLLA_SA_PROT_ATTENTION_CLASSES_attn_implementationr\   r   outputsetpruned_headsr   rM   rI   rN   r\   r]   r   r^   s      r7   rF   zEvollaSaProtAttention.__init__  s]    4V5P5PQRXdmn	,V4Ef&8&8f>S>STr9   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r&   r+   )lenr   r\   r   r   r  r   r   r   r   r  r   r   union)r\   headsindexs      r7   prune_headsz!EvollaSaProtAttention.prune_heads  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r9   c                     | j                  |      }| j                  ||||||      }	| j                  |	d   |      }
|
f|	dd  z   }|S )Nrh   r   r   r   r   r   r&   )rM   r\   r  )r\   r   rh   r   r   r   r   cache_positionhidden_states_lnself_outputsattention_outputr   s               r7   rn   zEvollaSaProtAttention.forward  sh      >>-8yy)"7#9/ ! 
  ;;|AF#%QR(88r9   r   NNNNFN)ru   rv   rw   rF   r  rn   ry   rz   s   @r7   r  r    s'    U;* "#r9   r  c                 j    | dz  dt        j                  | t        j                  d      z        z   z  S )zz
    This is the gelu implementation from the original EVOLLA_SA_PROT repo. Using F.gelu yields subtly wrong results.
    g      ?r   g       @)r/   erfmathsqrt)r   s    r7   gelur&    s.     s7cEIIa$))C.&899::r9   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )EvollaSaProtIntermediatec                     t         |           t        j                  |j                  |j
                        | _        y r   )rE   rF   r   r   rI   intermediate_sizer   r[   s     r7   rF   z!EvollaSaProtIntermediate.__init__  s,    YYv1163K3KL
r9   r   r   c                 >    | j                  |      }t        |      }|S r   )r   r&  )r\   r   s     r7   rn   z EvollaSaProtIntermediate.forward  s     

=1]+r9   ru   rv   rw   rF   r/   r   rn   ry   rz   s   @r7   r(  r(    s$    MU\\ ell r9   r(  c                   $     e Zd Z fdZd Z xZS )EvollaSaProtOutputc                     t         |           t        j                  |j                  |j
                        | _        t        j                  |j                        | _	        y r   )
rE   rF   r   r   r*  rI   r   rP   rQ   rR   r[   s     r7   rF   zEvollaSaProtOutput.__init__%  sB    YYv779K9KL
zz&"<"<=r9   c                 T    | j                  |      }| j                  |      }||z   }|S r   r   r   s      r7   rn   zEvollaSaProtOutput.forward*  r   r9   r   rz   s   @r7   r.  r.  $  r   r9   r.  c                   8     e Zd Z fdZ	 	 	 	 	 	 ddZd Z xZS )EvollaSaProtLayerc                    t         |           |j                  | _        d| _        t	        |      | _        |j                  | _        |j                  | _        | j                  r*| j                  st        |  d      t	        |      | _	        t        |      | _        t        |      | _        t        j                  |j                   |j"                        | _        y )Nr&   z> should be used as a decoder model if cross attention is addedr=   )rE   rF   chunk_size_feed_forwardseq_len_dimr  	attentionr   add_cross_attentionRuntimeErrorcrossattentionr(  intermediater.  r  r   rM   rI   rN   r[   s     r7   rF   zEvollaSaProtLayer.__init__2  s    '-'E'E$.v6 ++#)#=#= ##??"dV+i#jkk"7"?D4V<(0f&8&8f>S>STr9   c                 R   | j                  ||||      }|d   }	| j                  r|dd }
n|dd  }
| j                  rA|?t        | d      st        d|  d      | j	                  |	|||||      }|d   }	|
|dd z   }
| j                  |	      }|f|
z   }
| j                  r|
d	z   }
|
S )
N)rh   r   r   r   r&   rB   r9  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r  r   )r6  r   r   AttributeErrorr9  feed_forward_chunk)r\   r   rh   r   r   r   r   r  self_attention_outputsr   r   cross_attention_outputslayer_outputs                r7   rn   zEvollaSaProtLayer.forwardA  s	    "&)/	 "0 "
 2!4 ??,Qr2G,QR0G??4@4!12$=dV D` ` 
 '+&9&9 -#&;'="3 ': '#  7q9 7" ==G../?@/G+ ??'Gr9   c                 n    | j                  |      }| j                  |      }| j                  ||      }|S r   )rM   r:  r  )r\   r   attention_output_lnintermediate_outputr@  s        r7   r=  z$EvollaSaProtLayer.feed_forward_chunkt  s<    "nn-=>"//0CD{{#68HIr9   r!  )ru   rv   rw   rF   rn   r=  ry   rz   s   @r7   r2  r2  1  s(    U$ "#1fr9   r2  c                   @     e Zd Z fdZe	 	 	 	 	 	 	 	 dd       Z xZS )EvollaSaProtEncoderc                 0   t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        t        j                  |j                  |j                        | _        d| _        y c c}w )Nr=   F)rE   rF   r]   r   
ModuleListrangenum_hidden_layersr2  layerrM   rI   rN   emb_layer_norm_aftergradient_checkpointing)r\   r]   r  r^   s      r7   rF   zEvollaSaProtEncoder.__init__|  sn    ]]uVMeMeGf#g!$5f$=#gh
$&LL1C1CI^I^$_!&+# $hs   Bc
           	         |rdnd }
|rdnd }|r| j                   j                  rdnd }t        | j                        D ]U  \  }}|r|
|fz   }
|||   nd } |||||||      }|d   }|s-||d   fz   }| j                   j                  sM||d   fz   }W | j                  r| j	                  |      }|r|
|fz   }
t        ||
||      S )N )r   rh   r   r   r   r   r   r&   r|   last_hidden_stater   
attentionscross_attentions)r]   r7  	enumeraterJ  rK  r   )r\   r   rh   r   r   r   r   output_hidden_statesreturn_dictr  all_hidden_statesall_self_attentionsall_cross_attentionsilayer_modulelayer_head_masklayer_outputss                    r7   rn   zEvollaSaProtEncoder.forward  s    #7BD$5b4%64;;;Z;Zr`d(4 	VOA|#$58H$H!.7.CilO(+-)&;'="3M *!,M &9]1=M<O&O#;;22+?=QRCSBU+U('	V* $$ 55mDM 1]4D D1++*1	
 	
r9   )NNNNFFTN)ru   rv   rw   rF   r#   rn   ry   rz   s   @r7   rE  rE  {  s6    ,  "#"0
 0
r9   rE  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )EvollaSaProtPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )rE   rF   r   r   rI   r   Tanh
activationr[   s     r7   rF   zEvollaSaProtPooler.__init__  s9    YYv1163E3EF
'')r9   r   r   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )r   ra  )r\   r   first_token_tensorpooled_outputs       r7   rn   zEvollaSaProtPooler.forward  s6     +1a40

#566r9   r,  rz   s   @r7   r^  r^    s#    $
U\\ ell r9   r^  c                   (    e Zd ZU eed<   dgZdZd Zy)EvollaSaProtPreTrainedModelr]   r2  Tc                    | j                   j                  }t        |t        j                        rY|j
                  j                  j                  d|       |j                  %|j                  j                  j                          yyt        |t        j                        rf|j
                  j                  j                  d|       |j                  2|j
                  j                  |j                     j                          yyt        |t        j                        rJ|j                  j                  j                          |j
                  j                  j                  d       yy)zInitialize the weightsr`   meanstdNr   )r]   initializer_range
isinstancer   r   r   datanormal_biaszero_rG   r4   rM   fill_)r\   modulerj  s      r7   _init_weightsz)EvollaSaProtPreTrainedModel._init_weights  s    kk++fbii(MM&&CS&9{{&  &&( '-MM&&CS&9!!-""6#5#56<<> .-KK""$MM$$S) .r9   N)ru   rv   rw   r(   __annotations___no_split_modules_supports_flash_attnrs  rN  r9   r7   rf  rf    s    ,-*r9   rf  c                        e Zd Zdef fdZd Zd Zd Ze	 dde	e
j                     de	e
j                     deee
j                     ef   fd	       Z	 dded
ee   de
j"                  de
j$                  def
dZ xZS )EvollaSaProtProteinEncoderr]   c                 d    t         |   |       t        |      | _        t	        |      | _        y r   )rE   rF   r;   rj   rE  encoderr[   s     r7   rF   z#EvollaSaProtProteinEncoder.__init__  s(     08*62r9   c                 .    | j                   j                  S r   rj   rK   r\   s    r7   get_input_embeddingsz/EvollaSaProtProteinEncoder.get_input_embeddings  s    ...r9   c                 &    || j                   _        y r   r|  r\   r   s     r7   set_input_embeddingsz/EvollaSaProtProteinEncoder.set_input_embeddings  s    */'r9   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrz  rJ  r6  r  )r\   heads_to_prunerJ  r  s       r7   _prune_headsz'EvollaSaProtProteinEncoder._prune_heads  sE    
 +002 	CLE5LLu%//;;EB	Cr9   r3   rh   r   c                 N   |j                         }|\  }}|j                  }|t        j                  ||f|      }| j	                  ||      }| j                  ||      }| j                  ||      }	|	d   }
t        |
|	j                  |	j                  |	j                        S )Nr   r3   rh   )rh   r   rO  )rr   rq   r/   onesrj   get_extended_attention_maskrz  r   r   rQ  rR  )r\   r3   rh   rs   
batch_sizer   rq   ri   extended_attention_maskencoder_outputssequence_outputs              r7   rn   z"EvollaSaProtProteinEncoder.forward  s      nn&!,
J!!!"ZZ*j)A6RN)N["&"B"B>S^"_,,}E\,])!,;-)77&11,==	
 	
r9   rs   rq   rg   c                 4   |t        |       }|j                         dk(  r| j                  j                  s|t	        j
                  dt               |j                         dk(  r|dddddddf   }nk|j                         dk(  r<| j                  j                  rt        j                  |||      }n*|ddddddf   }nt        d| d|j                   d      |j                  |      }d	|z
  t        j                  |      j                  z  }|S )
a  
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`Tuple[int]`):
                The shape of the input to the model.

        Returns:
            `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
        Nr|   zNThe `device` argument is deprecated and will be removed in v5 of Transformers.r	   z!Wrong shape for input_ids (shape z) or attention_mask (shape r   r   r   )r   r,   r]   r   warningswarnFutureWarningr   *create_extended_attention_mask_for_decoderr   r   rf   r/   finfomin)r\   rh   rs   rq   rg   r  s         r7   r  z6EvollaSaProtProteinEncoder.get_extended_attention_mask	  s"    ='-E""$)dkk.D.D!dfs
 1$&4Qa]&C#!Q& {{%%*:*e*e+' +9D$9I*J'3K=@[\j\p\p[qqrs  #:"<"<5"<"I#&)@#@EKKPUDVDZDZ"Z&&r9   r   r   )ru   rv   rw   r(   rF   r~  r  r  r#   r   r/   r   r   r   r   rn   r.   rq   re   r  ry   rz   s   @r7   rx  rx    s    3| 3
/0C  26
ELL)
 !.
 
uU\\"$PP	Q	
 
2 rv2'$2'38:2'GL||2'chcncn2'	2'r9   rx  c                   &     e Zd Zd fd	Zd Z xZS )!EvollaSequenceCompressorAttentionc                 j   t         |           |dz  | _        || _        ||z  }t	        j
                  |      | _        t	        j
                  |      | _        t	        j                  ||d      | _	        t	        j                  ||dz  d      | _
        t	        j                  ||d      | _        y )Nr   Fro  r|   )rE   rF   scaler  r   rM   
norm_medianorm_latentsr   to_qto_kvto_out)r\   r,   dim_headr  	inner_dimr^   s        r7   rF   z*EvollaSequenceCompressorAttention.__init__?  s    t^

u$	,,s+LL-IIc959	YYsIM>
ii	3U;r9   c                 F   | j                  |      }| j                  |      }| j                  }| j                  |      }t	        j
                  ||fd      }| j                  |      j                  dd      \  }}|j                  |j                  d      |j                  d      |d      j                  dddd      }|j                  |j                  d      |j                  d      |d      j                  dddd      }|j                  |j                  d      |j                  d      |d      j                  dddd      }|| j                  z  }t	        j                  ||j                  dd            }	|	|	j                  dd	      j                         z
  }	|	j                   \  }
}}}t	        j"                  ||      j%                  |j&                        }|d
d
d
d
d
d
f   }|d
d
d
d
d
d
f   }||z  }|	j)                  d|z
  j+                         d      }	|	j-                  d      }t	        j                  ||      }|j                  dddd      }|j/                  |j                  d      |j                  d      d      }| j1                  |      S )z
        Args:
            x (torch.Tensor): image features
                shape (b, n1, D)
            latent (torch.Tensor): latent features
                shape (b, n2, D);  n2: num of latent tokens
        r   r+   r|   rB   r   r&   r	   Tr,   keepdimNg     )r  r  r  r  r/   r~   r  r}   r   rr   r   r  r   r   amaxdetachr   r  rf   rq   rb   r   r   r  r  )r\   r   latentsr5   hr   kv_inputr   vsimbsnhskdokdr  mask_expones_expattnouts                      r7   rn   z)EvollaSequenceCompressorAttention.forwardL  sB    OOA##G,JJIIg99a\r2zz(#))2 * 
1 FF166!9affQiB/771aCFF166!9affQiB/771aCFF166!9affQiB/771aC

N ll1akk"b12CHHTH299;;99BSzz"c"%%dkk24q()aD()("ooq4xoo/6{{r{"ll4#kk!Q1% kk#((1+sxx{B7{{3r9   )@      r   rz   s   @r7   r  r  >  s    <) r9   r  c                   &     e Zd Zd fd	Zd Z xZS )EvollaFeedForwardc                    t         |           t        ||z        }t        j                  |      | _        t        j                  ||d      | _        t        j                         | _	        t        j                  ||d      | _
        y NFr  )rE   rF   r.   r   rM   normr   fc1GELUra  fc2)r\   r,   multr  r^   s       r7   rF   zEvollaFeedForward.__init__y  s`    d
O	LL%	99S)%8'')99Y%8r9   c           	      ~    | j                  | j                  | j                  | j                  |                        S r   )r  ra  r  r  )r\   r   s     r7   rn   zEvollaFeedForward.forward  s+    xx1(>?@@r9   )   r   rz   s   @r7   r  r  x  s    9Ar9   r  c                   *     e Zd Zdef fdZd Z xZS )!EvollaSequenceCompressorResamplerr]   c           
         t         |           |j                  j                  }|j                  | _        t        j                  t        j                  | j
                  |      d      | _
        t        j                  g       | _        t        |j                        D ]g  }| j                  j                  t        j                  t!        ||j"                  |j$                        t'        ||j(                        g             i t        j*                  |j                        | _        t        j.                  ||j                        | _        y )NT)requires_grad)r,   r  r  )r,   r  )rE   rF   protein_encoder_configrI   resampler_num_latentsnum_latentsr   	Parameterr/   randnr  rG  layersrH  resampler_depthappendr  resampler_dim_headresampler_headsr  resampler_ff_multrM   r  r   protein_projector)r\   r]   protein_repr_dimr  r^   s       r7   rF   z*EvollaSequenceCompressorResampler.__init__  s   !88DD!77||EKK0@0@BR$ScghmmB'v--. 
	AKK9 06;T;T\b\r\r *.>VE]E]^		
	 LL!3!34	!#+;V=O=O!Pr9   c                 j   |j                   d   }|j                   \  }}t        j                  || j                        j	                  |j
                        }t        j                  ||fd      }t        j                  |      j	                  | j                  j
                        }| j                  d    |j                  ddd      z  }|j	                  |j                        }| j                  D ]  \  }	}
 |	|||      |z   } |
|      |z   } | j                  |      }| j                  |      S )Nr   r&   r+   rB   )r   r/   r  r  rf   rq   r~   r  r   rg   r  r  r  )r\   embedsr5   br  r  latent_maskr  r  r  fftransformed_features               r7   rn   z)EvollaSequenceCompressorResampler.forward  s   LLO

AjjT%5%5699$++Fyy$,!4 zz!} 3 34,,t$tyyQ'::**V\\* 	,HD"67D1G;GkG+G	, #44W=yy,--r9   )ru   rv   rw   r'   rF   rn   ry   rz   s   @r7   r  r    s    Q| Q*.r9   r  c                       e Zd ZU dZej
                  ed<   dZeej
                     ed<   dZ	ee
ej
                  df      ed<   dZee
ej
                  df      ed<   y)EvollaProteinEncoderModelOutputNsequence_compressor_outputrP  .r   rQ  )ru   rv   rw   r  r/   r   rt  rP  r   r   r   rQ  rN  r9   r7   r  r    si     59 1 1859x 1 129=AM8E%"3"3S"89:A:>Ju00#567>r9   r  c                   f     e Zd Zdef fdZedej                  dej                  fd       Z	 xZ
S )EvollaProteinEncoderr]   c                 z    t         |           t        |j                        | _        t        |      | _        y )Nr]   )rE   rF   rx  r  modelr  sequence_compressor_resamplerr[   s     r7   rF   zEvollaProteinEncoder.__init__  s.    /v7T7TU
-NV\-]*r9   r3   rh   c                     | j                  ||      }|j                  }| j                  ||      }t        ||j                        S )Nr  )r  rP  )r  rP  r  r  )r\   r3   rh   kwargsprotein_outputprotein_embedssequence_reprs          r7   rn   zEvollaProteinEncoder.forward  sJ    iW'99::>>Z.'4,>>
 	
r9   )ru   rv   rw   r'   rF   r#   r/   
LongTensorr   rn   ry   rz   s   @r7   r  r    s?    ^| ^
 
!1!1 
5CTCT 
 
r9   r  c                   b     e Zd Z	 	 	 ddee   dee   dee   f fdZd Z	 	 	 	 	 	 	 ddZ xZS )	#EvollaSequenceAlignerCrossAttentionprotein_encoder_dimstructure_encoder_dimmsa_encoder_dimc                    t         |           |j                  | _        |j                  | _        | j                  dz  | _        t        | j                  | j                  z        | _        | j                  | j                  z  | _        |j                  }|j                  }|j                  }t        j                  | j                  | j                        | _        |Kt        j                  || j                        | _        t        j                  || j                        | _        nd | _        d | _        |Kt        j                  || j                        | _        t        j                  || j                        | _        nd | _        d | _        |Kt        j                  || j                        | _        t        j                  || j                        | _        nd | _        d | _        t)        | j                        | _        t        j,                  |      | _        t        j                  | j                  | j                  |      | _        t3        | j                  |      | _        t        j6                  t9        j:                  dg            | _        t        j6                  t9        j:                  dg            | _        y )Nr   r  r`   ) rE   rF   rI   r   r  r.   r   r   $aligner_attention_probs_dropout_probaligner_enable_biasaligner_ffn_multr   r   r   key_proteinvalue_proteinkey_structurevalue_structurekey_msa	value_msaEvollaRMSNormattention_normrP   rR   out_projr  r  r  r/   tensorgate_attentiongate_ffw)	r\   r]   r  r  r  r   enable_biasffn_multr^   s	           r7   rF   z,EvollaSequenceAlignerCrossAttention.__init__  s    	!--#)#=#= --t3
#&t'7'7$:R:R'R#S !558P8PP'-'R'R$00**YYt//1C1CD
*!yy)<d>P>PQD!#+>@R@R!SD#D!%D ,!#+@$BTBT!UD#%99-BDDVDV#WD !%D#'D &99_d6H6HIDLYY8J8JKDNDL!DN+D,<,<=zz">?		$"2"2D4D4D;W#D$4$4h? ll5<<+>?U\\3%%89r9   c	                    |||g}	|	D 
cg c]  }
|
|
	 }	}
|	st        d      t        j                  |	d      }	| j                  |      }| j	                  |      }| j
                  @| j                  4|j                  |      }| j                  |      }| j                  |      }nd}d}| j                  @| j                  4|j                  |      }| j                  |      }| j                  |      }nd}d}| j                  @| j                  4|j                  |      }| j                  |      }| j                  |      }nd}d}|||g}|D 
cg c]  }
|
|
	 }}
t        j                  |d      }|||g}|D 
cg c]  }
|
|
	 }}
t        j                  |d      }|j                         dd | j                  | j                  fz   } |j                  | j!                  dddd      }|j                         dd | j                  | j                  fz   } |j                  | j!                  dddd      }|j                         dd | j                  | j                  fz   } |j                  | j!                  dddd      }|| j"                  z  }|Mt        j$                  |j                  d      |j                  d            j                  |j&                        }|ddddddf   |	ddddddf   z  }t        j(                  ||j+                  dd	            }||j-                  dd
      j/                         z
  }|j1                  d|z
  j3                         t        j4                  |j6                        j8                        } t;        j<                  d      |      }t        j(                  ||      }|j!                  dddd      j?                         }|j                         dd	 | j@                  fz   } |j                  | }| jC                  |      }|S c c}
w c c}
w c c}
w )z
        query_states: text
        key_value_states: protein
        query_states: [bs, query_seq_len, dim]
        key_value_states: [bs, kv_seq_len, dim]
        query_attn_mask: [bs, query_seq_len]
        kv_attn_mask: [bs, kv_seq_len]
        Nz=At least one modality should be provided for cross attention.r&   r+   rB   r   r|   r	   r   Tr  )"r   r/   r~   r  r   r  r  rf   r  r  r  r  rr   r   r   r   r   r  r  rq   r   r   r  r  rb   r   r  rg   r  r   Softmaxr   r   r  )r\   query_statesprotein_key_value_statesstructure_key_value_statesmsa_key_value_statesquery_attn_maskprotein_kv_attn_maskstructure_kv_attn_maskmsa_kv_attn_maskkv_attn_maskr  r   key_layer_proteinvalue_layer_proteinkey_layer_structurevalue_layer_structurekey_layer_msavalue_layer_msar   r   new_query_layer_shapenew_key_layer_shapenew_value_layer_shaperh   attn_weightsr   r   r   r   s                                r7   cross_attentionz3EvollaSequenceAlignerCrossAttention.cross_attention  si   * -.DFVW#/Aa1=AA\]]yy15)),7 jj-'D,>,>,J'?'B'B<'P$ $ 0 01I J"&"4"45M"N $"&)d.B.B.N)C)F)F|)T&"&"4"45O"P$($8$89S$T!"&$(!<<#(B#7#:#:<#H  LL)=>M"nn-ABO M"O&(;]K	 );1Q]Q;	;IIiQ/	*,A?S"-?Qq??ii3 + 0 0 23B 7$$$$;
 !
 'k&&(=>FFq!QPQR'nn.s3$$$$7
 
 #INN$78@@Aq!L	 + 0 0 23B 7$$$$;
 !
 'k&&(=>FFq!QPQR!DJJ. "#jj):):1)=|?P?PQR?STWWXdXkXklO(D!T)9:\!TSWYZJZ=[[||K1D1DR1LM#l&7&7B&7&M&T&T&VV'33%%'\5G5G)H)L)L
 -"**,-=> _kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CDm4q BL < @s"   P5P5P:P:P?P?c           
      ^   |z|j                   \  }}}|jt        j                  ||      j                  |	j                        |	j                  ||f      j                  z  j                  |j                        }nd }|z|j                   \  }}}|jt        j                  ||      j                  |	j                        |
j                  ||f      j                  z  j                  |j                        }nd }|z|j                   \  }}}|jt        j                  ||      j                  |	j                        |j                  ||f      j                  z  j                  |j                        }nd }|}||j                         s$||j                         s||j                         rz|}| j                  ||||||||      }t        j                  | j                        |z  }||z   }|}| j                  |      t        j                  | j                        z  }||z   }|S )N)rr   )r  r   r  r  r  r  r  r  )r   r/   r  rf   rq   rW   Tanyr  tanhr  r  r  )r\   r  protein_kv_statesstructure_kv_statesmsa_kv_statesr  r  r  r  protein_batch_maskstructure_batch_maskmsa_batch_maskpast_key_valuer  protein_kv_seq_lenr,   structure_kv_seq_lenmsa_kv_seq_lenr   residuals                       r7   rn   z+EvollaSequenceAlignerCrossAttention.forwardo  sL    (*;*A*A'B"C#+JJr#5699:L:S:ST(//6H"5M/NPPQ"&--. %
 $( *,?,E,E)B$c%-JJr#78;;<N<U<UV*118Lb7Q1RTTU"(//0 '
 &*"$&3&9&9#B'JJr>2556H6O6OP$++."1E+FHHI"]))* !
  $$ */C/G/G/I#/4J4N4N4P).>.B.B.D$H 00*):+>%2 /%9'=!1 1 	M "JJt':':;mKM$}4M$H GGM2UZZ5NNM$}4Mr9   )NNNNNNNNNN)	ru   rv   rw   r   r.   rF   r  rn   ry   rz   s   @r7   r  r    sb     .2/3)-1: &c]1:  (}	1:
 "#1:fnn "#!Gr9   r  RMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )r  c                     t         |           t        j                  t	        j
                  |            | _        || _        y)z<
        EvollaRMSNorm is equivalent to T5LayerNorm
        N)rE   rF   r   r  r/   r  r   variance_epsilon)r\   rI   r>   r^   s      r7   rF   zEvollaRMSNorm.__init__  s1     	ll5::k#:; #r9   c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )Nr|   rB   T)r  )	rg   rf   r/   r   powri  rsqrtr&  r   )r\   r   r  variances       r7   rn   zEvollaRMSNorm.forward  sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r9   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)r   r   r   r&  r}  s    r7   
extra_reprzEvollaRMSNorm.extra_repr  s*    ))*+6$2G2G1HIIr9   )gư>)ru   rv   rw   rF   rn   r,  ry   rz   s   @r7   r  r    s    $;Jr9   r  c                   ^     e Zd Zddef fdZ ej                         ed               Z xZ	S )EvollaRotaryEmbeddingr]   c                    t         |           t        |d      rUt        |j                  t
              r;|j                  j                  d|j                  j                  d            | _        nd| _        |j                  | _	        |j                  | _
        || _        t        | j                     | _        | j                  | j                  |      \  }| _        | j                  d|d       | j                   | _        y )Nrope_scaling	rope_typer   defaultr   FrC   )rE   rF   r   rl  r0  dictgetr1  rV   max_seq_len_cachedoriginal_max_seq_lenr]   r   rope_init_fnattention_scalingrT   r   original_inv_freq)r\   r]   rq   r   r^   s       r7   rF   zEvollaRotaryEmbedding.__init__  s    6>*z&:M:Mt/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q($(ZeD!%r9   c                 b   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        j                  |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   rB   r&   r   r   F)r  enabledr|   r+   r   )r   re   rW   r   rf   rq   rl  r   strr/   autocastr   r~   r   r8  r   rg   )
r\   r   rA   inv_freq_expandedposition_ids_expandedr  r   r   r   r   s
             r7   rn   zEvollaRotaryEmbedding.forward  sV    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk^^UC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s    BF%%F.r   )
ru   rv   rw   r'   rF   r/   no_gradr   rn   ry   rz   s   @r7   r.  r.    s3    /| /" U]]_<  <r9   r.  c                   $     e Zd Z fdZd Z xZS )	EvollaMLPc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _	        t        j                  | j                  | j                  |j                        | _
        t        |j                     | _        y )Nr  )rE   rF   r]   rI   r*  r   r   mlp_bias	gate_projup_proj	down_projr
   
hidden_actact_fnr[   s     r7   rF   zEvollaMLP.__init__  s    !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r9   c                     | j                  | j                  | j                  |            | j                  |      z        }|S r   )rG  rI  rE  rF  )r\   r   rG  s      r7   rn   zEvollaMLP.forward  s6    NN4;;t~~a/@#ADLLQRO#ST	r9   r   rz   s   @r7   rB  rB    s    0r9   rB  c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..NrB   r|   r+   )r   r/   r~   r   s      r7   rotate_halfrL     sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r9   c                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )rc   rL  )r   r   r   r   rA   unsqueeze_dimq_embedk_embeds           r7   apply_rotary_pos_embrQ    sY    ( --
&C
--
&C3w;q>C/0G3w;q>C/0GGr9   r   n_repr   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r&   N)r   rW   r  )r   rR  batchnum_key_value_headsslenhead_dims         r7   	repeat_kvrX  "  so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr9   rr  r   r   r   rh   scalingrR   r  c                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr|   r	   r   rB   )r,   rg   )pr  r&   )rX  num_key_value_groupsr/   r   r   r   r   r   r   r   rf   rg   rR   r  r   )rr  r   r   r   rh   rY  rR   r  
key_statesvalue_statesr  causal_maskr	  s                r7   eager_attention_forwardr`  .  s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r9   c                       e Zd ZdZdedef fdZ	 	 ddej                  de	ej                  ej                  f   de
ej                     de
e   d	e
ej                     d
ee   de	ej                  ej                  f   fdZ xZS )EvollaAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr]   r   c                 d   t         |           || _        || _        t	        |d|j
                  |j                  z        | _        |j                  |j                  z  | _	        | j                  dz  | _
        |j                  | _        d| _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j                  | j                  z  |j
                  |j                        | _        y )NrW  r   Tr  )rE   rF   r]   r   rS   rI   r   rW  rU  r\  rY  attention_dropoutr   r   r   attention_biasq_projk_projv_projo_projr  s      r7   rF   zEvollaAttention.__init__K  sM   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
r9   r   rX   rh   r  r  r  r   c                 4   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        }| j                  j                  dk7  rt        | j                  j                     } || |	|
||f| j                  sdn| j                  | j                   d|\  }} |j"                  g |d j%                         }| j'                  |      }||fS )NrB   r&   r|   )r   r   r  r
  r`   )rR   rY  )r   rW  rf  r   r   rg  rh  rQ  updater   r`  r]   r  r   r  rd  rY  r  r   ri  )r\   r   rX   rh   r  r  r  rs   r   r  r]  r^  r   r   cache_kwargsattention_interfacer	  r  s                     r7   rn   zEvollaAttention.forwardb  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j%#&snUL'5'<'<ZW[WeWegs't$J(?;;++w6"9$++:Z:Z"[$7	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r9   r   )ru   rv   rw   rx   r'   r.   rF   r/   r   r   r   r   r  r    r!   rn   ry   rz   s   @r7   rb  rb  H  s    G
| 
 
8 +/59))||)) #5<<#=>)) !.	))
 !)) !!1!12)) +,)) 
u||U\\)	*))r9   rb  c                        e Zd Zdedef fdZ	 	 	 	 	 	 	 	 	 	 	 	 ddej                  deej                  ej                  f   de	ej                     de	ej                     de	e   d	e	e   d
e	ej                     de	ej                     de	ej                     de	ej                     de	ej                     de	ej                     de	ej                     de	ej                     deej                     fdZ xZS )EvollaDecoderLayerr]   r   c                    t         |           |j                  | _        t        ||      | _        t        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _
        |dz   t        |j                  |j                  z  d      z  dk(  rt        ||j                        | _        y y )Nr]   r   r=   r&   r   )r  )rE   rF   rI   rb  	self_attnrB  mlpr  rms_norm_epsinput_layernormpost_attention_layernormmaxrI  aligner_num_add_layersr  adapterr  s      r7   rF   zEvollaDecoderLayer.__init__  s    !--()LV$,V-?-?VEXEXY(5f6H6HfNaNa(b%MS!9!9V=Z=Z!Z\]^^bcc>$*$6$6DL dr9   r   rX   rh   rA   r  	use_cacher  r  r  r  r  r  r  r  r   c                    |}| j                  |      } | j                  d|||||||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }t	        | d      r| j                  |||	|
||||      }|S )N)r   rh   rA   r  rz  r  rX   ry  )r  r  r  r  r  r  r  r  rN  )ru  rr  rv  rs  r   ry  )r\   r   rX   rh   rA   r  rz  r  r  r  r  r  r  r  r  r  r!  r  s                     r7   rn   zEvollaDecoderLayer.forward  s    $ !,,]; *4>> 	
')%)) 3	
 	
q !=0 !55mD/ =04# LL*"3$7+ /#5%9- ) 	M r9   )NNNFNNNNNNNN)ru   rv   rw   r'   r.   rF   r/   r   r   r   r  r   r   rn   ry   rz   s   @r7   ro  ro    sZ   |  & 2637*.$)59486:04597;15265||5 #5<<#=>5 !.	5
 u//05 !5 D>5 !!1!125 $ELL15 &ell35  -5 %U\\25 'u||45 !.5 "%,,/5" 
u||	#5r9   ro  c                   ^     e Zd ZU eed<   dZdZg dZdgZdZ	dZ
dZdZdZeedZ fdZ xZS )	EvollaPreTrainedModelr]   r  T)ro  r  r  past_key_valuesF)r   rQ  c                    | j                   j                  }t        |   |       t	        |t
              rd|j                  j                          |j                  j                          |j                  j                  j                  j                  d       y t	        |t              r(|j                  j                  j                  d|       y y )Nr   r`   rh  )r]   rk  rE   rs  rl  r  r  rp  r  r  r   rm  rq  r  r  rn  )r\   rr  rj  r^   s      r7   rs  z#EvollaPreTrainedModel._init_weights  s    kk++f%fAB!!'')OO!!#!!((--33C8 ABNN''Sc': Cr9   )ru   rv   rw   r'   rt  base_model_prefixsupports_gradient_checkpointingru  _skip_keys_device_placementrv  _supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendro  rb  _can_record_outputsrs  ry   rz   s   @r7   r}  r}    s]    &*#
 $5"5N!"'+%
; ;r9   r}  c            !           e Zd Zdef fdZd Zd Zee	 	 	 	 	 	 	 	 	 	 	 	 	 dde	j                  dee	j                     dee	j                     dee   d	ee	j                     d
ee   dee	j                     dee	j                     dee	j                     dee	j                     dee	j                     dee	j                     dee	j                     deeef   fd              Z xZS )EvollaModelr]   c           	      F   t         |   |       |j                  | _        |j                  | _        t        j                  | j                  |j                  | j                        | _        t        |      | _
        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t!        |j                  |j"                        | _        t'        |      | _        t+        |dd      | _        | j/                          y c c}w )Nr  rq  r=   rL  F)rE   rF   rJ   r4   rH   r   rG   rI   embed_tokensr  protein_encoderrG  rH  rI  ro  r  r  rt  r  r.  
rotary_embrS   rL  	post_initr  s      r7   rF   zEvollaModel.__init__  s     !.. ++LL&:L:LdN^N^_36Bmm "'v'?'?!@
 	 #!'
 "&"4"4&:M:MN	/v>&-f6NPU&V#s   $Dc                     | j                   S r   r  r}  s    r7   r~  z EvollaModel.get_input_embeddings  s       r9   c                     || _         y r   r  r  s     r7   r  z EvollaModel.set_input_embeddings  s
    !r9   r3   rh   rA   r~  ri   rz  r  protein_input_idsprotein_attention_maskstructure_feats	msa_featsr  r  r   c                    |du |duz  rt        d      || j                  |      }|r|
t               }|F||j                         nd}t	        j
                  |||j                  d   z   |j                        }||j                  d      }d}d}|S|	Q| j                  ||	      }|j                  }t	        j                  dg|j                  d   z  |j                        }t        | j                  ||||      }|}| j                  ||      }| j                  D ]  } ||f||||||||
|||||d	|} | j!                  |      }t#        ||
      }|S )a;  
        protein_input_ids (torch.LongTensor):
            The input IDs for the protein sequence in structure-aware tokens. Should be of shape `(batch_size, protein_seq_length)` and type `torch.LongTensor`.
        protein_attention_mask (torch.Tensor):
            The attention mask for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.Tensor`.
        structure_feats (torch.FloatTensor):
            The input IDs for purely structure-based features. Should be of shape `(batch_size, structure_seq_length, structure_feat_dim)` and type `torch.FloatTensor`. Dummy input for now.
        msa_feats (torch.FloatTensor):
            The input IDs for purely MSA-based features. Should be of shape `(batch_size, msa_seq_length, msa_feat_dim)` and type `torch.FloatTensor`. Dummy input for now.
        structure_batch_mask (torch.Tensor):
            The batch mask to decide which protein sequences are purely structure-based. Should be of shape `(batch_size)` and type `torch.Tensor`. Should be paired with `structure_feats`. Dummpy input for now.
        msa_batch_mask (torch.Tensor):
            The batch mask to decide which protein sequences are purely MSA-based. Should be of shape `(batch_size)` and type `torch.Tensor`. Should be paired with `msa_feats`. Dummpy input for now.
        Nz:You must specify exactly one of input_ids or inputs_embedsr   r&   r   r  T)r]   input_embedsrh   r  r~  )rh   rA   r  rz  r  rX   r  r  r  r  r  r  r  )rP  r~  )r   r  r   get_seq_lengthr/   rU   r   rq   rc   r  r  r  r   r]   r  r  r  r   )r\   r3   rh   rA   r~  ri   rz  r  r  r  r  r  r  r  r  past_seen_tokensprotein_featsr  protein_outputsr_  r   rX   decoder_layerr  s                           r7   rn   zEvollaModel.forward  s   B -t";<YZZ  --i8M0*nO!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L!(-C-O"22+5 3 O ,FFM!&tf7H7N7Nq7Q.QZkZrZr!s(;;&))+
 & #oom\J![[ 	M)*).#-$7"/$3'#5%9- . M	& 		-0(++
 r9   )NNNNNNNNNNNNN)ru   rv   rw   r'   rF   r~  r  r"   r%   r/   r  r   r   r   r   r   r   r   r   rn   ry   rz   s   @r7   r  r    sw   | *!"  '+1537+/59$(598<9=7;157;15b##b !.b u//0	b
 "%b   1 12b D>b !!1!12b $E$4$45b !) 6b "%"3"34b E--.b 'u||4b !.b  
u--	.!b  br9   r  c                       e Zd Z fdZd Zd Zee	 	 	 	 	 	 	 ddej                  de
ej                     de
ej                     de
ej                     dej                  d	e
ej                     d
e
e   fd              Z xZS )EvollaForProteinText2Textc                     t         |   |       t        |      | _        |j                  | _        t        j                  |j                  | j                  d      | _        | j                          y r  )
rE   rF   r  r  rH   r   r   rI   lm_headr  r[   s     r7   rF   z"EvollaForProteinText2Text.__init__{  sQ      (
 ++yy!3!3T__5Qr9   c                 6    | j                   j                         S r   )r  r~  r}  s    r7   r~  z.EvollaForProteinText2Text.get_input_embeddings  s    zz..00r9   c                 8    | j                   j                  |      S r   )r  r  r  s     r7   r  z.EvollaForProteinText2Text.set_input_embeddings  s    zz..u55r9   r3   rh   ri   labelsr  r  rz  c           
          | j                   d||||||d|}	|	d   }
| j                  |
      }d}|  | j                  d||| j                  d|}t	        |||	j
                  |	j                  |	j                        }|S )a,  
        protein_input_ids (torch.LongTensor):
            The input IDs for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.LongTensor`.
        protein_attention_mask (torch.Tensor):
            The attention mask for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.Tensor`.

        Example:

        ```python
        >>> from transformers import EvollaProcessor, EvollaForProteinText2Text
        >>> model = EvollaForProteinText2Text.from_pretrained("westlake/Evolla-10B-hf")
        >>> processor = EvollaProcessor.from_pretrained("westlake/Evolla-10B-hf")

        >>> protein_information = {
            "aa_seq": "your amino acid sequence",
            "foldseek": "your foldseek sequence",
        }
        >>> question = "What is the function of this protein?"
        >>> message = [
            {"role": "system", "content": "You are an AI expert that can answer any questions about protein."},
            {"role": "user", "content": question},
        ]

        >>> inputs = processor(proteins=[protein_information], messages_list=[message], return_tensors="pt", padding="longest")
        >>> outputs = model.generate(**inputs)

        >>> print(processor.batch_decode(outputs, skip_special_tokens=True))
        ```)r3   rh   ri   r  r  rz  r   N)logitsr  rH   )lossr  r~  r   rQ  rN  )r  r  loss_functionrH   r   r~  r   rQ  )r\   r3   rh   ri   r  r  r  rz  r  r   r   r  r  
lm_outputss                 r7   rn   z!EvollaForProteinText2Text.forward  s    T $** 
)'/#9
 
  
m,%4%%iVFtibhiD+#33!//))

 r9   r"  )ru   rv   rw   rF   r~  r  r#   r"   r/   r  r   r   r   r   rn   ry   rz   s   @r7   r  r  z  s    16  '+1559-1.29=$(?##? !.?   1 12	?
 ))*? !++? !) 6? D>?  ?r9   r  )r  r  r}  )Nr&   )r`   )cr$  r  dataclassesr   typingr   r   r   r/   r   r   activationsr
   cache_utilsr   r   
generationr   integrationsr   masking_utilsr   modeling_flash_attention_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   r   r   r   r   processing_utilsr    utilsr!   r"   r#   r$   utils.genericr%   configuration_evollar'   r(   r)   
get_loggerru   r   r8   Moduler;   r   r   r   r   r   r   r  r  r&  r(  r.  r2  rE  r^  rf  rx  r  r  r  r  r  r  r  r.  rB  rL  rQ  r.   rX  re   r`  rb  ro  r}  r  r  __all__rN  r9   r7   <module>r     sS  ,   ! , ,   ! . ) 7 / h 9  L  ' R R / < J 
		H	%4 ^=RYY ^=B(
2(
")) (
Vp		 pf
RYY 
m"; mb '4$  /BII /d;ryy 
 
G2 GT9
")) 9
x  */ * **_'!< _'D7 		 7 tA		 A'.		 '.T ?k ?  ?
299 
$k")) k\ Y'JBII J (J(<BII <D		  (6	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % % '(%4C)bii C)LE3 EP ;O ; ;@@' @FP 5 Pf Pr9   