
    rh                     B   d dl Z d dlmZmZmZ d dlZd dlmZ d dlmZm	Z	m
Z
 ddlmZ ddlmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z& ddl'm(Z(  e%       rddl)m*Z*m+Z+  e&jX                  e-      Z. G d dej^                        Z0 G d dej^                        Z1	 	 	 d1dejd                  dejf                  dejf                  dejf                  deejf                     dee4   de4deejf                     fdZ5 G d d ejd                        Z6 G d! d"e      Z7e$ G d# d$e             Z8e$ G d% d&e8             Z9 e$d'(       G d) d*e8e             Z:e$ G d+ d,e8             Z; e$d-(       G d. d/e8             Z<g d0Z=y)2    N)CallableOptionalUnion)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)FlashAttentionKwargs)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions SequenceClassifierOutputWithPastTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringis_torch_flex_attn_availablelogging   )BioGptConfig)	BlockMaskmake_flex_block_causal_maskc                   x     e Zd ZdZdedef fdZ	 	 d	dej                  dedeej                     f fdZ	 xZ
S )
 BioGptLearnedPositionalEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    num_embeddingsembedding_dimc                 N    d| _         t        | 	  || j                   z   |       y )N   )offsetsuper__init__)selfr"   r#   	__class__s      }/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/biogpt/modeling_biogpt.pyr(   z)BioGptLearnedPositionalEmbedding.__init__;   s$     $++5}E    attention_maskpast_key_values_lengthposition_idsc                     |8t        j                  |d      }||z  dz
  j                         }|dd|df   }t        |   || j
                  z         S )z3`input_ids_shape` is expected to be [bsz x seqlen].Nr   dim)torchcumsumlongr'   forwardr&   )r)   r-   r.   r/   r*   s       r+   r6   z(BioGptLearnedPositionalEmbedding.forwardA   s^      <<A>L(>9A=CCEL'+A+B(BCLw|dkk9::r,   )r   N)__name__
__module____qualname____doc__intr(   r3   
LongTensorr   r6   __classcell__r*   s   @r+   r!   r!   6   s]    Fs F3 F '(37	;((; !$; u//0	; ;r,   r!   c            
       `     e Zd ZdZd	dedededee   f fdZdej                  f fdZ
 xZS )
BioGptScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    r"   r#   padding_idxembed_scalec                 6    t         |   |||       || _        y N)r'   r(   rB   )r)   r"   r#   rA   rB   r*   s        r+   r(   z"BioGptScaledWordEmbedding.__init__W   s    D&r,   	input_idsc                 <    t         |   |      | j                  z  S rD   )r'   r6   rB   )r)   rE   r*   s     r+   r6   z!BioGptScaledWordEmbedding.forward[   s    wy)D,<,<<<r,   )      ?)r7   r8   r9   r:   r;   r   floatr(   r3   Tensorr6   r=   r>   s   @r+   r@   r@   R   sE    's '3 'S '_ghm_n '= = =r,   r@   modulequerykeyvaluer-   scalingdropout	head_maskc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }	||	|z   }	t        j
                  j                  |	d      }	||	|j                  dddd      z  }	t        j
                  j                  |	|| j                        }	t        j                  |	|      }
|
j                  dd      j                         }
|
|	fS )N      r%   r	   r1   r   ptraining)sizer3   matmul	transposenn
functionalsoftmaxviewrO   rV   
contiguous)rJ   rK   rL   rM   r-   rN   rO   rP   kwargsattn_weightsattn_outputs              r+   eager_attention_forwardrb   _   s     **R.D(<<s}}Q':;gEL!#n4==((2(>L#innQAq&AA==((6??([L,,|U3K''1-88:K$$r,   c                   z    e Zd ZdZ	 	 	 	 	 	 ddededededededee   d	ee   f fd
Z		 	 	 	 	 	 dde
j                  dee
j                     dee   dee
j                     dee
j                     dedee
j                     dee   dee
j                  ee
j                     eee
j                        f   fdZ xZS )BioGptAttentionz=Multi-headed attention from 'Attention Is All You Need' paper	embed_dim	num_headsrO   
is_decoderbias	is_causalconfig	layer_idxc	                    t         	|           || _        || _        || _        ||z  | _        || _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        || _	        || _
        || _        |9| j                  r-t        j                  d| j                  j                   d       t!        j"                  |||      | _        t!        j"                  |||      | _        t!        j"                  |||      | _        t!        j"                  |||      | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).rS   zInstantiating a decoder z without passing `layer_idx` is not recommended and will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.rh   )r'   r(   re   rf   rO   head_dimrj   
ValueErrorrN   rg   ri   rk   loggerwarning_oncer*   r7   rZ   Lineark_projv_projq_projout_proj)
r)   re   rf   rO   rg   rh   ri   rj   rk   r*   s
            r+   r(   zBioGptAttention.__init__   s$    	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$""*4>>+B+B*C D, , ii	94@ii	94@ii	94@		)YTBr,   hidden_stateskey_value_statespast_key_valuer-   layer_head_maskoutput_attentionscache_positionr_   returnc                    |du}	|j                   dd \  }
}|	r|j                   d   n|}|
|d| j                  f}|
|d| j                  f} | j                  |      j                  | j	                  dd      }|St        |t              rA|j                  j                  | j                        }|	r|j                  }n|j                  }n|}|	r|n|}|	rK|IrGj                  | j                     j                  }|j                  | j                     j                  }n| j                  |      }| j!                  |      } |j                  | j	                  dd      } |j                  | j	                  dd      }|D|	s|nd}j#                  ||| j                  d|i      \  }}|	rd|j                  | j                  <   t$        }| j&                  j(                  dk7  rt*        | j&                  j(                     } || ||||f| j,                  sdn| j.                  | j0                  ||d	|\  }}|j3                  |
|d      j5                         }| j7                  |      }||fS )
z#Input shape: Batch x Time x ChannelNrR   r   r%   r|   Teager        )rO   rN   r{   rP   )shapern   ru   r]   rY   
isinstancer   
is_updatedgetrk   cross_attention_cacheself_attention_cachelayerskeysvaluesrs   rt   updaterb   rj   _attn_implementationr   rV   rO   rN   reshaper^   rv   )r)   rw   rx   ry   r-   rz   r{   r|   r_   is_cross_attentionbsztgt_lensrc_lenq_input_shapekv_input_shapequery_statesr   curr_past_key_valuecurrent_states
key_statesvalue_statesattention_interfacera   r`   s                           r+   r6   zBioGptAttention.forward   sy   " .T9 %**3B/W/A"((+wgr4==9wDMM: 7t{{=166FPPQRTUV%.*=>+66::4>>J
%*8*N*N'*8*M*M'&4#-?)]."<,33DNNCHHJ.55dnnELLL^4J;;~6L(.9CCAqIJ,<,,n=GG1ML)7It+>+E+Ednn?OQ_>`,(
L &@DN--dnn=(?;;++w6"9$++:Z:Z"[$7%
  $}}C$,,LL/%%
 %
!\ "))#w;FFHmmK0L((r,   )r   FTFNN)NNNNFN)r7   r8   r9   r:   r;   rH   boolr   r   r(   r3   rI   r   r   r   tupler6   r=   r>   s   @r+   rd   rd   }   sT   G  )-#'%C%C %C 	%C
 %C %C %C &%C C=%CT 48*.1526"'15Q)||Q) #5<<0Q) !	Q)
 !.Q) "%,,/Q)  Q) !.Q) -.Q) 
u||Xell3XeELL>Q5RR	SQ)r,   rd   c                   Z    e Zd Zddedee   f fdZ	 	 	 	 	 	 	 ddej                  deej                     deej                     dee	   dee
   d	ee
   d
eej                     deej                     dee   deej                  eeej                  ej                  f      f   fdZ xZS )BioGptDecoderLayerrj   rk   c           	      n   t         |           |j                  | _        t	        | j                  |j
                  |j                  dd||      | _        |j                  | _	        t        |j                     | _        |j                  | _        t        j                  | j                        | _        t        j"                  | j                  |j$                        | _        t        j"                  |j$                  | j                        | _        t        j                  | j                        | _        y )NT)re   rf   rO   rg   ri   rj   rk   )r'   r(   hidden_sizere   rd   num_attention_headsattention_probs_dropout_prob	self_attnhidden_dropout_probrO   r
   
hidden_actactivation_fnactivation_dropoutrZ   	LayerNormself_attn_layer_normrr   intermediate_sizefc1fc2final_layer_norm)r)   rj   rk   r*   s      r+   r(   zBioGptDecoderLayer.__init__   s    ++(nn0077
 11#F$5$56"(";";$&LL$@!99T^^V-E-EF99V55t~~F "T^^ <r,   rw   r-   rz   ry   r{   	use_cacher/   r|   r_   r}   c	                 `   |}
| j                  |      } | j                  d|||||||d|	\  }}t        j                  j	                  || j                  | j
                        }|
|z   }|}
| j                  |      }| j                  |      }| j                  |      }t        j                  j	                  || j                  | j
                        }| j                  |      }t        j                  j	                  || j                  | j
                        }|
|z   }|f}|r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence. It is used to update the
                cache in the correct position and to infer the complete sequence length.
        )rw   ry   r-   rz   r{   r/   r|   rT    )r   r   rZ   r[   rO   rV   r   r   r   r   r   )r)   rw   r-   rz   ry   r{   r   r/   r|   r_   residualself_attn_weightsoutputss                r+   r6   zBioGptDecoderLayer.forward  s@   < !11-@ ,:4>> 	,
'))+/%)	,
 	,
(( --mt||VZVcVc-d =0 !--m</**=9--mt?V?Vaeanan-o/--mt||VZVcVc-d =0 ")++Gr,   rD   )NNNFTNN)r7   r8   r9   r   r   r;   r(   r3   rI   r   r   r<   r   r   r   FloatTensorr6   r=   r>   s   @r+   r   r      s    =| = =4 2626*.,1$(3715?||? !.? "%,,/	?
 !? $D>? D>? u//0? !.? +,? 
u  (51B1BEDUDU1U+V"WW	X?r,   r   c                       e Zd ZU eed<   dZdZdZdZdZ	dZ
deeej                  df      dej                  dej                  defd	Zedej                  d
ededej&                  dej                  defd       Zy)BioGptPreTrainedModelrj   biogptTr-   r   input_tensorr|   past_key_valuesc           	         | j                   j                  dk(  rqt        |t        j                        rt        |      }|S |Ft        t        j                  |j                  d   |j                  d   f|j                              }|S | j                   j                  dk(  r||dk(  j                         r|S y ||j                         nd}||j                  nd}| j                   j                  dk(  r&|s$t        j                  |||| j                  	      ry |j                  }|j                  d   }|r|j!                         }	n1t        |t        j                        r|j                  d
   n||z   dz   }	| j#                  |||	|||j                  d         }
| j                   j                  dk(  rO|M|j                  j$                  dv r5t        j&                  |      j(                  }t        j*                  |
|      }
|
S )Nflex_attentionr   r   )rW   deviceflash_attention_2r   Fsdpa)inputs_embedsr.   is_trainingrR   )sequence_lengthtarget_lengthdtyper|   
batch_size)cudaxpunpu)rj   r   r   r3   rI   r   onesr   r   anyget_seq_lengthis_compileabler   _ignore_causal_mask_sdparV   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positiontypefinfomin_unmask_unattended)r)   r-   r   r|   r   past_seen_tokensusing_compilable_cacher   r   r   causal_mask	min_dtypes               r+   _update_causal_maskz)BioGptPreTrainedModel._update_causal_maska  s    ;;++/??.%,,7!<^!L "!  '!<JJ*003\5G5G5JK-44" "!;;++/BB)~/D.I.I.K%%
 @O?Z?99;`aCRC^!?!?di ;;++v5>T%>>*'7 MM	 ""&,,Q/!+??AM nell; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD
 E*..I0CCKQZ[Kr,   r   r   r   r   c                    | | j                         dk(  r| }|S t        j                  |      j                  }t        j                  ||f|||j
                        }|dk7  rt        j                  |d      }|t        j                  ||j
                        |j                  dd      kD  z  }|ddddddf   j                  |ddd      }| |j                         }| j                  d   }	|ddddddd|	f   | ddddddf   j                  |j
                        z   }
|
dk(  }
|ddddddd|	f   j                  |
|      |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        N   )
fill_valuer   r   r   )diagonalr   rR   r   )r2   r3   r   r   fullr   triuaranger   expandcloner   tomasked_fill)r-   r   r   r   r|   r   r_   r   r   mask_lengthpadding_masks              r+   r   zKBioGptPreTrainedModel._prepare_4d_causal_attention_mask_with_cache_position  s   > %.*<*<*>!*C(K* ' E*..I** -0Ye\j\q\qK !##jjqA5<<n>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c )6Aq!\k\12 r,   N)r7   r8   r9   r   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphr   r   r3   rI   r   r   staticmethodr;   r   r   r   r,   r+   r   r   U  s     &*#N!J u||['@!ABJ llJ 	J
 JX 444 4 {{	4
 4 4 4r,   r   c                   l    e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     dee
e
ej                           dee   d	eej                     d
ee   dee   dee   deej                     dee   dee
ef   fd       Z xZS )BioGptModelrj   c           	         t         |   |       || _        |j                  | _        |j                  | _        |j                  | _        |j                  | _	        |j                  rt        j                  |j                        nd}t        |j                  | j                  | j                  |      | _        t!        |j"                  | j                        | _        t'        j(                  t+        |j,                        D cg c]  }t/        ||       c}      | _        t'        j2                  | j                        | _        d| _        | j9                          y c c}w )NrG   )rB   )rk   F)r'   r(   rj   	layerdropr   rO   r   re   pad_token_idrA   scale_embeddingmathsqrtr@   
vocab_sizeembed_tokensr!   max_position_embeddingsembed_positionsrZ   
ModuleListrangenum_hidden_layersr   r   r   
layer_normgradient_checkpointing	post_init)r)   rj   rB   ir*   s       r+   r(   zBioGptModel.__init__  s    ))11++!..7=7M7Mdii 2 23SV5t~~t/?/?[
  @@^@^`d`n`nommV[\b\t\tVu$vQR%7!%L$vw,,t~~6&+# %ws   E"rE   r-   rP   r   r   r   r/   r{   output_hidden_statesreturn_dictr|   r_   r}   c                 6   ||n| j                   j                  }|	|	n| j                   j                  }	||n| j                   j                  }|
|
n| j                   j                  }
|d u |d uz  rt        d      |$|}|j                  }|j                  d|d         }n-| |j                         d d }|d d d d df   }nt        d      || j                  |      }| j                  r%| j                  r|rt        j                  d       d}d}|r<t        |t              s,d}t        j                  d       t!        j"                  |      }|j                         d d \  }}||j%                         nd}|%t'        j(                  |||z   |j*                  	      }|'||z   }t'        j,                  |||j*                  	      }t        |t               r|j.                  n|}| j1                  ||||      }|8t'        j2                  |d
      }||z  d
z
  j5                         }|d d |d f   }| j7                  |||      }||z   }t8        j:                  j=                  || j<                  | j                        }| j                  r%| j                  r|rt        j                  d       d}|	rdnd }|rdnd }d }t?        | j@                        D ]g  \  }}|	r||fz  }| j                  r%t'        jB                  g       }|| jD                  k  r? ||f||||   nd |||||d|}|d   }|s_||d
   fz  }i |	r||fz  }| jG                  |      }|r|jI                         }|
stK        d |||||fD              S tM        |||||      S )NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timerR   zEYou have to specify either decoder_input_ids or decoder_inputs_embedsz[`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`...FTzPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   r   r   r1   )r/   rT   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...r   )r-   rz   ry   r{   r   r/   r|   c              3   $   K   | ]  }|| 
 y wrD   r   ).0vs     r+   	<genexpr>z&BioGptModel.forward.<locals>.<genexpr>  s      = s   )last_hidden_stater   rw   
attentionscross_attentions)'rj   r{   r   r   use_return_dictro   r   r]   rW   r   r   rV   rp   rq   r   r   r   from_legacy_cacher   r3   r   r   r   r   r   r4   r5   r   rZ   r[   rO   	enumerater   randr   r   to_legacy_cacher   r   )r)   rE   r-   rP   r   r   r   r/   r{   r   r   r|   r_   inputinput_shapereturn_legacy_cacher   
seq_lengthr.   mask_seq_lengthself_attn_cacher   	positionsrw   all_hidden_statesall_self_attnsall_cross_attentionsidxdecoder_layerdropout_probabilitylayer_outputss                                  r+   r6   zBioGptModel.forward  s      2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] -t";<stt"E++K!r;r?;I&',,.s3K!!Q(+Edee  --e4M&&4==##q "	 $Z?"&\
 2CCOTO!.!3!3!5cr!:
JETE`!?!?!Afg!"\\&(>(KTaThThN !4zAO"ZZ
OML`L`aN /+>? 00  	 ..	
  <<A>L(>9A=CCEL'+A+B(BCL((9O^j(k	%	1--mt||VZVcVc-d&&4==##p "	"6BD0d#"+DKK"8 	6C#!m%55!}}&+jjn#&7)
*3<3H3d."3#)-
 
M *!,M =#3"551	66  -!116-==?O ':K^]qr  
 9+++%1
 	
r,   )NNNNNNNNNNN)r7   r8   r9   r   r(   r   r   r3   r<   r   r   rI   r   r   r   r   r   r6   r=   r>   s   @r+   r   r     sI   | *  156:1559@D$(37,0/3&*15W
E,,-W
 !!2!23W
 E--.	W

   1 12W
 "%ell(;"<=W
 D>W
 u//0W
 $D>W
 'tnW
 d^W
 !.W
 +,W
 
u??	@W
 W
r,   r   zR
    BioGPT Model with a `language modeling` head on top for CLM fine-tuning.
    )custom_introc                        e Zd ZdgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee	j                     d	eeee	j                           d
ee	j                     dee   dee	j                     dee   dee   dee   dee	j                     dee   deeef   fd       Z xZS )BioGptForCausalLMzoutput_projection.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y NFrm   )
r'   r(   r   r   rZ   rr   r   r   output_projectionr   r)   rj   r*   s     r+   r(   zBioGptForCausalLM.__init__  sJ     !&)!#6+=+=v?P?PW\!] 	r,   c                     | j                   S rD   r!  r)   s    r+   get_output_embeddingsz'BioGptForCausalLM.get_output_embeddings  s    %%%r,   c                     || _         y rD   r$  )r)   new_embeddingss     r+   set_output_embeddingsz'BioGptForCausalLM.set_output_embeddings  s
    !/r,   rE   r-   rP   r   r   labelsr   r/   r{   r   r   r|   r_   r}   c                    ||n| j                   j                  } | j                  |f|||||||	|
||d
|}|d   }| j                  |      }d}|* | j                  ||fd| j                   j
                  i|}|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        N)
r-   rP   r   r   r   r/   r{   r   r   r|   r   r   r   )losslogitsr   rw   r  r  )rj   r	  r   r!  loss_functionr   r   r   rw   r  r  )r)   rE   r-   rP   r   r   r*  r   r/   r{   r   r   r|   r_   r   sequence_outputprediction_scoreslm_lossoutputs                      r+   r6   zBioGptForCausalLM.forward  s   . &1%<k$++B]B]$++
)'+%/!5#)
 
 "!* 22?C(d((!  ;;11 	G ')GABK7F,3,?WJ'KVK0$#33!//))$55
 	
r,   NNNNNNNNNNNN)r7   r8   r9   _tied_weights_keysr(   r&  r)  r   r   r3   r<   r   r   rI   r   r   r   r   r   r6   r=   r>   s   @r+   r  r    s`    55&0  156:1559@D-1$(37,0/3&*15>
E,,->
 !!2!23>
 E--.	>

   1 12>
 "%ell(;"<=>
 ))*>
 D>>
 u//0>
 $D>>
 'tn>
 d^>
 !.>
 +,>
 
u77	8>
 >
r,   r  c                        e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     dee	e	ej                           deej                     deej                     d	ee   d
eej                     dee   dee   dee   deej                     dee	ef   fd       Z xZS )BioGptForTokenClassificationc                 z   t         |   |       |j                  | _        t        |      | _        t        |d      r|j                  |j                  }n|j                  }t        j                  |      | _
        t        j                  |j                  |j                        | _        | j                          y )Nclassifier_dropout)r'   r(   
num_labelsr   r   hasattrr8  r   rZ   DropoutrO   rr   r   
classifierr   )r)   rj   r8  r*   s      r+   r(   z%BioGptForTokenClassification.__init__  s      ++!&)6/0V5N5N5Z!'!:!:!'!;!;zz"45))F$6$68I8IJr,   rE   token_type_idsr-   rP   r   r   r*  r   r/   r{   r   r   r|   r}   c                    ||n| j                   j                  }| j                  |||||||	|
|||      }|d   }| j                  |      }| j	                  |      }d}|t               }||j                  d      dk(  }|j                  d| j                        }t        j                  ||j                  d      t        j                  |j                        j                  |            } |||      }n2 ||j                  d| j                        |j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N
r   r-   rP   r   r   r/   r{   r   r   r|   r   rR   r   r%   )r,  r-  rw   r  )rj   r	  r   rO   r<  r   r]   r9  r3   wheretensorignore_indextype_asr   rw   r  )r)   rE   r=  r-   rP   r   r   r*  r   r/   r{   r   r   r|   transformer_outputsrw   r-  r,  loss_fctactive_lossactive_logitsactive_labelsr2  s                          r+   r6   z$BioGptForTokenClassification.forward  su   . &1%<k$++B]B]"kk+)'%/!5#) * 
 ,A.]3/')H),11"5: &B @ %R%,,x?T?T2U2]2]^d2e!  }=B @&++b/RY!4QR!88F)-)9TGf$EvE$-;;*55	
 	
r,   )NNNNNNNNNNNNN)r7   r8   r9   r(   r   r   r3   r<   r   r   rI   r   r   r   r6   r=   r>   s   @r+   r6  r6    sc     15596:15@D59-1$(37,0/3&*15A
E,,-A
 !!1!12A
 !!2!23	A

 E--.A
 "%ell(;"<=A
   1 12A
 ))*A
 D>A
 u//0A
 $D>A
 'tnA
 d^A
 !.A
 
u++	,A
 A
r,   r6  a  
    The BioGpt Model transformer with a sequence classification head on top (linear layer).

    [`BioGptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it is required to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                       e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     dee
e
ej                           deej                     deej                     d	ee   d
eej                     dee   dee   dee   deej                     dee
ef   fd       Zd Zd Z xZS )BioGptForSequenceClassificationrj   c                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  | j                  d      | _        | j                          y r   )
r'   r(   r9  r   r   rZ   rr   r   scorer   r"  s     r+   r(   z(BioGptForSequenceClassification.__init__U  sS      ++!&)YYv114??O
 	r,   rE   r-   rP   r   r   r*  r   r/   r{   r   r   r|   r}   c                    ||n| j                   j                  }| j                  ||||||||	|
||      }|d   }| j                  |      }||j                  dd \  }}n|j                  dd \  }}| j                   j
                  d}n|Vt        j                  || j                   j
                        j                  d      dz
  j                  |j                        }n.d}t        j                  | j                  j                   d       |t        j                  ||j                        |f   }d}|| j                   j                   | j"                  dk(  rd	| j                   _        nl| j"                  dkD  rL|j$                  t        j&                  k(  s|j$                  t        j(                  k(  rd
| j                   _        nd| j                   _        | j                   j                   d	k(  rIt+               }| j"                  dk(  r& ||j-                         |j-                               }n |||      }n| j                   j                   d
k(  r=t/               } ||j1                  d| j"                        |j1                  d            }n,| j                   j                   dk(  rt3               } |||      }|s|f|dd z   }||f|z   S |S t5        |||j6                  |j8                  |j:                        S )r?  Nr@  r   r%   rR   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classification)r,  r-  r   rw   r  )rj   r	  r   rM  r   r   r3   nesumr   r   rp   rq   r*   r7   r   problem_typer9  r   r5   r;   r   squeezer   r]   r   r   r   rw   r  )r)   rE   r-   rP   r   r   r*  r   r/   r{   r   r   r|   rE  rw   r-  r   r   pooled_logitsr,  rF  r2  s                         r+   r6   z'BioGptForSequenceClassification.forward^  s   , &1%<k$++B]B]"kk+)'%/!5#) * 
 ,A.M* *3//"1*='J*7*=*=bq*A'J;;##+ O$#(88It{{7O7O#P#T#TUW#X[\#\"`"`aganan"o"$##~~../ 0^ ^
 u||Jv}}M^_{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6#%(;AB(??F)-)9TGf$EvE/ /??-;;*55
 	
r,   c                 .    | j                   j                  S rD   r   r   r%  s    r+   get_input_embeddingsz4BioGptForSequenceClassification.get_input_embeddings  s    {{'''r,   c                 &    || j                   _        y rD   rX  )r)   rM   s     r+   set_input_embeddingsz4BioGptForSequenceClassification.set_input_embeddings  s    #( r,   r3  )r7   r8   r9   r   r(   r   r   r3   r<   r   r   rI   r   r   r   r6   rY  r[  r=   r>   s   @r+   rK  rK  F  s\   |   156:15@D59-1$(37,0/3&*15Z
E,,-Z
 !!2!23Z
 E--.	Z

 "%ell(;"<=Z
   1 12Z
 ))*Z
 D>Z
 u//0Z
 $D>Z
 'tnZ
 d^Z
 !.Z
 
u66	7Z
 Z
x()r,   rK  )r  r6  rK  r   r   )Nr   N)>r   typingr   r   r   r3   torch.nnrZ   r   r   r   activationsr
   cache_utilsr   r   
generationr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   configuration_biogptr   integrations.flex_attentionr   r   
get_loggerr7   rp   	Embeddingr!   r@   ModulerI   rH   rb   rd   r   r   r   r  r6  rK  __all__r   r,   r+   <module>rn     s  ,  , ,   A A ! 5 ) > B 9  G & ^ ^ .  !U 
		H	%;r|| ;8
= 
=&  $(,%II%<<% 
% <<	%
 U\\*% e_% % %%<{)bii {)|W3 Wt MO M M` n
' n
 n
b 
Q
- Q

Q
h Q
#8 Q
 Q
h k)&; k)k)\r,   