
"""PyTorch CamemBERT model."""

import math
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from packaging import version
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN, gelu
from ...cache_utils import Cache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import (
    _prepare_4d_attention_mask_for_sdpa,
    _prepare_4d_causal_attention_mask_for_sdpa,
)
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import auto_docstring, get_torch_version, logging
from .configuration_camembert import CamembertConfig


logger = logging.get_logger(__name__)

class CamembertEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

        self.padding_idx = config.pad_token_id
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        # Setting the token_type_ids to the registered buffer (all zeros) helps users trace the model without passing
        # token_type_ids explicitly.
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)

                  deej                     deej                     deej                     dee   dee	   deej
                     d	e
ej
                     fd
Z xZS )CamembertSelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _        |xs t#        |dd      | _        | j$                  dk(  s| j$                  d	k(  rF|j&                  | _        t        j(                  d
|j&                  z  dz
  | j                        | _        |j,                  | _        || _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r*   r+   relative_keyrelative_key_query   r"   )r2   r3   r6   num_attention_headsrU   
ValueErrorintattention_head_sizeall_head_sizer   Linearquerykeyvaluer?   attention_probs_dropout_probrA   rB   r*   r9   r4   distance_embedding
is_decoder	layer_idxrK   rL   r*   r|   rM   s       rN   r3   zCamembertSelfAttention.__init__   s    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# ++"rO   hidden_statesattention_mask	head_maskencoder_hidden_statespast_key_valueoutput_attentionscache_positionreturnc                 	   |j                   \  }}	}
| j                  |      }|j                  |d| j                  | j                        j                  dd      }|d u}|St        |t              rA|j                  j                  | j                        }|r|j                  }n|j                  }n|}|r|n|}|rK|IrGj                  | j                     j                  }|j                  | j                     j                  }n| j!                  |      }|j                  |d| j                  | j                        j                  dd      }| j#                  |      }|j                  |d| j                  | j                        j                  dd      }|D|s|nd }j%                  ||| j                  d|i      \  }}|rd|j                  | j                  <   t'        j(                  ||j                  dd            }| j*                  dk(  s| j*                  dk(  r|j                   d   |j                   d   }}|Dt'        j,                  |dz
  t&        j.                  |j0                  	      j                  dd      }n@t'        j2                  |t&        j.                  |j0                  	      j                  dd      }t'        j2                  |t&        j.                  |j0                  	      j                  dd      }||z
  }| j5                  || j6                  z   dz
        }|j9                  |j:                  
      }| j*                  dk(  rt'        j<                  d||      }||z   }nE| j*                  dk(  r6t'        j<                  d||      }t'        j<                  d||      }||z   |z   }|t?        j@                  | j                        z  }|||z   }tB        jD                  jG                  |d      }| jI                  |      }|||z  }t'        j(                  ||      }|jK                  dddd      jM                         }|jO                         d d | jP                  fz   }|j                  |      }||fS )Nr-   r"   ro   r   Trm   rn   rQ   r0   zbhld,lrd->bhlrzbhrd,lrd->bhlrdimr   r
   ))shaperv   viewrp   rs   	transpose
isinstancer   
is_updatedgetr|   cross_attention_cacheself_attention_cachelayerskeysvaluesrw   rx   updaterD   matmulr*   tensorrI   rR   rE   rz   r9   tor1   einsummathsqrtr   
functionalsoftmaxrA   permute
contiguousrH   rt   )rK   r~   r   r   r   r   r   r   
batch_sizerZ   _query_layeris_cross_attentionr   curr_past_key_valuecurrent_states	key_layervalue_layerattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapes                                  rN   r^   zCamembertSelfAttention.forward   sN    %2$7$7!
Jjj/!&&z2t7O7OQUQiQijttq
 3$>%.*=>+66::4>>J
%*8*N*N'*8*M*M'&4#2D.-."<+224>>BGGI-44T^^DKKK0I!z2t7O7OQUQiQijtt1I **^4K%**B 8 8$:R:Ri1o  )7It)<)C)C{DNN=M~<^*&	; &@DN--dnn= !<<Y5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*L)!&j1nEJJWdWkWk!l!q!q" "'l%**UbUiUi!j!o!oprtu!v"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/.@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BCo--rO   NNNNNNFNrb   rc   rd   r3   rD   Tensorr   FloatTensorr   booltupler^   rf   rg   s   @rN   ri   ri      s    #< 7;15=A*.,115d.||d. !!2!23d. E--.	d.
  ((9(9:d. !d. $D>d. !.d. 
u||	d.rO   ri   c                        e Zd Zd fd	Z	 	 	 	 	 	 ddej
                  deej
                     deej                     deej                     dee   dee	   deej
                     d	e
ej
                     f fd
Z xZS )CamembertSdpaSelfAttentionc                     t         |   |||       |j                  | _        t	        j
                  t                     t	        j
                  d      k  | _        y )Nr*   r|   z2.2.0)r2   r3   ry   dropout_probr   parser    require_contiguous_qkvr}   s       rN   r3   z#CamembertSdpaSelfAttention.__init__  sK    9P\ef"??&-mm4E4G&H7==Y`Ka&a#rO   r~   r   r   r   r   r   r   r   c           	         | j                   dk7  s|s|*t        j                  d       t        |   |||||||      S |j                         \  }}	}
| j                  |      j                  |d| j                  | j                        j                  dd      }|d u}|r|n|}|St        |t              rA|j                  j                  | j                        }|r|j                   }n|j"                  }n|}|r|n|}|rK|IrGj$                  | j                     j&                  }|j$                  | j                     j(                  }n| j+                  |      j                  |d| j                  | j                        j                  dd      }| j-                  |      j                  |d| j                  | j                        j                  dd      }|D|s|nd }j/                  ||| j                  d|i      \  }}|rd|j                  | j                  <   | j0                  rK|j2                  j4                  dk(  r2|0|j7                         }|j7                         }|j7                         }| j8                  xr | xr |d u xr |	dkD  }t:        j<                  j>                  jA                  ||||| jB                  r| jD                  nd	|
      }|j                  dd      }|jG                  ||	| jH                        }|d fS )Nr+   a  CamembertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r-   r"   ro   r   Tcuda        )	attn_mask	dropout_p	is_causal)%r*   loggerwarning_oncer2   r^   rH   rv   r   rp   rs   r   r   r   r   r   r|   r   r   r   r   r   rw   rx   r   r   rR   typer   r{   rD   r   r   scaled_dot_product_attentiontrainingr   reshapert   )rK   r~   r   r   r   r   r   r   bsztgt_lenr   r   r   r   r   r   r   r   r   attn_outputrM   s                       rN   r^   z"CamembertSdpaSelfAttention.forward  s    '':59JiNcH 7?%!  (,,.Wa JJ}%**3D4L4LdNfNfgqqrsuvw 	 3$>2D.-%.*=>+66::4>>J
%*8*N*N'*8*M*M'&4#2D.-."<+224>>BGGI-44T^^DKKK (c2t779Q9QR1a  

>*c2t779Q9QR1a  )7It)<)C)C{DNN=M~<^*&	; &@DN--dnn=
 &&;+=+=+B+Bf+LQ_Qk%002K!,,.I%002K OOi,>(>i>UYCYi^ehi^i	hh))FF$+/==d''c G 
 "++Aq1!))#w8J8JKD  rO   r   r   r   rg   s   @rN   r   r     s    b 2615=A*.,115e!||e! !.e! E--.	e!
  ((9(9:e! !e! $D>e! !.e! 
u||	e! e!rO   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )CamembertSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr(   )r2   r3   r   ru   r6   denser=   r>   r?   r@   rA   rJ   s     rN   r3   zCamembertSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rO   r~   input_tensorr   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S Nr   rA   r=   rK   r~   r   s      rN   r^   zCamembertSelfOutput.forward  7    

=1]3}|'CDrO   rb   rc   rd   r3   rD   r   r^   rf   rg   s   @rN   r   r     1    >U\\  RWR^R^ rO   r   )eagersdpac                        e Zd Zd fd	Zd Z	 	 	 	 	 	 ddej                  deej                     deej                     deej                     dee	   dee
   d	eej                     d
eej                     fdZ xZS )CamembertAttentionc                     t         |           t        |j                     |||      | _        t        |      | _        t               | _        y )Nr   )	r2   r3    CAMEMBERT_SELF_ATTENTION_CLASSES_attn_implementationrK   r   outputsetpruned_headsr}   s       rN   r3   zCamembertAttention.__init__  sF    4V5P5PQ$;
	
 *&1ErO   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r"   r   )lenr   rK   rp   rs   r   r   rv   rw   rx   r   r   rt   union)rK   headsindexs      rN   prune_headszCamembertAttention.prune_heads  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:rO   r~   r   r   r   r   r   r   r   c           	      r    | j                  |||||||      }| j                  |d   |      }	|	f|dd  z   }
|
S )Nr   r   r   r   r   r   r   r"   )rK   r   )rK   r~   r   r   r   r   r   r   self_outputsattention_outputoutputss              rN   r^   zCamembertAttention.forward  s\     yy)"7)/) ! 
  ;;|AF#%QR(88rO   r   r   )rb   rc   rd   r3   r   rD   r   r   r   r   r   r   r^   rf   rg   s   @rN   r   r     s    ";* 7;15=A*.,115|| !!2!23 E--.	
  ((9(9: ! $D> !. 
u||	rO   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )CamembertIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r2   r3   r   ru   r6   intermediate_sizer   r   
hidden_actstrr   intermediate_act_fnrJ   s     rN   r3   zCamembertIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$rO   r~   r   c                 J    | j                  |      }| j                  |      }|S r   )r   r   )rK   r~   s     rN   r^   zCamembertIntermediate.forward  s&    

=100?rO   r   rg   s   @rN   r   r     s#    9U\\ ell rO   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )CamembertOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )r2   r3   r   ru   r   r6   r   r=   r>   r?   r@   rA   rJ   s     rN   r3   zCamembertOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=rO   r~   r   r   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   s      rN   r^   zCamembertOutput.forward  r   rO   r   rg   s   @rN   r   r     r   rO   r   c                       e Zd Zd fd	Z	 	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     deej                     dee   dee	   d	eej
                     d
e
ej
                     fdZd Z xZS )CamembertLayerc                 l   t         |           |j                  | _        d| _        t	        ||      | _        |j                  | _        |j                  | _        | j                  r-| j                  st        |  d      t	        |d|      | _	        t        |      | _        t        |      | _        y )Nr"   r|   z> should be used as a decoder model if cross attention is addedr+   r   )r2   r3   chunk_size_feed_forwardseq_len_dimr   	attentionr{   add_cross_attentionrq   crossattentionr   intermediater   r   )rK   rL   r|   rM   s      rN   r3   zCamembertLayer.__init__  s    '-'E'E$+FiH ++#)#=#= ##?? D6)g!hii"4VU_kt"uD1&9%f-rO   r~   r   r   r   encoder_attention_maskr   r   r   r   c	           	      H   | j                  ||||||      }	|	d   }
|	dd  }| j                  rB|@t        | d      st        d|  d      | j	                  |
||||||      }|d   }
||dd  z   }t        | j                  | j                  | j                  |
      }|f|z   }|S )N)r   r   r   r   r   r   r"   r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )	r  r{   rU   rq   r  r   feed_forward_chunkr  r  )rK   r~   r   r   r   r
  r   r   r   self_attention_outputsr   r   cross_attention_outputslayer_outputs                 rN   r^   zCamembertLayer.forward  s    "&)/)) "0 "
 2!4(,??4@4!12 =dV DD D 
 '+&9&9 5#&;-"3- ': '#  7q9 7 ;;G0##T%A%A4CSCSUe
  /G+rO   c                 L    | j                  |      }| j                  ||      }|S r   )r	  r   )rK   r   intermediate_outputr  s       rN   r  z!CamembertLayer.feed_forward_chunk+  s,    "//0@A{{#68HIrO   r   )NNNNNFN)rb   rc   rd   r3   rD   r   r   r   r   r   r   r^   r  rf   rg   s   @rN   r  r    s    ." 7;15=A>B*.,115.||. !!2!23. E--.	.
  ((9(9:. !)):): ;. !. $D>. !.. 
u||	.`rO   r  c                   f    e Zd Zd fd	Z	 	 	 	 	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     deej                     deeeej                           dee	   d	ee	   d
ee	   dee	   deej
                     de
eej
                     ef   fdZ xZS )CamembertEncoderc           	          t         |           || _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        d| _	        y c c}w )Nr  F)
r2   r3   rL   r   
ModuleListrangenum_hidden_layersr  layergradient_checkpointing)rK   rL   r|   irM   s       rN   r3   zCamembertEncoder.__init__3  sR    ]]QVW]WoWoQp#qAN6Q$G#qr
&+# $rs   A%r~   r   r   r   r
  past_key_values	use_cacher   output_hidden_statesreturn_dictr   r   c                    |	rdnd }|rdnd }|r| j                   j                  rdnd }| j                  r%| j                  r|rt        j                  d       d}d}|rR| j                   j                  r<t        |t              s,t        j                  d       d}t        j                  |      }t        | j                        D ]W  \  }}|	r||fz   }|||   nd } |||||||||      }|d   }|s/||d   fz   }| j                   j                  sO||d	   fz   }Y |	r||fz   }|r|j                         }|
st        d
 |||||fD              S t        |||||      S )N zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FzPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.T)r
  r   r   r   r   r"   ro   c              3   $   K   | ]  }|| 
 y wr   r   ).0vs     rN   	<genexpr>z+CamembertEncoder.forward.<locals>.<genexpr>z  s      
 = 
s   )last_hidden_stater  r~   
attentionscross_attentions)rL   r  r  r   r   r   r{   r   r   r   from_legacy_cache	enumerater  to_legacy_cacher   r   )rK   r~   r   r   r   r
  r  r  r   r  r  r   all_hidden_statesall_self_attentionsall_cross_attentionsreturn_legacy_cacher  layer_modulelayer_head_masklayer_outputss                       rN   r^   zCamembertEncoder.forward9  s    #7BD$5b4%64;;;Z;Zr`d&&4==##p "	#//
?TY8Z\
 #'1CCOTO(4 	VOA|#$58H$H!.7.CilO(%'=."3-	M *!,M &9]1=M<O&O#;;22+?=QRCSBU+U(+	V.   1]4D D-==?O 
 "#%'(
 
 
 9+++*1
 	
rO   r   )
NNNNNNFFTN)rb   rc   rd   r3   rD   r   r   r   r   r   r   r   r^   rf   rg   s   @rN   r  r  2  s"   , 7;15=A>BEI$(,1/4&*15R
||R
 !!2!23R
 E--.	R

  ((9(9:R
 !)):): ;R
 "%e.?.?(@"ABR
 D>R
 $D>R
 'tnR
 d^R
 !.R
 
uU\\"$MM	NR
rO   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )CamembertPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )r2   r3   r   ru   r6   r   Tanh
activationrJ   s     rN   r3   zCamembertPooler.__init__  s9    YYv1163E3EF
'')rO   r~   r   c                 \    |d d df   }| j                  |      }| j                  |      }|S Nr   )r   r6  )rK   r~   first_token_tensorpooled_outputs       rN   r^   zCamembertPooler.forward  s6     +1a40

#566rO   r   rg   s   @rN   r3  r3    s#    $
U\\ ell rO   r3  c                   *    e Zd ZU eed<   dZdZdZd Zy)CamembertPreTrainedModelrL   robertaTc                 l   t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rz|j                  j                  j                  d| j                  j                         |j                  2|j                  j                  |j                     j                          yyt        |t        j                        rJ|j                  j                  j                          |j                  j                  j                  d       yt        |t              r%|j                  j                  j                          yy)zInitialize the weightsr   )meanstdNg      ?)r   r   ru   weightdatanormal_rL   initializer_rangebiaszero_r4   r'   r=   fill_CamembertLMHead)rK   modules     rN   _init_weightsz&CamembertPreTrainedModel._init_weights  s&   fbii( MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .-KK""$MM$$S)0KK""$ 1rO   N)	rb   rc   rd   r#   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_sdparJ  r   rO   rN   r<  r<    s    !&*#N%rO   r<  c                   (     e Zd ZdZ fdZd Z xZS )CamembertClassificationHeadz-Head for sentence-level classification tasks.c                 Z   t         |           t        j                  |j                  |j                        | _        |j                  |j                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        y r   )r2   r3   r   ru   r6   r   classifier_dropoutr@   r?   rA   
num_labelsout_projrK   rL   rR  rM   s      rN   r3   z$CamembertClassificationHead.__init__  s    YYv1163E3EF
)/)B)B)NF%%TZTnTn 	 zz"45		&"4"4f6G6GHrO   c                     |d d dd d f   }| j                  |      }| j                  |      }t        j                  |      }| j                  |      }| j	                  |      }|S r8  )rA   r   rD   tanhrT  rK   featureskwargsxs       rN   r^   z#CamembertClassificationHead.forward  sY    Q1WLLOJJqMJJqMLLOMM!rO   )rb   rc   rd   re   r3   r^   rf   rg   s   @rN   rP  rP    s    7IrO   rP  c                   .     e Zd ZdZ fdZd Zd Z xZS )rH  z,Camembert Head for masked language modeling.c                    t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _
        t        j                  t        j                  |j                              | _        | j                  | j                  _        y r   )r2   r3   r   ru   r6   r   r=   r>   
layer_normr5   decoder	ParameterrD   rG   rE  rJ   s     rN   r3   zCamembertLMHead.__init__  s    YYv1163E3EF
,,v'9'9v?T?TUyy!3!3V5F5FGLLV->->!?@	 IIrO   c                     | j                  |      }t        |      }| j                  |      }| j                  |      }|S r   )r   r   r^  r_  rX  s       rN   r^   zCamembertLMHead.forward  s;    JJx GOOA LLOrO   c                     | j                   j                  j                  j                  dk(  r| j                  | j                   _        y | j                   j                  | _        y )Nmeta)r_  rE  rR   r   rK   s    rN   _tie_weightszCamembertLMHead._tie_weights  sC     <<##((F2 $		DLL))DIrO   )rb   rc   rd   re   r3   r^   re  rf   rg   s   @rN   rH  rH    s    6&*rO   rH  c            "           e Zd ZdZg Zd fd	Zd Zd Zd Ze		 	 	 	 	 	 	 	 	 	 	 	 	 	 dde
ej                     de
ej                     de
ej                     d	e
ej                     d
e
ej                     de
ej                     de
ej                     de
ej                     de
eej                        de
e   de
e   de
e   de
e   de
ej                     deeej                     ef   fd       Z xZS )CamembertModela1  

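# Illustrative sketch (not part of the original file): the attention backend is chosen per model instance through
# `config._attn_implementation`, which indexes into `CAMEMBERT_SELF_ATTENTION_CLASSES` above. A minimal way to force
# the eager (fully materialized attention-probability) path, e.g. to get `output_attentions=True` without the SDPA
# fallback warning, is sketched below; it assumes the public `from_pretrained` API and the "almanach/camembert-base"
# checkpoint.
#
#   from transformers import CamembertModel
#
#   model = CamembertModel.from_pretrained("almanach/camembert-base", attn_implementation="eager")
#   # model.encoder.layer[0].attention.self is now a CamembertSelfAttention rather than CamembertSdpaSelfAttention
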
@auto_docstring
class CamembertModel(CamembertPreTrainedModel):
    """
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument
    and `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward
    pass.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762

    """

    _no_split_modules = []

    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = CamembertEmbeddings(config)
        self.encoder = CamembertEncoder(config)

        self.pooler = CamembertPooler(config) if add_pooling_layer else None

        self.attn_implementation = config._attn_implementation
        self.position_embedding_type = config.position_embedding_type

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        past_key_values_length = 0
        if past_key_values is not None:
            past_key_values_length = (
                past_key_values[0][0].shape[2]
                if not isinstance(past_key_values, Cache)
                else past_key_values.get_seq_length()
            )

        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)

        use_sdpa_attention_masks = (
            self.attn_implementation == "sdpa"
            and self.position_embedding_type == "absolute"
            and head_mask is None
            and not output_attentions
        )

        # Expand the attention mask
        if use_sdpa_attention_masks and attention_mask.dim() == 2:
            # Expand the attention mask for SDPA: [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
            if self.config.is_decoder:
                extended_attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
                    attention_mask,
                    input_shape,
                    embedding_output,
                    past_key_values_length,
                )
            else:
                extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                    attention_mask, embedding_output.dtype, tgt_len=seq_length
                )
        else:
            # Make the provided self-attention mask broadcastable to all heads.
            extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

        # If a 2D or 3D attention mask is provided for the cross-attention,
        # make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)

            if use_sdpa_attention_masks and encoder_attention_mask.dim() == 2:
                encoder_extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                    encoder_attention_mask, embedding_output.dtype, tgt_len=seq_length
                )
            else:
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed; 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )

@auto_docstring
class CamembertForMaskedLM(CamembertPreTrainedModel):
    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)

        if config.is_decoder:
            logger.warning(
                "If you want to use `CamembertForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        self.roberta = CamembertModel(config, add_pooling_layer=False)
        self.lm_head = CamembertLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.lm_head.decoder

    def set_output_embeddings(self, new_embeddings):
        self.lm_head.decoder = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], MaskedLMOutput]:
        r"""
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(prediction_scores.device)
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

@auto_docstring(
    custom_intro="""
    CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    """
)
class CamembertForSequenceClassification(CamembertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.roberta = CamembertModel(config, add_pooling_layer=False)
        self.classifier = CamembertClassificationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

@auto_docstring
class CamembertForMultipleChoice(CamembertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.roberta = CamembertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], MultipleChoiceModelOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        flat_inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.roberta(
            flat_input_ids,
            position_ids=flat_position_ids,
            token_type_ids=flat_token_type_ids,
            attention_mask=flat_attention_mask,
            head_mask=head_mask,
            inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(reshaped_logits.device)
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

@auto_docstring
class CamembertForTokenClassification(CamembertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = CamembertModel(config, add_pooling_layer=False)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(logits.device)
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        Nr  r   r"   r-   r   )ignore_indexro   )r  start_logits
end_logitsr~   r&  )rL   ry  r=  r  splitr  r   r   rH   clampr   r   r~   r&  )rK   rV   r   r/   r,   r   rW   r  r  r   r  r  r   r  r  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                          rN   r^   z%CamembertForQuestionAnswering.forwardJ  s   4 &1%<k$++B]B],,))%'/!5#  

 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
rO   )NNNNNNNNNNN)rb   rc   rd   r3   r   r   rD   r  r   r   r   r   r   r   r^   rf   rg   s   @rN   r  r  =  s;     156:593715596:48,0/3&*I
E,,-I
 !!2!23I
 !!1!12	I

 u//0I
 E--.I
   1 12I
 "%"2"23I
   0 01I
 $D>I
 'tnI
 d^I
 
uU\\"$@@	AI
 I
rO   r  zU
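# Illustrative usage sketch (not part of the upstream modeling code): extractive question answering
# with the head above, reading the answer span off the argmax of the start/end logits. The checkpoint
# name mirrors the causal-LM example below; a SQuAD/FQuAD-style fine-tuned checkpoint would normally
# be used instead.
def _example_question_answering_usage():  # pragma: no cover - documentation-only sketch
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
    model = CamembertForQuestionAnswering.from_pretrained("almanach/camembert-base")
    question = "Où se trouve la tour Eiffel ?"
    context = "La tour Eiffel se trouve à Paris, en France."
    inputs = tokenizer(question, context, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    start = int(outputs.start_logits.argmax(dim=-1))
    end = int(outputs.end_logits.argmax(dim=-1))
    # Decode the tokens between the most likely start and end positions (inclusive).
    return tokenizer.decode(inputs["input_ids"][0, start : end + 1])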

@auto_docstring(
    custom_intro="""
    CamemBERT Model with a `language modeling` head on top for CLM fine-tuning.
    """
)
class CamembertForCausalLM(CamembertPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)

        if not config.is_decoder:
            logger.warning("If you want to use `CamembertLMHeadModel` as a standalone, add `is_decoder=True.`")

        self.roberta = CamembertModel(config, add_pooling_layer=False)
        self.lm_head = CamembertLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.lm_head.decoder

    def set_output_embeddings(self, new_embeddings):
        self.lm_head.decoder = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
        r"""
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with a `type_vocab_size` >= 2. All values in
            this tensor should be < `type_vocab_size`.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see the `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, CamembertForCausalLM, AutoConfig
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
        >>> config = AutoConfig.from_pretrained("almanach/camembert-base")
        >>> config.is_decoder = True
        >>> model = CamembertForCausalLM.from_pretrained("almanach/camembert-base", config=config)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            use_cache = False

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        lm_loss = None
        if labels is not None:
            # Move labels to the logits device to support model parallelism.
            labels = labels.to(prediction_scores.device)
            lm_loss = self.loss_function(
                prediction_scores,
                labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids (`torch.Tensor`): Input ids, used only to locate padding symbols.
        padding_idx (`int`): The id of the padding token.
        past_key_values_length (`int`, *optional*, defaults to 0): Offset added when past key values are used.

    Returns: torch.Tensor
    """
    # The series of casts and type-conversions here are carefully balanced to both work with ONNX
    # export and XLA: mark non-padding positions, cumulative-sum the mask to number the real tokens,
    # then shift by padding_idx so padding positions keep the padding position id.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx

__all__ = [
    "CamembertForCausalLM",
    "CamembertForMaskedLM",
    "CamembertForMultipleChoice",
    "CamembertForQuestionAnswering",
    "CamembertForSequenceClassification",
    "CamembertForTokenClassification",
    "CamembertModel",
    "CamembertPreTrainedModel",
]