"""PyTorch ALBERT model."""

import math
import os
from dataclasses import dataclass
from typing import Optional, Union

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import (
    apply_chunking_to_forward,
    find_pruneable_heads_and_indices,
    is_torch_greater_or_equal_than_2_2,
    prune_linear_layer,
)
from ...utils import ModelOutput, auto_docstring, logging
from .configuration_albert import AlbertConfig


logger = logging.get_logger(__name__)


def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
    """Load tf checkpoints in a pytorch model."""
    try:
        import re

        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        print(name)

    for name, array in zip(names, arrays):
        original_name = name

        # If saved from the TF HUB module
        name = name.replace("module/", "")

        # Renaming and simplifying
        name = name.replace("ffn_1", "ffn")
        name = name.replace("bert/", "albert/")
        name = name.replace("attention_1", "attention")
        name = name.replace("transform/", "")
        name = name.replace("LayerNorm_1", "full_layer_layer_norm")
        name = name.replace("LayerNorm", "attention/LayerNorm")
        name = name.replace("transformer/", "")

        # The feed forward layer had an 'intermediate' step which has been abstracted away
        name = name.replace("intermediate/dense/", "")
        name = name.replace("ffn/intermediate/output/dense/", "ffn_output/")

        # ALBERT attention was split between self and output which have been abstracted away
        name = name.replace("/output/", "/")
        name = name.replace("/self/", "/")

        # The pooler is a linear layer
        name = name.replace("pooler/dense", "pooler")

        # The classifier was simplified to predictions from cls/predictions
        name = name.replace("cls/predictions", "predictions")
        name = name.replace("predictions/attention", "predictions")

        # Naming was changed to be more explicit
        name = name.replace("embeddings/attention", "embeddings")
        name = name.replace("inner_group_", "albert_layers/")
        name = name.replace("group_", "albert_layer_groups/")

        # Classifier
        if len(name.split("/")) == 1 and ("output_bias" in name or "output_weights" in name):
            name = "classifier/" + name

        # No ALBERT model currently handles the next sentence prediction task
        if "seq_relationship" in name:
            name = name.replace("seq_relationship/output_", "sop_classifier/classifier/")
            name = name.replace("weights", "weight")

        name = name.split("/")

        # Ignore the gradients applied by the LAMB/ADAM optimizers.
        if (
            "adam_m" in name
            or "adam_v" in name
            or "AdamWeightDecayOptimizer" in name
            or "AdamWeightDecayOptimizer_1" in name
            or "global_step" in name
        ):
            logger.info(f"Skipping {'/'.join(name)}")
            continue

        pointer = model
        for m_name in name:
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]

            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "output_weights":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "squad":
                pointer = getattr(pointer, "classifier")
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    logger.info(f"Skipping {'/'.join(name)}")
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]

        if m_name[-11:] == "_embeddings":
            pointer = getattr(pointer, "weight")
        elif m_name == "kernel":
            array = np.transpose(array)
        try:
            if pointer.shape != array.shape:
                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
        except ValueError as e:
            e.args += (pointer.shape, array.shape)
            raise
        print(f"Initialize PyTorch weight {name} from {original_name}")
        pointer.data = torch.from_numpy(array)

    return model
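
# A minimal conversion sketch (illustrative only, not executed at import time). The checkpoint
# and dump paths below are placeholders; the function expects an already-instantiated PyTorch
# model plus the path prefix of an original TensorFlow ALBERT checkpoint:
#
#     from transformers import AlbertConfig, AlbertForPreTraining
#
#     config = AlbertConfig.from_pretrained("albert/albert-base-v2")
#     model = AlbertForPreTraining(config)
#     load_tf_weights_in_albert(model, config, "/path/to/albert/model.ckpt-best")
#     model.save_pretrained("/path/to/pytorch_dump")
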
class AlbertEmbeddings(nn.Module):
    """
    Construct the embeddings from word, position and token_type embeddings.
    """

    def __init__(self, config: AlbertConfig):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values_length: int = 0,
    ) -> torch.Tensor:
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

        # Setting the token_type_ids to the registered buffer in the constructor, where it is all zeros, helps
        # users when tracing the model without passing token_type_ids.
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class AlbertAttention(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention heads "
                f"({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.attention_head_size = config.hidden_size // config.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.attention_dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.output_dropout = nn.Dropout(config.hidden_dropout_prob)
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pruned_heads = set()

        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

    def prune_heads(self, heads: list[int]) -> None:
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.num_attention_heads, self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.query = prune_linear_layer(self.query, index)
        self.key = prune_linear_layer(self.key, index)
        self.value = prune_linear_layer(self.value, index)
        self.dense = prune_linear_layer(self.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.num_attention_heads = self.num_attention_heads - len(heads)
        self.all_head_size = self.attention_head_size * self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
    ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
        batch_size, seq_length, _ = hidden_states.shape
        query_layer = self.query(hidden_states)
        key_layer = self.key(hidden_states)
        value_layer = self.value(hidden_states)

        query_layer = query_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(
            1, 2
        )
        key_layer = key_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
        value_layer = value_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(
            1, 2
        )

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the model forward() function)
            attention_scores = attention_scores + attention_mask

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            seq_length = hidden_states.size()[1]
            position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r
            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.attention_dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.transpose(2, 1).flatten(2)

        projected_context_layer = self.dense(context_layer)
        projected_context_layer_dropout = self.output_dropout(projected_context_layer)
        layernormed_context_layer = self.LayerNorm(hidden_states + projected_context_layer_dropout)
        return (layernormed_context_layer, attention_probs) if output_attentions else (layernormed_context_layer,)


class AlbertSdpaAttention(AlbertAttention):
    def __init__(self, config):
        super().__init__(config)
        self.dropout_prob = config.attention_probs_dropout_prob
        self.require_contiguous_qkv = not is_torch_greater_or_equal_than_2_2

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
    ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
        if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None:
            logger.warning(
                "AlbertSdpaAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support"
                " non-absolute `position_embedding_type` or `output_attentions=True`. Falling back to the eager"
                " attention implementation, but specifying the eager implementation will be required from Transformers"
                " version v5.0.0 onwards. This warning can be removed using the argument"
                ' `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(hidden_states, attention_mask, head_mask, output_attentions)

        batch_size, seq_len, _ = hidden_states.size()
        query_layer = (
            self.query(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )
        key_layer = (
            self.key(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )
        value_layer = (
            self.value(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )

        # SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom
        # attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0.
        if self.require_contiguous_qkv and query_layer.device.type == "cuda" and attention_mask is not None:
            query_layer = query_layer.contiguous()
            key_layer = key_layer.contiguous()
            value_layer = value_layer.contiguous()

        attention_output = torch.nn.functional.scaled_dot_product_attention(
            query=query_layer,
            key=key_layer,
            value=value_layer,
            attn_mask=attention_mask,
            dropout_p=self.dropout_prob if self.training else 0.0,
            is_causal=False,
        )

        attention_output = attention_output.transpose(1, 2)
        attention_output = attention_output.reshape(batch_size, seq_len, self.all_head_size)

        projected_context_layer = self.dense(attention_output)
        projected_context_layer_dropout = self.output_dropout(projected_context_layer)
        layernormed_context_layer = self.LayerNorm(hidden_states + projected_context_layer_dropout)
        return (layernormed_context_layer,)


ALBERT_ATTENTION_CLASSES = {
    "eager": AlbertAttention,
    "sdpa": AlbertSdpaAttention,
}
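
# Illustrative note (a sketch of usage, not additional library behaviour): `AlbertLayer` below
# looks its attention module up in the mapping above via `config._attn_implementation`, so the
# flavour can be chosen at load time with the standard `attn_implementation` kwarg:
#
#     from transformers import AlbertModel
#
#     model = AlbertModel.from_pretrained("albert/albert-base-v2", attn_implementation="eager")
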
class AlbertLayer(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        self.config = config
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.attention = ALBERT_ATTENTION_CLASSES[config._attn_implementation](config)
        self.ffn = nn.Linear(config.hidden_size, config.intermediate_size)
        self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size)
        self.activation = ACT2FN[config.hidden_act]
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        attention_output = self.attention(hidden_states, attention_mask, head_mask, output_attentions)

        ffn_output = apply_chunking_to_forward(
            self.ff_chunk,
            self.chunk_size_feed_forward,
            self.seq_len_dim,
            attention_output[0],
        )
        hidden_states = self.full_layer_layer_norm(ffn_output + attention_output[0])

        return (hidden_states,) + attention_output[1:]  # add attentions if we output them

    def ff_chunk(self, attention_output: torch.Tensor) -> torch.Tensor:
        ffn_output = self.ffn(attention_output)
        ffn_output = self.activation(ffn_output)
        ffn_output = self.ffn_output(ffn_output)
        return ffn_output


class AlbertLayerGroup(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        self.albert_layers = nn.ModuleList([AlbertLayer(config) for _ in range(config.inner_group_num)])

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
    ) -> tuple[Union[torch.Tensor, tuple[torch.Tensor]], ...]:
        layer_hidden_states = ()
        layer_attentions = ()

        for layer_index, albert_layer in enumerate(self.albert_layers):
            layer_output = albert_layer(hidden_states, attention_mask, head_mask[layer_index], output_attentions)
            hidden_states = layer_output[0]

            if output_attentions:
                layer_attentions = layer_attentions + (layer_output[1],)

            if output_hidden_states:
                layer_hidden_states = layer_hidden_states + (hidden_states,)

        outputs = (hidden_states,)
        if output_hidden_states:
            outputs = outputs + (layer_hidden_states,)
        if output_attentions:
            outputs = outputs + (layer_attentions,)
        return outputs  # last-layer hidden state, (layer hidden states), (layer attentions)


class AlbertTransformer(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        self.config = config
        self.embedding_hidden_mapping_in = nn.Linear(config.embedding_size, config.hidden_size)
        self.albert_layer_groups = nn.ModuleList([AlbertLayerGroup(config) for _ in range(config.num_hidden_groups)])

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[BaseModelOutput, tuple]:
        hidden_states = self.embedding_hidden_mapping_in(hidden_states)

        all_hidden_states = (hidden_states,) if output_hidden_states else None
        all_attentions = () if output_attentions else None

        head_mask = [None] * self.config.num_hidden_layers if head_mask is None else head_mask

        for i in range(self.config.num_hidden_layers):
            # Number of layers in a hidden group
            layers_per_group = int(self.config.num_hidden_layers / self.config.num_hidden_groups)

            # Index of the hidden group
            group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups))

            layer_group_output = self.albert_layer_groups[group_idx](
                hidden_states,
                attention_mask,
                head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group],
                output_attentions,
                output_hidden_states,
            )
            hidden_states = layer_group_output[0]

            if output_attentions:
                all_attentions = all_attentions + layer_group_output[-1]

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
        )
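
# Worked example of the index arithmetic in `AlbertTransformer.forward` above (a sketch with
# assumed config values): with num_hidden_layers=12 and num_hidden_groups=4, each parameter group
# is reused for three consecutive virtual layers.
#
#     num_hidden_layers, num_hidden_groups = 12, 4
#     layers_per_group = num_hidden_layers // num_hidden_groups          # 3
#     group_indices = [int(i / layers_per_group) for i in range(num_hidden_layers)]
#     assert group_indices == [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
#
# The released ALBERT checkpoints use num_hidden_groups=1, i.e. one fully shared group applied
# num_hidden_layers times.
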
@auto_docstring
class AlbertPreTrainedModel(PreTrainedModel):
    config: AlbertConfig
    load_tf_weights = load_tf_weights_in_albert
    base_model_prefix = "albert"
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, AlbertMLMHead):
            module.bias.data.zero_()


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of [`AlbertForPreTraining`].
    """
)
class AlbertForPreTrainingOutput(ModelOutput):
    r"""
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the masked language modeling loss and the next sequence prediction
        (classification) loss.
    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    sop_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
        before SoftMax).
    """

    loss: Optional[torch.FloatTensor] = None
    prediction_logits: Optional[torch.FloatTensor] = None
    sop_logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None


@auto_docstring
class AlbertModel(AlbertPreTrainedModel):
    config: AlbertConfig
    base_model_prefix = "albert"

    def __init__(self, config: AlbertConfig, add_pooling_layer: bool = True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)

        self.config = config
        self.embeddings = AlbertEmbeddings(config)
        self.encoder = AlbertTransformer(config)
        if add_pooling_layer:
            self.pooler = nn.Linear(config.hidden_size, config.hidden_size)
            self.pooler_activation = nn.Tanh()
        else:
            self.pooler = None
            self.pooler_activation = None

        self.attn_implementation = config._attn_implementation
        self.position_embedding_type = config.position_embedding_type

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Embedding:
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value: nn.Embedding) -> None:
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} ALBERT has
        a different architecture in that its layers are shared across groups, which then has inner groups. If an ALBERT
        model has 12 hidden layers and 2 hidden groups, with two inner groups, there is a total of 4 different layers.

        These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden layer,
        while [2,3] correspond to the two inner groups of the second hidden layer.

        Any layer with in index other than [0,1,2,3] will result in an error. See base class PreTrainedModel for more
        information about head pruning
        """
        for layer, heads in heads_to_prune.items():
            group_idx = int(layer / self.config.inner_group_num)
            inner_group_idx = int(layer - group_idx * self.config.inner_group_num)
            self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[BaseModelOutputWithPooling, tuple]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        embedding_output = self.embeddings(
            input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
        )

        use_sdpa_attention_mask = (
            self.attn_implementation == "sdpa"
            and self.position_embedding_type == "absolute"
            and head_mask is None
            and not output_attentions
        )

        if use_sdpa_attention_mask:
            extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                attention_mask, embedding_output.dtype, tgt_len=seq_length
            )
        else:
            extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
            extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
            extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(self.dtype).min

        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        encoder_outputs = self.encoder(
            embedding_output,
            extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = encoder_outputs[0]

        pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0])) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
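
# Minimal encoder usage sketch (illustrative; the model id is the public ALBERT base checkpoint):
#
#     import torch
#     from transformers import AutoTokenizer, AlbertModel
#
#     tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
#     model = AlbertModel.from_pretrained("albert/albert-base-v2")
#     inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
#     with torch.no_grad():
#         outputs = model(**inputs)
#     last_hidden_state = outputs.last_hidden_state  # (batch, seq_len, hidden_size)
#
# Head pruning uses the flattened indexing documented in `_prune_heads` above; with the default
# num_hidden_groups=1 and inner_group_num=1, index 0 is the only shared layer:
#
#     model.prune_heads({0: [0, 1]})
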
@auto_docstring(
    custom_intro="""
    Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
    `sentence order prediction (classification)` head.
    """
)
class AlbertForPreTraining(AlbertPreTrainedModel):
    _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]

    def __init__(self, config: AlbertConfig):
        super().__init__(config)

        self.albert = AlbertModel(config)
        self.predictions = AlbertMLMHead(config)
        self.sop_classifier = AlbertSOPHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self) -> nn.Linear:
        return self.predictions.decoder

    def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
        self.predictions.decoder = new_embeddings

    def get_input_embeddings(self) -> nn.Embedding:
        return self.albert.embeddings.word_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        sentence_order_label: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[AlbertForPreTrainingOutput, tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        sentence_order_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring) Indices should be in `[0, 1]`. `0` indicates original order (sequence A, then
            sequence B), `1` indicates switched order (sequence B, then sequence A).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, AlbertForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
        >>> model = AlbertForPreTraining.from_pretrained("albert/albert-base-v2")

        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)
        >>> # Batch size 1
        >>> outputs = model(input_ids)

        >>> prediction_logits = outputs.prediction_logits
        >>> sop_logits = outputs.sop_logits
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.albert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output, pooled_output = outputs[:2]

        prediction_scores = self.predictions(sequence_output)
        sop_scores = self.sop_classifier(pooled_output)

        total_loss = None
        if labels is not None and sentence_order_label is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            sentence_order_loss = loss_fct(sop_scores.view(-1, 2), sentence_order_label.view(-1))
            total_loss = masked_lm_loss + sentence_order_loss

        if not return_dict:
            output = (prediction_scores, sop_scores) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return AlbertForPreTrainingOutput(
            loss=total_loss,
            prediction_logits=prediction_scores,
            sop_logits=sop_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class AlbertMLMHead(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        self.dense = nn.Linear(config.hidden_size, config.embedding_size)
        self.decoder = nn.Linear(config.embedding_size, config.vocab_size)
        self.activation = ACT2FN[config.hidden_act]
        self.decoder.bias = self.bias

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        hidden_states = self.decoder(hidden_states)

        prediction_scores = hidden_states

        return prediction_scores

    def _tie_weights(self) -> None:
        # For accelerate compatibility and to not break backward compatibility
        if self.decoder.bias.device.type == "meta":
            self.decoder.bias = self.bias
        else:
            self.bias = self.decoder.bias


class AlbertSOPHead(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        self.dropout = nn.Dropout(config.classifier_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, pooled_output: torch.Tensor) -> torch.Tensor:
        dropout_pooled_output = self.dropout(pooled_output)
        logits = self.classifier(dropout_pooled_output)
        return logits
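
# Shape sketch for the factorized vocabulary projection in `AlbertMLMHead` above (illustrative
# config values chosen to match albert-base: hidden_size=768, embedding_size=128, vocab_size=30000):
# `dense` maps (batch, seq, 768) -> (batch, seq, 128) and `decoder` maps that to (batch, seq, 30000),
# mirroring the factorized input embedding so `decoder.weight` can be tied to the word embeddings.
#
#     import torch
#     from transformers import AlbertConfig
#
#     config = AlbertConfig(hidden_size=768, embedding_size=128, vocab_size=30000)
#     head = AlbertMLMHead(config)
#     assert head(torch.zeros(2, 9, 768)).shape == (2, 9, 30000)
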
@auto_docstring
class AlbertForMaskedLM(AlbertPreTrainedModel):
    _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        self.albert = AlbertModel(config, add_pooling_layer=False)
        self.predictions = AlbertMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self) -> nn.Linear:
        return self.predictions.decoder

    def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
        self.predictions.decoder = new_embeddings
        self.predictions.bias = new_embeddings.bias

    def get_input_embeddings(self) -> nn.Embedding:
        return self.albert.embeddings.word_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MaskedLMOutput, tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, AlbertForMaskedLM

        >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
        >>> model = AlbertForMaskedLM.from_pretrained("albert/albert-base-v2")

        >>> # add mask_token
        >>> inputs = tokenizer("The capital of [MASK] is Paris.", return_tensors="pt")
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> # retrieve index of [MASK]
        >>> mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
        >>> predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
        >>> tokenizer.decode(predicted_token_id)
        'france'
        ```

        ```python
        >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
        >>> labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)
        >>> outputs = model(**inputs, labels=labels)
        >>> round(outputs.loss.item(), 2)
        0.81
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_outputs = outputs[0]

        prediction_scores = self.predictions(sequence_outputs)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
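
# Pipeline-level alternative to the docstring example above (a sketch; "fill-mask" is the standard
# pipeline task for masked-LM checkpoints):
#
#     from transformers import pipeline
#
#     unmasker = pipeline("fill-mask", model="albert/albert-base-v2")
#     predictions = unmasker("The capital of France is [MASK].")
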
@auto_docstring(
    custom_intro="""
    Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """
)
class AlbertForSequenceClassification(AlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.albert = AlbertModel(config)
        self.dropout = nn.Dropout(config.classifier_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[SequenceClassifierOutput, tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
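
# Minimal classification sketch (illustrative; a freshly added head is randomly initialized, so
# real use requires fine-tuning):
#
#     import torch
#     from transformers import AutoTokenizer, AlbertForSequenceClassification
#
#     tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
#     model = AlbertForSequenceClassification.from_pretrained("albert/albert-base-v2", num_labels=2)
#     inputs = tokenizer("This movie was great!", return_tensors="pt")
#     outputs = model(**inputs, labels=torch.tensor([1]))
#     outputs.loss.backward()
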
@auto_docstring
class AlbertForTokenClassification(AlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.albert = AlbertModel(config, add_pooling_layer=False)
        classifier_dropout_prob = (
            config.classifier_dropout_prob
            if config.classifier_dropout_prob is not None
            else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[TokenClassifierOutput, tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.albert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
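
# Token-level sketch (illustrative): logits come back per sub-word token, shaped
# (batch, seq_len, num_labels), so word-level predictions need the tokenizer's word alignment.
#
#     import torch
#     from transformers import AutoTokenizer, AlbertForTokenClassification
#
#     tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
#     model = AlbertForTokenClassification.from_pretrained("albert/albert-base-v2", num_labels=9)
#     inputs = tokenizer("Paris is in France", return_tensors="pt")
#     with torch.no_grad():
#         predicted_ids = model(**inputs).logits.argmax(-1)
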
@auto_docstring
class AlbertForQuestionAnswering(AlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.albert = AlbertModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[QuestionAnsweringModelOutput, tuple]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
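
# Extractive-QA sketch (illustrative): the answer span is read off the argmax of the start and end
# logits over the encoded question/context pair.
#
#     import torch
#     from transformers import AutoTokenizer, AlbertForQuestionAnswering
#
#     tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
#     model = AlbertForQuestionAnswering.from_pretrained("albert/albert-base-v2")
#     inputs = tokenizer("Who proposed ALBERT?", "ALBERT was proposed by Lan et al.", return_tensors="pt")
#     with torch.no_grad():
#         outputs = model(**inputs)
#     start = outputs.start_logits.argmax()
#     end = outputs.end_logits.argmax()
#     answer = tokenizer.decode(inputs.input_ids[0, start : end + 1])
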
@auto_docstring
class AlbertForMultipleChoice(AlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig):
        super().__init__(config)

        self.albert = AlbertModel(config)
        self.dropout = nn.Dropout(config.classifier_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MultipleChoiceModelOutput, tuple]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors. (see
            *input_ids* above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )
        outputs = self.albert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
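
# Multiple-choice sketch (illustrative): each (prompt, choice) pair is encoded separately, the
# batch is shaped (batch, num_choices, seq_len), and `forward` above flattens it, scores each pair
# from the pooled output, and reshapes the logits back to (batch, num_choices).
#
#     import torch
#     from transformers import AutoTokenizer, AlbertForMultipleChoice
#
#     tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
#     model = AlbertForMultipleChoice.from_pretrained("albert/albert-base-v2")
#     prompt = "The weather today is"
#     choices = ["sunny and warm.", "a kind of cheese."]
#     enc = tokenizer([prompt, prompt], choices, return_tensors="pt", padding=True)
#     inputs = {k: v.unsqueeze(0) for k, v in enc.items()}  # (1, num_choices, seq_len)
#     with torch.no_grad():
#         logits = model(**inputs).logits  # (1, 2)
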
__all__ = [
    "load_tf_weights_in_albert",
    "AlbertPreTrainedModel",
    "AlbertModel",
    "AlbertForPreTraining",
    "AlbertForMaskedLM",
    "AlbertForSequenceClassification",
    "AlbertForTokenClassification",
    "AlbertForQuestionAnswering",
    "AlbertForMultipleChoice",
]