
"""PyTorch XLM model."""

import itertools
import math
from dataclasses import dataclass
from typing import Callable, Optional, Union

import numpy as np
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import gelu, get_activation
from ...cache_utils import Cache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_outputs import (
    BaseModelOutput,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, logging
from .configuration_xlm import XLMConfig


logger = logging.get_logger(__name__)


def create_sinusoidal_embeddings(n_pos, dim, out):
    position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)])
    out.requires_grad = False
    out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
    out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
    out.detach_()


def get_masks(slen, lengths, causal, padding_mask=None):
    """
    Generate hidden states mask, and optionally an attention mask.
    """
    alen = torch.arange(slen, dtype=torch.long, device=lengths.device)
    if padding_mask is not None:
        mask = padding_mask
    else:
        assert lengths.max().item() <= slen
        mask = alen < lengths[:, None]

    # attention mask is the same as mask, or triangular inferior attention (causal)
    bs = lengths.size(0)
    if causal:
        attn_mask = alen[None, None, :].repeat(bs, slen, 1) <= alen[None, :, None]
    else:
        attn_mask = mask

    # sanity check
    assert mask.size() == (bs, slen)
    assert causal is False or attn_mask.size() == (bs, slen, slen)

    return mask, attn_mask


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for outputs of question answering models using a [`~modeling_utils.XLMSQuADHead`].
    """
)
class XLMSquadHeadOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
        Classification loss as the sum of start token, end token (and is_impossible if provided) classification
        losses.
    start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
        Log probabilities for the top config.start_n_top start token possibilities (beam-search).
    start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
        Indices for the top config.start_n_top start token possibilities (beam-search).
    end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
        Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities
        (beam-search).
    end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
        Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search).
    cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
        Log probabilities for the `is_impossible` label of the answers.
    """

    loss: Optional[torch.FloatTensor] = None
    start_top_log_probs: Optional[torch.FloatTensor] = None
    start_top_index: Optional[torch.LongTensor] = None
    end_top_log_probs: Optional[torch.FloatTensor] = None
    end_top_index: Optional[torch.LongTensor] = None
    cls_logits: Optional[torch.FloatTensor] = None


class XLMPoolerStartLogits(nn.Module):
    """
    Compute SQuAD start logits from sequence hidden states.

    Args:
        config ([`XLMConfig`]):
            The config used by the model, will be used to grab the `hidden_size` of the model.
    """

    def __init__(self, config: XLMConfig):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, 1)

    def forward(
        self, hidden_states: torch.FloatTensor, p_mask: Optional[torch.FloatTensor] = None
    ) -> torch.FloatTensor:
        """
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
                The final hidden states of the model.
            p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
                Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
                should be masked.

        Returns:
            `torch.FloatTensor`: The start logits for SQuAD.
        """
        x = self.dense(hidden_states).squeeze(-1)

        if p_mask is not None:
            if p_mask.dtype == torch.float16:
                x = x * (1 - p_mask) - 65500 * p_mask
            else:
                x = x * (1 - p_mask) - 1e30 * p_mask

        return x


class XLMPoolerEndLogits(nn.Module):
    """
    Compute SQuAD end logits from sequence hidden states.

    Args:
        config ([`XLMConfig`]):
            The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps`
            to use.
    """

    def __init__(self, config: XLMConfig):
        super().__init__()
        self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
        self.activation = nn.Tanh()
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dense_1 = nn.Linear(config.hidden_size, 1)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        start_states: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        p_mask: Optional[torch.FloatTensor] = None,
    ) -> torch.FloatTensor:
        """
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
                The final hidden states of the model.
            start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
                The hidden states of the first tokens for the labeled span.
            start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                The position of the first token for the labeled span.
            p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
                Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
                should be masked.

        <Tip>

        One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides
        `start_states`.

        </Tip>

        Returns:
            `torch.FloatTensor`: The end logits for SQuAD.
        """
        assert start_states is not None or start_positions is not None, (
            "One of start_states, start_positions should be not None"
        )
        if start_positions is not None:
            slen, hsz = hidden_states.shape[-2:]
            start_positions = start_positions[:, None, None].expand(-1, -1, hsz)  # shape (bsz, 1, hsz)
            start_states = hidden_states.gather(-2, start_positions)  # shape (bsz, 1, hsz)
            start_states = start_states.expand(-1, slen, -1)  # shape (bsz, slen, hsz)

        x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1))
        x = self.activation(x)
        x = self.LayerNorm(x)
        x = self.dense_1(x).squeeze(-1)

        if p_mask is not None:
            if p_mask.dtype == torch.float16:
                x = x * (1 - p_mask) - 65500 * p_mask
            else:
                x = x * (1 - p_mask) - 1e30 * p_mask

        return x


class XLMPoolerAnswerClass(nn.Module):
    """
    Compute SQuAD 2.0 answer class from classification and start tokens hidden states.

    Args:
        config ([`XLMConfig`]):
            The config used by the model, will be used to grab the `hidden_size` of the model.
    """

    def __init__(self, config: XLMConfig):
        super().__init__()
        self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
        self.activation = nn.Tanh()
        self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        start_states: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        cls_index: Optional[torch.LongTensor] = None,
    ) -> torch.FloatTensor:
        """
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
                The final hidden states of the model.
            start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
                The hidden states of the first tokens for the labeled span.
            start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                The position of the first token for the labeled span.
            cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                Position of the CLS token for each sentence in the batch. If `None`, takes the last token.

        <Tip>

        One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides
        `start_states`.

        </Tip>

        Returns:
            `torch.FloatTensor`: The SQuAD 2.0 answer class.
        """
        # No dependency on end_feature so that we can obtain one single `cls_logits` for each sample.
        hsz = hidden_states.shape[-1]
        assert start_states is not None or start_positions is not None, (
            "One of start_states, start_positions should be not None"
        )
        if start_positions is not None:
            start_positions = start_positions[:, None, None].expand(-1, -1, hsz)  # shape (bsz, 1, hsz)
            start_states = hidden_states.gather(-2, start_positions).squeeze(-2)  # shape (bsz, hsz)

        if cls_index is not None:
            cls_index = cls_index[:, None, None].expand(-1, -1, hsz)  # shape (bsz, 1, hsz)
            cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2)  # shape (bsz, hsz)
        else:
            cls_token_state = hidden_states[:, -1, :]  # shape (bsz, hsz)

        x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1))
        x = self.activation(x)
        x = self.dense_1(x).squeeze(-1)

        return x
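
# Illustrative note (an assumption about usage, mirroring the beam-search head that follows):
# the three pooler modules above are meant to be combined along these lines.
#
#   start_logits = XLMPoolerStartLogits(config)(hidden_states, p_mask=p_mask)
#   start_log_probs = nn.functional.log_softmax(start_logits, dim=-1)
#   top_start = torch.topk(start_log_probs, k=config.start_n_top, dim=-1)
#   # End logits are then re-computed once per retained start position (with `start_states`
#   # gathered at that position), and candidate spans are ranked by the sum of the start
#   # and end log-probabilities.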
edeeeej                     f   fd       Z xZS )XLMSQuADHeadz
    A SQuAD head inspired by XLNet.

    Args:
        config ([`XLMConfig`]):
            The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps`
            to use.
    r[   c                     t         |           |j                  | _        |j                  | _        t	        |      | _        t        |      | _        t        |      | _	        y rq   )
r^   r_   start_n_top	end_n_toprZ   start_logitsru   
end_logitsr   answer_classrc   s     r2   r_   zXLMSQuADHead.__init__(  sO    !--))08,V408rG   rf   r   end_positionsr   is_impossiblerg   return_dictrh   c                 X   | j                  ||      }||||||fD ]*  }	|	|	j                         dkD  s|	j                  d       , | j                  |||      }
t	               } |||      } ||
|      }||z   dz  }|;|9| j                  |||      }t        j                         } |||      }||dz  z  }|rt        |	      S |fS |j                         \  }}}t        j                  j                  |d
      }t        j                  || j                  d
      \  }}|j                  d      j!                  dd|      }t        j"                  |d|      }|j                  d      j!                  d|dd      }|j                  d      j%                  |      }||j                  d      nd}| j                  |||      }
t        j                  j                  |
d
      }t        j                  || j&                  d
      \  }}|j)                  d| j                  | j&                  z        }|j)                  d| j                  | j&                  z        }t        j*                  d||      }| j                  |||      }|s|||||fS t        |||||      S )a  
        hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
            Final hidden states of the model on the sequence tokens.
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Positions of the first token for the labeled span.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Positions of the last token for the labeled span.
        cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Position of the CLS token for each sentence in the batch. If `None`, takes the last token.
        is_impossible (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Whether the question has a possible answer in the paragraph or not.
        p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
            Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
            should be masked.
        )rg   Nr   rj   )r   rg   r!   )r   r   g      ?)rL   r   r   )r   rg   z
blh,bl->bh)r   r   )rM   rN   rO   rP   rQ   )r   r-   squeeze_r   r	   r   r   r   rJ   r<   
functionalsoftmaxr'   topkr   	unsqueezer   r   	expand_asr   vieweinsum)rd   rf   r   r   r   r   rg   r   r   ro   r   loss_fct
start_lossend_loss
total_lossrQ   loss_fct_clscls_lossbszr>   r   start_log_probsrM   rN   start_top_index_expr   hidden_states_expandedend_log_probsrO   rP   s                                 r2   rp   zXLMSQuADHead.forward1  s   4 ((v(F&=+D%}iO #=QUUWq[JJrN#
 `fgJ')H!,@J
M:H$x/14J$)B!..}oir.s
!335'
MB hn,
:E%:6XJ=X +//1NCs mm33Lb3IO38::!1!1r40 #2";";B"?"F"Fr2s"S <<r;NOL'11!4;;Bb"ML%2%<%<Q%?%I%I&" .4-?V%%b)TF)?lcijJMM11*!1DM/4zzt~~10,} !2 6 6r4;K;Kdnn;\ ])..r43C3Cdnn3TUM <<m_UL**=|_h*iJ+_>OQ^`jkk)(;$3&7"/) rG   )NNNNNF)rR   rS   rT   rU   r   r_   r   r'   r(   r   rW   boolr   rJ   tuplerp   rr   rs   s   @r2   r   r     s    9y 9  7;480448.2!Y((Y "%"2"23Y   0 01	Y
 E,,-Y   0 01Y **+Y Y 
!5):):#;;	<Y YrG   r   c                        e Zd ZdZdef fdZ	 ddej                  deej                     dej                  fdZ
 xZS )	XLMSequenceSummarya  
    Compute a single vector summary of a sequence hidden states.

    Args:
        config ([`XLMConfig`]):
            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
            config class of your model for the default values it uses):

            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:

                - `"last"` -- Take the last token hidden state (like XLNet)
                - `"first"` -- Take the first token hidden state (like Bert)
                - `"mean"` -- Take the mean of all tokens hidden states
                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
                - `"attn"` -- Not implemented now, use multi-head attention

            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
              (otherwise to `config.hidden_size`).
            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
              another string or `None` will add no activation.
            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
            - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
    r[   c                 f   t         |           t        |dd      | _        | j                  dk(  rt        t        j                         | _        t        |d      rq|j                  ret        |d      r(|j                  r|j                  dkD  r|j                  }n|j                  }t        j                  |j                  |      | _        t        |dd       }|rt        |      nt        j                         | _        t        j                         | _        t        |d      r3|j"                  dkD  r$t        j$                  |j"                        | _        t        j                         | _        t        |d	      r5|j(                  dkD  r%t        j$                  |j(                        | _        y y y )
Nsummary_typelastattnsummary_use_projsummary_proj_to_labelsr   summary_activationsummary_first_dropoutsummary_last_dropout)r^   r_   getattrr   NotImplementedErrorr   Identitysummaryhasattrr   r   
num_labelsra   r`   r   r{   first_dropoutr   Dropoutlast_dropoutr   )rd   r[   num_classesactivation_stringre   s       r2   r_   zXLMSequenceSummary.__init__  sU   #FNFC& &%{{}6-.63J3Jv78V=Z=Z_e_p_pst_t$//$0099V%7%7EDL#F,@$GIZN3D$E`b`k`k`m[[]6238T8TWX8X!#F,H,H!IDKKM612v7R7RUV7V "

6+F+F GD 8W2rG   rf   r   rh   c                    | j                   dk(  r|dddf   }n| j                   dk(  r|dddf   }n| j                   dk(  r|j                  d      }n| j                   d	k(  r|At        j                  |d
ddddf   |j                  d   dz
  t        j
                        }nX|j                  d      j                  d      }|j                  d|j                         dz
  z  |j                  d      fz         }|j                  d|      j                  d      }n| j                   dk(  rt        | j                        }| j                  |      }| j                  |      }| j!                  |      }|S )ak  
        Compute a single vector summary of a sequence hidden states.

        Args:
            hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
                The hidden states of the last layer.
            cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
                Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.

        Returns:
            `torch.FloatTensor`: The summary of the sequence hidden states.
        r   Nrj   firstr   meanr   r   r   .r   )r6   )rj   r   )r   r   r'   	full_liker   r9   r   r   r-   r<   r   rm   r   r   r   r{   r   )rd   rf   r   outputs       r2   rp   zXLMSequenceSummary.forward  sn    &"1b5)F')"1a4(F&("''A'.F+- !OO!#rr1*-!''+a/**	 &//3==bA	%,,Uimmo6I-JmN`N`acNdMf-fg	"))"i8@@DF&(%%##F+f%(""6*rG   rq   r   rs   s   @r2   r   r     sQ    2Hy H< Y])"..);CEDTDT;U)			)rG   r   c                   X     e Zd Z ej                         Z fdZd Z	 	 	 	 	 ddZ xZ	S )MultiHeadAttentionc                    t         |           t        t        j                        | _        || _        || _        ||z  | _        |j                  | _
        | j                  | j                  z  dk(  sJ t        j                  ||      | _        t        j                  ||      | _        t        j                  ||      | _        t        j                  ||      | _        t#               | _        y )Nr   )r^   r_   nextr   NEW_IDlayer_idr-   n_headshead_dimattention_dropoutdropoutr   r`   q_link_linv_linout_linsetpruned_heads)rd   r   r-   r[   re   s       r2   r_   zMultiHeadAttention.__init__  s    /667w//xx$,,&!+++YYsC(
YYsC(
YYsC(
yyc*ErG   c                    | j                   | j                  z  }t        |      dk(  ry t        || j                  || j                        \  }}t        | j                  |      | _        t        | j                  |      | _        t        | j                  |      | _        t        | j                  |d      | _	        | j                  t        |      z
  | _        || j                  z  | _         | j                  j                  |      | _        y )Nr   r   r   )r-   r   lenr   r   r   r   r   r   r   union)rd   headsattention_head_sizeindexs       r2   prune_headszMultiHeadAttention.prune_heads  s    "hh$,,6u:?7t||M`bfbsbstu'

E:
'

E:
'

E:
)$,,1E||c%j0&5 --33E:rG   c                    |j                         \  }}	}
|du}|j                         dk(  r|d|	dfn|dddf}| j                  |      j                  |d| j                  | j
                        j                  dd      }|St        |t              rA|j                  j                  | j                        }|r|j                  }n|j                  }n|}|r|n|}|r7|5r3j                  | j                     }|j                  | j                     }n| j!                  |      }| j#                  |      }|j                  |d| j                  | j
                        j                  dd      }|j                  |d| j                  | j
                        j                  dd      }|D|s|nd}j%                  ||| j                  d|i      \  }}|rd|j                  | j                  <   |t'        j(                  | j
                        z  }t+        j,                  ||j                  dd            }|dk(  j                  |      j/                  |      }|j1                  |t+        j2                  |j4                        j6                         t8        j:                  j=                  |j?                         d	      jA                  |      }t8        j:                  jC                  || jB                  | jD                  
      }|||z  }t+        j,                  ||      }|j                  dd      jG                         j                  |d| j                  | j
                  z        }| jI                  |      f}|r||fz   }|S )zd
        Self-attention (if kv is None) or attention over source sentence (provided by kv).
        Nr   r   rj   r!   cache_positionTr   r   ptraining)%r<   r-   r   r   r   r   	transpose
isinstancer   
is_updatedgetr   cross_attention_cacheself_attention_cache	key_cachevalue_cacher   r   updatemathsqrtr'   matmulr   masked_fill_finfor6   minr   r   r   floattype_asr   r   
contiguousr   )rd   inputrC   kvcache	head_maskoutput_attentionsr   rD   qlenr-   is_cross_attentionmask_reshapeqr   curr_past_key_valuecurrent_stateskvscoresweightscontextoutputss                          r2   rp   zMultiHeadAttention.forward  s    

D#t^,0HHJ!OAtR("aBJJu""2r4<<GQQRSUVW%!45"--11$--@
%*/*E*E'*/*D*D'&+#1u%"3
#--dmm<A#//>A

>*A

>*Ar2t||T]];EEaKAr2t||T]];EEaKA 7It*11!QHXZhGij1%6:E$$T]]3		$--((aQ!23	-77?D%++fll";"?"?@--''B'?GGO--''4<<$--'X  	)G,,w*##Aq)446;;BDLLSWS`S`D`a<<(*
*GrG   )NNNFN)
rR   rS   rT   	itertoolscountr   r_   r   rp   rr   rs   s   @r2   r   r     s2    Y__F";& DrG   r   c                   *     e Zd Z fdZd Zd Z xZS )TransformerFFNc                 B   t         |           |j                  | _        t        j                  ||      | _        t        j                  ||      | _        |j                  rt        nt        j                  j                  | _        |j                  | _        d| _        y r]   )r^   r_   r   r   r`   lin1lin2gelu_activationr   r   reluactchunk_size_feed_forwardseq_len_dim)rd   in_dim
dim_hiddenout_dimr[   re   s        r2   r_   zTransformerFFN.__init__Z  sn    ~~IIfj1	IIj'2	!114r}}7I7I'-'E'E$rG   c                 Z    t        | j                  | j                  | j                  |      S rq   )r   ff_chunkr  r  )rd   r   s     r2   rp   zTransformerFFN.forwardc  s%    (8T8TVZVfVfhmnnrG   c                     | j                  |      }| j                  |      }| j                  |      }t        j                  j                  || j
                  | j                        }|S )Nr   )r  r  r  r   r   r   r   )rd   r   ro   s      r2   r  zTransformerFFN.ff_chunkf  sR    IIeHHQKIIaLMM!!!t||dmm!LrG   )rR   rS   rT   r_   rp   r  rr   rs   s   @r2   r  r  Y  s    orG   r  c                   H     e Zd ZU eed<   dZdZ fdZed        Z	d Z
 xZS )XLMPreTrainedModelr[   Ntransformerc                 $    t        |   |i | y rq   )r^   r_   )rd   inputskwargsre   s      r2   r_   zXLMPreTrainedModel.__init__t  s    &+F+rG   c                 &   t        j                  g dg dg dg      }t        j                  g dg dg dg      }| j                  j                  r8| j                  j                  dkD  rt        j                  g dg dg dg      }nd }|||dS )	N)      r   r   r   )r   r!   r   r   r   )r   r   r         )r   r   r   r   r   )r   r   r   r   r   )r   r   r   r   r   r   )	input_idsattention_masklangs)r'   tensorr[   use_lang_embn_langs)rd   inputs_list
attns_list
langs_lists       r2   dummy_inputszXLMPreTrainedModel.dummy_inputsw  sr    llO_o#VW\\?O_"UV
;;##(;(;a(?&YZJJ(JQ[\\rG   c                    t        |t        j                        r| j                  V| j                  j                  @t        j
                  j                  |j                  d| j                  j                         |j                  1|j                  j                  |j                     j                          t        |t        j                        r| j                  | j                  j                  vt        j
                  j                  |j                  d| j                  j                         |j                  *t        j
                  j                  |j                  d       t        |t        j                        rI|j                  j                  j                          |j                  j                  j!                  d       t        |t"              rb| j                  j$                  rKt'        | j                  j(                  | j                  j*                  |j,                  j                         yyy)zInitialize the weights.Nr   )r   stdg        g      ?)r.   )r   r   	Embeddingr[   embed_init_stdinitnormal_weightpadding_idxdatazero_r`   init_stdr   	constant_r|   fill_XLMModelsinusoidal_embeddingsr3   max_position_embeddingsemb_dimposition_embeddings)rd   modules     r2   _init_weightsz XLMPreTrainedModel._init_weights  sq   fbll+{{&4;;+E+E+QA4;;;U;UV!!-""6#5#56<<>fbii({{&4;;+?+?+KA4;;;O;OP;;*GG%%fkk37fbll+KK""$MM$$S)fh'DKK,M,M(33T[[5H5HfNhNhNoNo -N'rG   )rR   rS   rT   r   rV   load_tf_weightsbase_model_prefixr_   propertyr5  rI  rr   rs   s   @r2   r"  r"  n  s4    O%, ] ]rG   r"  zU
    Base class for outputs of question answering models using a `XLMSQuADHead`.
    c                   f   e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeej                     ed<   dZeej                     ed<   dZeej                     ed<   dZeeej                  d	f      ed
<   dZeeej                  d	f      ed<   y)XLMForQuestionAnsweringOutputrK   NrL   rM   rN   rO   rP   rQ   .rf   
attentions)rR   rS   rT   rU   rL   r   r'   r(   rV   rM   rN   rW   rO   rP   rQ   rf   r   rO  rX   rG   r2   rN  rN    s    " )-D(5$$
%,7;%"3"34;26OXe../659x 1 12904M8E,,-4.2J**+2=AM8E%"3"3S"89:A:>Ju00#567>rG   rN  c                        e Zd Z fdZd Zd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     deeee	j                  f      dee	j                     dee	j                     dee   dee   dee   dee	j                     deeef   fd       Z xZS )rC  c           
      F	   t         |   |       |j                  | _        |j                   | _        | j                  rt	        d      |j
                  | _        |j                  | _        |j                  | _        |j                  | _        |j                  | _	        |j                  | _
        |j                  | _        | j                  dz  | _        |j                  | _        |j                  | _        |j                   | _        |j"                  | _        | j                  | j                  z  dk(  sJ d       t%        j&                  |j(                  | j                        | _        |j                  dkD  r;|j                  r/t%        j&                  | j                  | j                        | _        t%        j&                  | j                  | j                  | j                        | _        t%        j0                  | j                  |j2                        | _        t%        j6                         | _        t%        j6                         | _        t%        j6                         | _        t%        j6                         | _        tA        | j                        D ]  }| j8                  jC                  tE        | j                  | j                  |             | j:                  jC                  t%        j0                  | j                  |j2                               | j<                  jC                  tG        | j                  | j                  | j                  |             | j>                  jC                  t%        j0                  | j                  |j2                                tI        |d	      r|jJ                  jM                         jO                         }i |_%        |D ]h  \  }}| j8                  tQ        |         j                  |j                  k(  s6| jS                  tQ        |      tU        tW        tP        |            i       j | jY                          | j[                  d
t]        j^                  |j(                        ja                  d      d       y )Nz,Currently XLM can only be used as an encoderr*  r   z-transformer dim must be a multiple of n_headsr   )r=  rw   )r[   r   position_ids)r   rj   F)
persistent)1r^   r_   
is_encoder
is_decoderr   r@   r1  r0  n_words	eos_index	pad_indexrF  r-   
hidden_dimr   n_layersr   r   r   r8  rE  rG  lang_embeddings
embeddingsr|   r}   layer_norm_emb
ModuleListrO  layer_norm1ffnslayer_norm2r$   appendr   r  r   r   copyitemsintr   listmap	post_initregister_bufferr'   r8   r   )rd   r[   _r   layerr   re   s         r2   r_   zXLMModel.__init__  sG     !++$///??%&TUUmm ~~"//~~)))) >>((Q,~~~~!'!9!9xx$,,&!+\-\\+ $&<<0N0NPTPXPX#Y >>A&"5"5#%<<dhh#GD ,,t||TXX4>>Z ll4889N9NO --/==?MMO	==?
 t}}% 	WAOO""#5dllDHHU[#\]##BLLv?T?T$UV II^DHHdootxxX^_`##BLLv?T?T$UV	W 6>*!..335;;=L"$F , Ju??3u:.66&..H$$c%j$s32G%HIJ
 	ELL)G)GHOOPWXej 	 	
rG   c                     | j                   S rq   r\  rd   s    r2   get_input_embeddingszXLMModel.get_input_embeddings   s    rG   c                     || _         y rq   rm  rd   new_embeddingss     r2   set_input_embeddingszXLMModel.set_input_embeddings  s	    (rG   c                 p    |j                         D ]#  \  }}| j                  |   j                  |       % y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)rd  rO  r   )rd   heads_to_prunerk  r   s       r2   _prune_headszXLMModel._prune_heads  s7    
 +002 	6LE5OOE"..u5	6rG   r,  r-  r.  token_type_idsrR  r?   r   r  inputs_embedsr  output_hidden_statesr   r   rh   c           	         |
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }||j	                         \  }}n|	j	                         dd \  }}||j
                  n|	j
                  }t        |t              st        j                  |      }|K|.|| j                  k7  j                  d      j                         }nt        j                  |g|z  |      }|j	                  d      |k(  sJ |j                         j!                         |k  sJ t#        ||| j$                  |      \  }}|| j&                  ddd|f   }n|j	                         ||fk(  sJ ||j	                         ||fk(  sJ | j)                  || j                   j*                        }|S|Q||j-                         z
  }|dd| df   }|dd| df   }||dd| df   }|dd| df   }|dd| df   }|	| j/                  |      }	|	| j1                  |      j3                  |	      z   }|/| j4                  r#| j6                  dkD  r|| j9                  |      z   }||| j/                  |      z   }| j;                  |      }t<        j>                  jA                  || j@                  | jB                        }||jE                  d      jG                  |jH                        z  }|rd	nd}|
rd	nd}tK        | j*                        D ]  }|r||fz   } | jL                  |   |||||   |
|
      }|d   }|
r	||d   fz   }t<        j>                  jA                  || j@                  | jB                        }||z   } | jN                  |   |      }| | jP                  |   |      z   } | jR                  |   |      }||jE                  d      jG                  |jH                        z  } |r||fz   }|stU        d |||fD              S tW        |||      S )$  
        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
            languages ids which can be obtained from the language names by using two conversion mappings provided in
            the configuration of the model (only provided for multilingual models). More precisely, the *language name
            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).

            See usage examples detailed in the [multilingual documentation](../multilingual).
        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
            also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
            `[0, ..., input_ids.size(-1)]`.
        cache (`dict[str, torch.FloatTensor]`, *optional*):
            Dictionary string to `torch.FloatTensor` that contains precomputed hidden states (key and values in the
            attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
            decoding.

            The dictionary object will be modified in-place during the forward pass to add newly computed
            hidden-states.
        Nrj   r   r   )r7   r   )rA   r   rX   )r   r  r  r   c              3   &   K   | ]	  }||  y wrq   rX   ).0r
  s     r2   	<genexpr>z#XLMModel.forward.<locals>.<genexpr>  s     Yq1=Ys   )last_hidden_staterf   rO  ),r[   r  ry  use_return_dictr<   r7   r   r   r   from_legacy_cacherX  sumr9   r'   r/  r:   r;   rF   r@   rR  get_head_maskrZ  get_seq_lengthr\  rG  r   r0  r1  r[  r]  r   r   r   r   r   tor6   r$   rO  r_  r`  ra  r   r   )rd   r,  r-  r.  rw  rR  r?   r   r  rx  r  ry  r   r   r&  rD   r>   r7   rC   rE   _slenr/  rf   rO  iattn_outputsr   s                              r2   rp   zXLMModel.forward  so   N 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]  ~~'HB$))+CR0HB%.%:!!@T@T%''99%@E?$$6;;;BGGI,,v{6B ||A"$$${{}!!#t+++ $D'4;;^\i ,,QX6L$$&2t*444 ::<B:--- &&y$++2F2FG	 !65//11E!!eVW*-I'E67
3L a%j)E67
#D!!eVW*-I   OOI6M!9!9,!G!Q!QR_!``!2!2t||a7Gd22599F%doon==F$$V,&&v&V$..$''55 3,R$
t}}% 	:A# -	 9 .4??1-#A,"3-L  ?D '<?*<<
==(((VDd]F(T%%a(0F ldiil622F(T%%a(0FdnnR(++FLL99F/	:4  )VI5MYV]J$GYYY}akllrG   NNNNNNNNNNNNN)rR   rS   rT   r_   ro  rs  rv  r   r   r'   Tensordictstrr   r   r   r   rp   rr   rs   s   @r2   rC  rC    sz   D
L)6  -115(,15/3*.37,004,0/3&*15ImELL)Im !.Im %	Im
 !.Im u||,Im %,,'Im S%,,./0Im ELL)Im  -Im $D>Im 'tnIm d^Im !.Im  
uo%	&!Im ImrG   rC  c                   *     e Zd ZdZ fdZddZ xZS )XLMPredLayerz?
    Prediction layer (cross_entropy or adaptive_softmax).
    c                    t         |           |j                  | _        |j                  | _        |j                  | _        |j
                  }|j                  du r(t        j                  ||j                  d      | _        y t        j                  ||j                  |j                  |j                  d      | _        y )NFTr   )in_features	n_classescutoffs	div_value	head_bias)r^   r_   asmrV  rX  rF  r   r`   projAdaptiveLogSoftmaxWithLossasm_cutoffsasm_div_value)rd   r[   r-   re   s      r2   r_   zXLMPredLayer.__init__  s    ::~~))nn::		#v~~DADI55 ..** ..DIrG   c                 z   d}| j                   du rl| j                  |      }|f|z   }|Qt        j                  j	                  |j                  d| j                        |j                  d      d      }|f|z   }|S | j                  j                  |      }|f|z   }|| j                  ||      \  }}|f|z   }|S )z,Compute the loss, and optionally the scores.rX   Frj   r   )	reduction)r  r  r   r   cross_entropyr   rV  log_prob)rd   ro   yr  r  rL   rj  s          r2   rp   zXLMPredLayer.forward  s    88uYYq\Fi')G}}}226;;r4<<3PRSRXRXY[R\hn2o'G+  YY''*Fi')G}))Aq/4'G+rG   rq   )rR   rS   rT   rU   r_   rp   rr   rs   s   @r2   r  r    s    $rG   r  z
    The XLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c            "           e Zd ZdgZ fdZd Zd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 dde	e
j                     de	e
j                     de	e
j                     d	e	e
j                     d
e	e
j                     de	e
j                     de	eee
j                  f      de	e
j                     de	e
j                     de	e
j                     de	e   de	e   de	e   de	e
j                     deeef   fd       Z xZS )XLMWithLMHeadModelzpred_layer.proj.weightc                     t         |   |       t        |      | _        t	        |      | _        | j                          y rq   )r^   r_   rC  r#  r  
pred_layerrh  rc   s     r2   r_   zXLMWithLMHeadModel.__init__  s5     #F+&v. 	rG   c                 .    | j                   j                  S rq   r  r  rn  s    r2   get_output_embeddingsz(XLMWithLMHeadModel.get_output_embeddings  s    ###rG   c                 &    || j                   _        y rq   r  rq  s     r2   set_output_embeddingsz(XLMWithLMHeadModel.set_output_embeddings  s    -rG   c                 P   | j                   j                  }| j                   j                  }|j                  d   }t	        j
                  |df|t        j                  |j                        }t	        j                  ||gd      }|t	        j                  ||      }nd }||dS )Nr   r   r5   r   )r,  r.  )
r[   mask_token_idlang_idr   r'   fullr9   r7   r   r   )rd   r,  r&  r  r  effective_batch_size
mask_tokenr.  s           r2   prepare_inputs_for_generationz0XLMWithLMHeadModel.prepare_inputs_for_generation  s     11++%%(q1ZZ!5q 9=PUPZPZclcscst
IIy*51=	OOIw7EE&77rG   r,  r-  r.  rw  rR  r?   r   r  rx  labelsr  ry  r   r   rh   c                     ||n| j                   j                  } | j                  |f||||||||	||||d|}|d   }| j                  ||
      }|s||dd z   S t	        |
|d   nd|
|d   n|d   |j
                  |j                        S )a  
        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
            languages ids which can be obtained from the language names by using two conversion mappings provided in
            the configuration of the model (only provided for multilingual models). More precisely, the *language name
            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).

            See usage examples detailed in the [multilingual documentation](../multilingual).
        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
            also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
            `[0, ..., input_ids.size(-1)]`.
        cache (`dict[str, torch.FloatTensor]`, *optional*):
            Dictionary string to `torch.FloatTensor` that contains precomputed hidden states (key and values in the
            attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
            decoding.

            The dictionary object will be modified in-place during the forward pass to add newly computed
            hidden-states.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        N)r-  r.  rw  rR  r?   r   r  rx  r  ry  r   r   r   r   rL   logitsrf   rO  )r[   r  r#  r  r   rf   rO  )rd   r,  r-  r.  rw  rR  r?   r   r  rx  r  r  ry  r   r   r&  transformer_outputsr   r  s                      r2   rp   zXLMWithLMHeadModel.forward  s    X &1%<k$++B]B].d..
))%'/!5#)
 
" %Q'//&&10444%1t!'71:WQZ-;;*55	
 	
rG   NNNNNNNNNNNNNN)rR   rS   rT   _tied_weights_keysr_   r  r  r  r   r   r'   r  r  r  r   r   r   r   rp   rr   rs   s   @r2   r  r    s    33$.8  -115(,15/3*.37,004)-,0/3&*15I
ELL)I
 !.I
 %	I

 !.I
 u||,I
 %,,'I
 S%,,./0I
 ELL)I
  -I
 &I
 $D>I
 'tnI
 d^I
 !.I
" 
un$	%#I
 I
rG   r  z
    XLM Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g.
    for GLUE tasks.
    c                        e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deee	ej                  f      d	eej                     d
eej                     deej                     dee
   dee
   dee
   deeef   fd       Z xZS )XLMForSequenceClassificationc                     t         |   |       |j                  | _        || _        t	        |      | _        t        |      | _        | j                          y rq   )	r^   r_   r   r[   rC  r#  r   sequence_summaryrh  rc   s     r2   r_   z%XLMForSequenceClassification.__init__?  sJ      ++#F+ 26 : 	rG   r,  r-  r.  rw  rR  r?   r   r  rx  r  r  ry  r   rh   c                 $   ||n| j                   j                  }| j                  |||||||||	|||      }|d   }| j                  |      }d}|
| j                   j                  | j
                  dk(  rd| j                   _        nl| j
                  dkD  rL|
j                  t        j                  k(  s|
j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt               }| j
                  dk(  r& ||j                         |
j                               }n |||
      }n| j                   j                  dk(  r=t               } ||j                  d| j
                        |
j                  d            }n,| j                   j                  dk(  rt               } |||
      }|s|f|dd z   }||f|z   S |S t        |||j                   |j"                  	      S )
a  
        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
            languages ids which can be obtained from the language names by using two conversion mappings provided in
            the configuration of the model (only provided for multilingual models). More precisely, the *language name
            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).

            See usage examples detailed in the [multilingual documentation](../multilingual).
        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
            also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
            `[0, ..., input_ids.size(-1)]`.
        cache (`dict[str, torch.FloatTensor]`, *optional*):
            Dictionary string to `torch.FloatTensor` that contains precomputed hidden states (key and values in the
            attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
            decoding.

            The dictionary object will be modified in-place during the forward pass to add newly computed
            hidden-states.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr-  r.  rw  rR  r?   r   r  rx  r  ry  r   r   r   
regressionsingle_label_classificationmulti_label_classificationrj   r  )r[   r  r#  r  problem_typer   r6   r'   r9   re  r
   rm   r	   r   r   r   rf   rO  )rd   r,  r-  r.  rw  rR  r?   r   r  rx  r  r  ry  r   r  r   r  rL   r   s                      r2   rp   z$XLMForSequenceClassification.forwardJ  s   T &1%<k$++B]B]"..))%'/!5# / 
 %Q'&&v.{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y!4QR!88F)-)9TGf$EvE'-;;*55	
 	
rG   r  )rR   rS   rT   r_   r   r   r'   r  r  r  r   r   r   r   rp   rr   rs   s   @r2   r  r  8  sZ   	  -115(,15/3*.37,004)-,0/3&*]
ELL)]
 !.]
 %	]

 !.]
 u||,]
 %,,']
 S%,,./0]
 ELL)]
  -]
 &]
 $D>]
 'tn]
 d^]
 
u..	/]
 ]
rG   r  z
    XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c            "           e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deee	ej                  f      d	eej                     d
eej                     deej                     deej                     dee
   dee
   dee
   deeef   fd       Z xZS )XLMForQuestionAnsweringSimplec                     t         |   |       t        |      | _        t	        j
                  |j                  |j                        | _        | j                          y rq   )
r^   r_   rC  r#  r   r`   ra   r   
qa_outputsrh  rc   s     r2   r_   z&XLMForQuestionAnsweringSimple.__init__  sG     #F+))F$6$68I8IJ 	rG   r,  r-  r.  rw  rR  r?   r   r  rx  r   r   r  ry  r   rh   c                 .   ||n| j                   j                  }| j                  |||||||||	|||      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d}|
|t        |
j                               dkD  r|
j                  d      }
t        |j                               dkD  r|j                  d      }|j                  d      }|
j                  d|      }
|j                  d|      }t        |      } |||
      } |||      }||z   dz  }|s||f|dd z   }||f|z   S |S t        ||||j                  |j                  	      S )
r{  Nr  r   r   rj   r   )ignore_indexr!   )rL   r   r   rf   rO  )r[   r  r#  r  splitrm   r   r   r<   clampr	   r   rf   rO  )rd   r,  r-  r.  rw  rR  r?   r   r  rx  r   r   r  ry  r   r  sequence_outputr  r   r   r   ignored_indexr   r   r   r   s                             r2   rp   z%XLMForQuestionAnsweringSimple.forward  s   N &1%<k$++B]B]"..))%'/!5# / 
 .a01#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/2Eab2IIF/9/EZMF*Q6Q+%!-;;*55
 	
rG   r  )rR   rS   rT   r_   r   r   r'   r  r  r  r   r   r   r   rp   rr   rs   s   @r2   r  r    sq     -115(,15/3*.37,0042604,0/3&*Y
ELL)Y
 !.Y
 %	Y

 !.Y
 u||,Y
 %,,'Y
 S%,,./0Y
 ELL)Y
  -Y
 "%,,/Y
  -Y
 $D>Y
 'tnY
 d^Y
  
u22	3!Y
 Y
rG   r  c            (       .    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deee	ej                  f      d	eej                     d
eej                     deej                     deej                     deej                     deej                     deej                     dee
   dee
   dee
   deeef   f$d       Z xZS )XLMForQuestionAnsweringc                     t         |   |       t        |      | _        t	        |      | _        | j                          y rq   )r^   r_   rC  r#  r   r  rh  rc   s     r2   r_   z XLMForQuestionAnswering.__init__  s5     #F+&v. 	rG   r,  r-  r.  rw  rR  r?   r   r  rx  r   r   r   r   rg   r  ry  r   rh   c                    ||n| j                   j                  }| j                  |||||||||	|||      }|d   }| j                  ||
|||||      }|s||dd z   S t	        |j
                  |j                  |j                  |j                  |j                  |j                  |j                  |j                        S )a]  
        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
            language ids which can be obtained from the language names by using two conversion mappings provided in
            the configuration of the model (only provided for multilingual models). More precisely, the *language name
            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).

            See usage examples detailed in the [multilingual documentation](../multilingual).
        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
            also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
            `[0, ..., input_ids.size(-1)]`.
        cache (`dict[str, torch.FloatTensor]`, *optional*):
            Dictionary string to `torch.FloatTensor` that contains precomputed hidden states (key and values in the
            attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
            decoding.

            The dictionary object will be modified in-place during the forward pass to add newly computed
            hidden-states.
        is_impossible (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels indicating whether a question has an answer or no answer (SQuAD 2.0).
        cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the classification token to use as input for computing plausibility of the
            answer.
        p_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...). 1.0 means the token should be
            masked, 0.0 means the token is not masked.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, XLMForQuestionAnswering
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-mlm-en-2048")
        >>> model = XLMForQuestionAnswering.from_pretrained("FacebookAI/xlm-mlm-en-2048")

        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(
        ...     0
        ... )  # Batch size 1
        >>> start_positions = torch.tensor([1])
        >>> end_positions = torch.tensor([3])

        >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
        >>> loss = outputs.loss
        ```Nr  r   )r   r   r   r   rg   r   r   )rL   rM   rN   rO   rP   rQ   rf   rO  )r[   r  r#  r  rN  rL   rM   rN   rO   rP   rQ   rf   rO  )rd   r,  r-  r.  rw  rR  r?   r   r  rx  r   r   r   r   rg   r  ry  r   r  r   r  s                        r2   rp   zXLMForQuestionAnswering.forward#  s    H &1%<k$++B]B]"..))%'/!5# / 
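
        Continuing the example above, a hedged sketch of the SQuAD-2.0-style inputs documented earlier (`cls_index`,
        `is_impossible`, `p_mask`); the index values below are illustrative placeholders:

        ```python
        >>> # 1.0 marks tokens that may not be part of an answer (here: the special tokens).
        >>> special = tokenizer.get_special_tokens_mask(input_ids[0].tolist(), already_has_special_tokens=True)
        >>> p_mask = torch.tensor([special], dtype=torch.float)
        >>> cls_index = torch.tensor([0])  # assumed position of the classification token
        >>> is_impossible = torch.tensor([0])  # 0 = the question has an answer
        >>> outputs = model(
        ...     input_ids,
        ...     start_positions=start_positions,
        ...     end_positions=end_positions,
        ...     cls_index=cls_index,
        ...     is_impossible=is_impossible,
        ...     p_mask=p_mask,
        ... )
        >>> loss = outputs.loss
        ```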
 %Q'//+''# " 
 0444, ' ; ;#33%77!//))-;;*55	
 		
rG   )NNNNNNNNNNNNNNNNN)rR   rS   rT   r_   r   r   r'   r  r  r  r   r   r   rN  rp   rr   rs   s   @r2   r  r    s     -115(,15/3*.37,004260404,0)-,0/3&*%l
ELL)l
 !.l
 %	l

 !.l
 u||,l
 %,,'l
 S%,,./0l
 ELL)l
  -l
 "%,,/l
  -l
  -l
 ELL)l
 &l
  $D>!l
" 'tn#l
$ d^%l
& 
u33	4'l
 l
rG   r  c                        e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deee	ej                  f      d	eej                     d
eej                     deej                     dee
   dee
   dee
   deeef   fd       Z xZS )XLMForTokenClassificationc                 ,   t         |   |       |j                  | _        t        |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _
        | j                          y rq   )r^   r_   r   rC  r#  r   r   r   r`   ra   
classifierrh  rc   s     r2   r_   z"XLMForTokenClassification.__init__  sh      ++#F+zz&..1))F$6$68I8IJ 	rG   r,  r-  r.  rw  rR  r?   r   r  rx  r  r  ry  r   rh   c                    ||n| j                   j                  }| j                  |||||||||	|||      }|d   }| j                  |      }| j	                  |      }d}|
<t               } ||j                  d| j                        |
j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a  
        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
            language ids which can be obtained from the language names by using two conversion mappings provided in
            the configuration of the model (only provided for multilingual models). More precisely, the *language name
            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).

            See usage examples detailed in the [multilingual documentation](../multilingual).
        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
            also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
            `[0, ..., input_ids.size(-1)]`.
        cache (`dict[str, torch.FloatTensor]`, *optional*):
            Dictionary string to `torch.FloatTensor` that contains precomputed hidden states (key and values in the
            attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
            decoding.

            The dictionary object will be modified in-place during the forward pass to add newly computed
            hidden-states.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
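
        Example (a hedged sketch; the checkpoint name and the per-token label ids are illustrative placeholders):

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, XLMForTokenClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-mlm-en-2048")
        >>> model = XLMForTokenClassification.from_pretrained("FacebookAI/xlm-mlm-en-2048", num_labels=5)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> # One label per token, shape (batch_size, sequence_length).
        >>> labels = torch.zeros_like(inputs["input_ids"])
        >>> outputs = model(**inputs, labels=labels)
        >>> loss, logits = outputs.loss, outputs.logits  # logits: (batch_size, sequence_length, num_labels)
        ```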
        Nr  r   rj   r   r  )r[   r  r#  r   r  r	   r   r   r   rf   rO  )rd   r,  r-  r.  rw  rR  r?   r   r  rx  r  r  ry  r   r  r  r  rL   r   r   s                       r2   rp   z!XLMForTokenClassification.forward  s   P &1%<k$++B]B]""))%'/!5# # 
 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
rG   r  )rR   rS   rT   r_   r   r   r'   r  r  r  r   r   r   r   rp   rr   rs   s   @r2   r  r    sZ   	  -115(,15/3*.37,004)-,0/3&*K
ELL)K
 !.K
 %	K

 !.K
 u||,K
 %,,'K
 S%,,./0K
 ELL)K
  -K
 &K
 $D>K
 'tnK
 d^K
 
u++	,K
 K
rG   r  c                        e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deee	ej                  f      d	eej                     d
eej                     deej                     dee
   dee
   dee
   deeef   fd       Z xZS )XLMForMultipleChoicec                     t        |   |g|i | t        |      | _        t	        |      | _        t        j                  |j                  d      | _	        | j                          y r]   )r^   r_   rC  r#  r   r  r   r`   r   logits_projrh  )rd   r[   r%  r&  re   s       r2   r_   zXLMForMultipleChoice.__init__  sY    3&3F3#F+ 26 :99V%6%6: 	rG   r,  r-  r.  rw  rR  r?   r   r  rx  r  r  ry  r   rh   c                    ||n| j                   j                  }||j                  d   n|	j                  d   }|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|	1|	j                  d|	j	                  d      |	j	                  d            nd}	|t
        j                  d       d}| j                  |||||||||	|||      }|d   }| j                  |      }| j                  |      }|j                  d|      }d}|
t               } |||
      }|s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )	a  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        langs (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
            language ids which can be obtained from the language names by using two conversion mappings provided in
            the configuration of the model (only provided for multilingual models). More precisely, the *language name
            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).

            See usage examples detailed in the [multilingual documentation](../multilingual).
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
            also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
            `[0, ..., input_ids.size(-1)]`.
        cache (`dict[str, torch.FloatTensor]`, *optional*):
            Dictionary string to `torch.FloatTensor` that contains precomputed hidden states (key and values in the
            attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
            decoding.

            The dictionary object will be modified in-place during the forward pass to add newly computed
            hidden-states.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
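
        Example (a hedged sketch; the prompt/choice strings and the label are illustrative placeholders):

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, XLMForMultipleChoice

        >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-mlm-en-2048")
        >>> model = XLMForMultipleChoice.from_pretrained("FacebookAI/xlm-mlm-en-2048")

        >>> prompt = "The sky is"
        >>> choices = ["blue.", "made of cheese."]
        >>> # Encode each (prompt, choice) pair, then add the leading num_choices dimension so the final
        >>> # shape is (batch_size=1, num_choices=2, sequence_length).
        >>> encoding = tokenizer([prompt, prompt], choices, return_tensors="pt", padding=True)
        >>> inputs = {k: v.unsqueeze(0) for k, v in encoding.items()}
        >>> outputs = model(**inputs, labels=torch.tensor([0]))  # the first choice is the (assumed) correct one
        >>> loss, logits = outputs.loss, outputs.logits  # logits: (batch_size, num_choices)
        ```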
        Nr   rj   r   zrThe `lengths` parameter cannot be used with the XLM multiple choice models. Please use the attention mask instead.)r,  r-  r.  rw  rR  r?   r   r  rx  r  ry  r   r   r  )r[   r  r   r   r<   loggerwarningr#  r  r  r	   r   rf   rO  )rd   r,  r-  r.  rw  rR  r?   r   r  rx  r  r  ry  r   num_choicesr  r   r  reshaped_logitsrL   r   s                        r2   rp   zXLMForMultipleChoice.forward  s.   D &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei272C

2uzz"~. ( r=#5#5b#9=;M;Mb;QR 	 NN* G"..))%'/!5# / 
 %Q'&&v.!!&) ++b+6')HOV4D%'*=ab*AAF)-)9TGf$EvE("-;;*55	
 	
rG   r  )rR   rS   rT   r_   r   r   r'   r  r  r  r   r   r   r   rp   rr   rs   s   @r2   r  r    sZ     -115(,15/3*.37,004)-,0/3&*w
ELL)w
 !.w
 %	w

 !.w
 u||,w
 %,,'w
 S%,,./0w
 ELL)w
  -w
 &w
 $D>w
 'tnw
 d^w
 
u//	0w
 w
rG   r  )r  r  r  r  r  rC  r"  r  rq   )ErU   r  r   dataclassesr   typingr   r   r   numpyr"   r'   r   torch.nnr   r	   r
   activationsr   r   cache_utilsr   r   
generationr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   configuration_xlmr   
get_loggerrR   r  r3   rF   rJ   ModulerZ   ru   r   r   r   r   r  r"  rN  rC  r  r  r  r  r  r  r  __all__rX   rG   r2   <module>r     s~     ! , ,    A A / 5 )  . l l 9 9 ( 
		H	%2 
3 3 34#299 #LB BJ>299 >Bm299 m`` `Fe ePRYY * $ $ $N 
?K ? ?8 _m! _m _mD'299 'T j
+_ j
j
Z j
#5 j
j
Z d
$6 d
d
N w
0 w
 w
t X
 2 X
 X
v C
- C
 C
L	rG   