
    rh                    X   d Z ddlZddlZddlZddlmZ ddlmZmZ ddl	Z	ddl
Z	ddl	mZ ddlmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZ ddlmZmZm Z  ddl!m"Z"m#Z# ddl$m%Z%  e#jL                  e'      Z(g dZ)e e"d       G d de                    Z*d Z+ G d dejX                        Z- G d dejX                        Z. G d dejX                        Z/ G d dejX                        Z0 G d dejX                        Z1 G d  d!ejX                        Z2 G d" d#ejX                        Z3 G d$ d%ejX                        Z4 G d& d'e      Z5 G d( d)ejX                        Z6 G d* d+ejX                        Z7 G d, d-ejX                        Z8 G d. d/ejX                        Z9 G d0 d1ejX                        Z:e" G d2 d3e             Z;e" G d4 d5e;             Z< e"d6       G d7 d8e;             Z=e" G d9 d:e;             Z>e" G d; d<e;             Z?e" G d= d>e;             Z@g d?ZAy)@zPyTorch CANINE model.    N)	dataclass)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputModelOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )CanineConfig)   +   ;   =   I   a   g   q                           a  
    Output type of [`CanineModel`]. Based on [`~modeling_outputs.BaseModelOutputWithPooling`], but with slightly
    different `hidden_states` and `attentions`, as these also include the hidden states and attentions of the shallow
    Transformer encoders.
    )custom_introc                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                        ed<   dZeeej                        ed<   y)CanineModelOutputWithPoolinga  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model (i.e. the output of the final
        shallow Transformer encoder).
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Hidden-state of the first token of the sequence (classification token) at the last layer of the deep
        Transformer encoder, further processed by a Linear layer and a Tanh activation function. The Linear layer
        weights are trained from the next sentence prediction (classification) objective during pretraining.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the input to each encoder + one for the output of each layer of each
        encoder) of shape `(batch_size, sequence_length, hidden_size)` and `(batch_size, sequence_length //
        config.downsampling_rate, hidden_size)`. Hidden-states of the model at the output of each layer plus the
        initial input to each Transformer encoder. The hidden states of the shallow encoders have length
        `sequence_length`, but the hidden states of the deep encoder have length `sequence_length` //
        `config.downsampling_rate`.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of the 3 Transformer encoders of shape `(batch_size,
        num_heads, sequence_length, sequence_length)` and `(batch_size, num_heads, sequence_length //
        config.downsampling_rate, sequence_length // config.downsampling_rate)`. Attentions weights after the
        attention softmax, used to compute the weighted average in the self-attention heads.
    Nlast_hidden_statepooler_outputhidden_states
attentions)__name__
__module____qualname____doc__r.   r   torchFloatTensor__annotations__r/   r0   tupler1        }/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/canine/modeling_canine.pyr-   r-   3   sh    , 6:x 1 12915M8E--.58<M8E%"3"345<59Ju00129r;   r-   c           	         	 ddl }ddl}ddl}t        j                  j                  |      }t        j                  d|        |j                  j                  |      }g }g }	|D ]^  \  }
}t        j                  d|
 d|        |j                  j                  ||
      }|j                  |
       |	j                  |       ` t        ||	      D ]4  \  }
}|
j                  d      }
t!        d |
D              r(t        j                  d	dj#                  |
              R|
d   d
k(  rd|
d<   nU|
d   dk(  r|
j%                  |
d          n8|
d   dk(  rd|
d<   n*|
d   dk(  r
dg|
dd z   }
n|
d   dk(  r|
d   dv r	dg|
dd z   }
| }|
D ]  }|j'                  d|      rd|vr|j                  d|      }n|g}|d   dk(  s|d   dk(  rt)        |d      }nB|d   dk(  s|d   dk(  rt)        |d      }n%|d   dk(  rt)        |d      }n	 t)        ||d         }t-        |      d k\  st/        |d         }||   } d!d d"k(  rt)        |d      }nD|d#d t1        d$      D cg c]  }d%| 	 c}v rt)        |d      }n|dk(  r|j3                  |      }|j4                  |j4                  k7  r&t7        d&|j4                   d'|j4                   d(      t        j                  d)|
        t9        j:                  |      |_        7 | S # t        $ r t        j                  d        w xY w# t*        $ r+ t        j                  d	dj#                  |
              Y w xY wc c}w )*z'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape /c              3   $   K   | ]  }|d v  
 yw))adam_vadam_mAdamWeightDecayOptimizerAdamWeightDecayOptimizer_1global_stepclsautoregressive_decoderchar_output_weightsNr:   ).0ns     r<   	<genexpr>z,load_tf_weights_in_canine.<locals>.<genexpr>v   s$      
  	

s   z	Skipping bertencoderr   
embeddingssegment_embeddingstoken_type_embeddingsinitial_char_encoderchars_to_moleculesfinal_char_encoder)	LayerNormconv
projectionz[A-Za-z]+_\d+Embedderz_(\d+)kernelgammaweightoutput_biasbetabiasoutput_weights   i_embeddingsi   	Embedder_zPointer shape z and array shape z mismatchedzInitialize PyTorch weight )renumpy
tensorflowImportErrorloggererrorospathabspathinfotrainlist_variablesload_variableappendzipsplitanyjoinremove	fullmatchgetattrAttributeErrorlenintrange	transposeshape
ValueErrorr6   
from_numpydata)modelconfigtf_checkpoint_pathrc   nptftf_path	init_varsnamesarraysnamer}   arraypointerm_namescope_namesnumis                     r<   load_tf_weights_in_caniner   X   s   
 ggoo01G
KK8	BC''0IEF  e(l5'BC&&w5Te	 5&) D/ezz#  
 
 
 KK)CHHTN#3457fDG!W$KKQ !W,,-DG!W..()DI5D!W,,a<Q1Q >DH,D 	'F-v6Jf<T hhy&9%h1~)[^w-F!'84Q=0KNf4L!'62Q#33!'84%g{1~>G ;1$+a.)!#,'	'( #$<=(gx0GCD\uQx@!	!o@@gx0GxLL'E==EKK'~gmm_<Mekk]Zefgg078''.ID/J Lo  Q	
 	J & KK)CHHTN+; <= As#   L L0$M' L-00M$#M$c                        e Zd ZdZ fdZdedefdZdededefdZ	 	 	 	 ddee	j                     d	ee	j                     d
ee	j                     dee	j                     de	j                  f
dZ xZS )CanineEmbeddingsz<Construct the character, position and token_type embeddings.c           	         t         |           || _        |j                  |j                  z  }t        |j                        D ]2  }d| }t        | |t        j                  |j                  |             4 t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                         | _        | j%                  dt'        j(                  |j*                        j-                  d      d       t/        |dd      | _        y )	NHashBucketCodepointEmbedder_epsposition_ids)r   F)
persistentposition_embedding_typeabsolute)super__init__r   hidden_sizenum_hash_functionsr{   setattrr   	Embeddingnum_hash_bucketschar_position_embeddingstype_vocab_sizerO   rT   layer_norm_epsDropouthidden_dropout_probdropoutregister_bufferr6   arangemax_position_embeddingsexpandrw   r   )selfr   shard_embedding_sizer   r   	__class__s        r<   r   zCanineEmbeddings.__init__   s7     &11V5N5NNv001 	]A1!5DD$V-D-DFZ [\	] )+V5L5LfN`N`(a%%'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<= 	ELL)G)GHOOPWXej 	 	
 (/v7PR\']$r;   
num_hashesnum_bucketsc                     |t        t              kD  rt        dt        t                     t        d| }g }|D ]  }|dz   |z  |z  }|j                  |         |S )a  
        Converts ids to hash bucket ids via multiple hashing.

        Args:
            input_ids: The codepoints or other IDs to be hashed.
            num_hashes: The number of hash functions to use.
            num_buckets: The number of hash buckets (i.e. embeddings in each table).

        Returns:
            A list of tensors, each of which is the hash bucket IDs from one hash function.
        z`num_hashes` must be <= Nr   )ry   _PRIMESr~   rp   )r   	input_idsr   r   primesresult_tensorsprimehasheds           r<   _hash_bucket_tensorsz%CanineEmbeddings._hash_bucket_tensors   sp     G$7G~FGG*% 	*E 1}-<F!!&)	* r;   embedding_sizec                    ||z  dk7  rt        d| d| d      | j                  |||      }g }t        |      D ]-  \  }}d| }	 t        | |	      |      }
|j	                  |
       / t        j                  |d      S )	zDConverts IDs (e.g. codepoints) into embeddings via multiple hashing.r   zExpected `embedding_size` (z) % `num_hashes` (z) == 0)r   r   r   r   dim)r~   r   	enumeraterw   rp   r6   cat)r   r   r   r   r   hash_bucket_tensorsembedding_shardsr   hash_bucket_idsr   shard_embeddingss              r<   _embed_hash_bucketsz$CanineEmbeddings._embed_hash_buckets   s    J&!+:>:JJ\]g\hhnopp"77	jfq7r"+,?"@ 	6A1!5D2wtT2?C##$45	6
 yy)r22r;   r   token_type_idsr   inputs_embedsreturnc                 `   ||j                         }n|j                         d d }|d   }|| j                  d d d |f   }|:t        j                  |t        j                  | j                  j
                        }|P| j                  || j                  j                  | j                  j                  | j                  j                        }| j                  |      }||z   }| j                  dk(  r| j                  |      }	||	z  }| j                  |      }| j                  |      }|S )Nr   r   dtypedevicer   )sizer   r6   zeroslongr   r   r   r   r   r   rO   r   r   rT   r   )
r   r   r   r   r   input_shape
seq_lengthrO   rM   position_embeddingss
             r<   forwardzCanineEmbeddings.forward   s$     #..*K',,.s3K ^
,,Q^<L!"[[EJJtO`O`OgOghN  444;;22DKK4R4RTXT_T_TpTpM !% : :> J"%::
'':5"&"?"?"M--J^^J/
\\*-
r;   )NNNN)r2   r3   r4   r5   r   rz   r   r   r   r6   
LongTensorr7   r   __classcell__r   s   @r<   r   r      s    F^0# C .3S 3c 3`c 3  15593759"E,,-" !!1!12" u//0	"
   1 12" 
		"r;   r   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )CharactersToMoleculeszeConvert character sequence to initial molecule sequence (i.e. downsample) using strided convolutions.c                 >   t         |           t        j                  |j                  |j                  |j
                  |j
                        | _        t        |j                     | _	        t        j                  |j                  |j                        | _
        y )Nin_channelsout_channelskernel_sizestrider   )r   r   r   Conv1dr   downsampling_raterU   r   
hidden_act
activationrT   r   r   r   r   s     r<   r   zCharactersToMolecules.__init__!  sv    II**++00++	
	 !!2!23 f&8&8f>S>STr;   char_encodingr   c                 2   |d d ddd d f   }t        j                  |dd      }| j                  |      }t        j                  |dd      }| j                  |      }|d d ddd d f   }t        j                  ||gd      }| j                  |      }|S )Nr   r   r_   r   r   )r6   r|   rU   r   r   rT   )r   r   cls_encodingdownsampleddownsampled_truncatedresults         r<   r   zCharactersToMolecules.forward0  s    $Q!QY/ q!<ii.ook1a8ook2 !,AqtQJ 7 L*?@aH'r;   )	r2   r3   r4   r5   r   r6   Tensorr   r   r   s   @r<   r   r     s'    oUU\\ ell r;   r   c                   |     e Zd ZdZ fdZ	 ddej                  deej                     dej                  fdZ xZ	S )ConvProjectionz
    Project representations from hidden_size*2 back to hidden_size across a window of w = config.upsampling_kernel_size
    characters.
    c                    t         |           || _        t        j                  |j
                  dz  |j
                  |j                  d      | _        t        |j                     | _
        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y )Nr_   r   r   r   )r   r   r   r   r   r   upsampling_kernel_sizerU   r   r   r   rT   r   r   r   r   r   s     r<   r   zConvProjection.__init__R  s    II**Q.++55	
	 !!2!23 f&8&8f>S>STzz&"<"<=r;   inputsfinal_seq_char_positionsr   c                    t        j                  |dd      }| j                  j                  dz
  }|dz  }||z
  }t	        j
                  ||fd      }| j                   ||            }t        j                  |dd      }| j                  |      }| j                  |      }| j                  |      }|}|t        d      |}	|	S )Nr   r_   r   z,CanineForMaskedLM is currently not supported)r6   r|   r   r   r   ConstantPad1drU   r   rT   r   NotImplementedError)
r   r   r   	pad_totalpad_begpad_endpadr   final_char_seq	query_seqs
             r<   r   zConvProjection.forwarda  s     A.
 KK66:	q.g%1153v;'A.('f%#/
 &&TUU&Ir;   N)
r2   r3   r4   r5   r   r6   r   r   r   r   r   s   @r<   r   r   L  sE    
>$ <@"" #+5<<"8" 
	"r;   r   c                        e Zd Z fdZ	 	 	 d	dej
                  dej
                  deej                     deej                     dee   de	ej
                  eej
                     f   fdZ
 xZS )
CanineSelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _        t#        |dd      | _        | j$                  dk(  s| j$                  d	k(  rG|j&                  | _        t        j(                  d
|j&                  z  dz
  | j                        | _        y y )Nr   r   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   r   relative_keyrelative_key_queryr_   r   )r   r   r   num_attention_headshasattrr~   rz   attention_head_sizeall_head_sizer   Linearquerykeyvaluer   attention_probs_dropout_probr   rw   r   r   r   distance_embeddingr   s     r<   r   zCanineSelfAttention.__init__  s    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'.v7PR\']$''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# >rr;   from_tensor	to_tensorattention_mask	head_maskoutput_attentionsr   c                 V   |j                   \  }}}| j                  |      j                  |d| j                  | j                        j                  dd      }	| j                  |      j                  |d| j                  | j                        j                  dd      }
| j                  |      j                  |d| j                  | j                        j                  dd      }t        j                  ||	j                  dd            }| j                  dk(  s| j                  dk(  rF|j                         d   }t        j                  |t        j                  |j                        j                  dd      }t        j                  |t        j                  |j                        j                  dd      }||z
  }| j                  || j                   z   dz
        }|j#                  |j$                        }| j                  dk(  rt        j&                  d	||      }||z   }nE| j                  dk(  r6t        j&                  d	||      }t        j&                  d
|	|      }||z   |z   }|t)        j*                  | j                        z  }|h|j,                  dk(  rTt        j.                  |d      }d|j1                         z
  t        j2                  |j$                        j4                  z  }||z   }t6        j8                  j;                  |d      }| j=                  |      }|||z  }t        j                  ||
      }|j?                  dddd      jA                         }|j                         d d | jB                  fz   } |j                  | }|r||f}|S |f}|S )Nr   r   r_   rR   r   r   r   )r   zbhld,lrd->bhlrzbhrd,lrd->bhlrr
   r         ?r   )"r}   r  viewr  r  r|   r  r  r6   matmulr   r   r   r   r   r
  r   tor   einsummathsqrtndim	unsqueezefloatfinfominr   
functionalsoftmaxr   permute
contiguousr  )r   r  r  r  r  r  
batch_sizer   _	key_layervalue_layerquery_layerattention_scoresposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                           r<   r   zCanineSelfAttention.forward  si    %0$5$5!
J HHYT*b$":":D<T<TUYq!_ 	 JJy!T*b$":":D<T<TUYq!_ 	 JJ{#T*b$":":D<T<TUYq!_ 	 !<<Y5H5HR5PQ''>9T=Y=Y]q=q$))+A.J"\\*EJJ{OaOabgghjlmnN"\\*EJJ{OaOabgghikmnN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%""a'!&Q!G #&(<(<(>">%++N^NdNdBeBiBi!i/.@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CD6G=/2 O\M]r;   NNF)r2   r3   r4   r   r6   r   r   r7   boolr9   r   r   r   s   @r<   r   r     s    u4 7;15,1P\\P <<P !!2!23	P
 E--.P $D>P 
u||Xell33	4Pr;   r   c                        e Zd Z fdZdeej                     dej                  deej                  ej                  f   fdZ xZS )CanineSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr   )r   r   r   r  r   denserT   r   r   r   r   r   s     r<   r   zCanineSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r;   r0   input_tensorr   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r8  r   rT   r   r0   r9  s      r<   r   zCanineSelfOutput.forward  s9     

=1]3}|'CDr;   	r2   r3   r4   r   r9   r6   r7   r   r   r   s   @r<   r5  r5    sL    >"5#4#45EJEVEV	u  %"3"33	4r;   r5  c                       e Zd ZdZ	 	 	 	 	 	 	 ddedededededef fdZd	 Z	 	 	 dd
ee	j                     dee	j                     dee	j                     dee   dee	j                  ee	j                     f   f
dZ xZS )CanineAttentionav  
    Additional arguments related to local attention:

        - **local** (`bool`, *optional*, defaults to `False`) -- Whether to apply local attention.
        - **always_attend_to_first_position** (`bool`, *optional*, defaults to `False`) -- Should all blocks be able to
          attend
        to the `to_tensor`'s first position (e.g. a [CLS] position)? - **first_position_attends_to_all** (`bool`,
        *optional*, defaults to `False`) -- Should the *from_tensor*'s first position be able to attend to all
        positions within the *from_tensor*? - **attend_from_chunk_width** (`int`, *optional*, defaults to 128) -- The
        width of each block-wise chunk in `from_tensor`. - **attend_from_chunk_stride** (`int`, *optional*, defaults to
        128) -- The number of elements to skip when moving to the next block in `from_tensor`. -
        **attend_to_chunk_width** (`int`, *optional*, defaults to 128) -- The width of each block-wise chunk in
        *to_tensor*. - **attend_to_chunk_stride** (`int`, *optional*, defaults to 128) -- The number of elements to
        skip when moving to the next block in `to_tensor`.
    always_attend_to_first_positionfirst_position_attends_to_allattend_from_chunk_widthattend_from_chunk_strideattend_to_chunk_widthattend_to_chunk_stridec	                 "   t         	|           t        |      | _        t	        |      | _        t               | _        || _        ||k  rt        d      ||k  rt        d      || _
        || _        || _        || _        || _        || _        y )Nze`attend_from_chunk_width` < `attend_from_chunk_stride` would cause sequence positions to get skipped.z``attend_to_chunk_width` < `attend_to_chunk_stride`would cause sequence positions to get skipped.)r   r   r   r   r5  outputsetpruned_headslocalr~   r@  rA  rB  rC  rD  rE  
r   r   rJ  r@  rA  rB  rC  rD  rE  r   s
            r<   r   zCanineAttention.__init__  s     	'/	&v.E 
"%==w  !#99r  0O,-J*'>$(@%%:"&<#r;   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   r   )ry   r   r   r  r  rI  r   r  r  r  rG  r8  r  union)r   headsindexs      r<   prune_headszCanineAttention.prune_heads2  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r;   r0   r  r  r  r   c                    | j                   s| j                  |||||      }|d   }n|j                  d   x}}|x}	}
g }| j                  r|j	                  d       d}nd}t        ||| j                        D ].  }t        ||| j                  z         }|j	                  ||f       0 g }| j                  r|j	                  d|f       t        d|| j                        D ].  }t        ||| j                  z         }|j	                  ||f       0 t        |      t        |      k7  rt        d| d| d      g }g }t        ||      D ]  \  \  }}\  }}|	d d ||d d f   }|
d d ||d d f   }|d d ||||f   }| j                  rN|d d ||ddf   }t        j                   ||gd      }|
d d ddd d f   }t        j                   ||gd      }| j                  |||||      }|j	                  |d          |s|j	                  |d           t        j                   |d      }| j#                  ||      }|f}| j                   s
|dd  z   }|S |t%              z   }|S )	Nr   r   )r   r   z/Expected to have same number of `from_chunks` (z) and `to_chunks` (z). Check strides.r_   r   )rJ  r   r}   rA  rp   r{   rC  r  rB  rE  rD  ry   r~   rq   r@  r6   r   rG  r9   )r   r0   r  r  r  self_outputsattention_outputfrom_seq_lengthto_seq_lengthr  r  from_chunks
from_startchunk_start	chunk_end	to_chunksattention_output_chunksattention_probs_chunksfrom_endto_startto_endfrom_tensor_chunkto_tensor_chunkattention_mask_chunkcls_attention_maskcls_positionattention_outputs_chunkr1  s                               r<   r   zCanineAttention.forwardD  s    zz99]M>S\^opL+A.;.A.A!.DDOm&33K) K11""6* 

$Z$B_B_` =t?[?[1[\	""K#;<=
 I11  !]!34$Qt7R7RS ;{T=W=W/WX	  +y!9:; ;3y>1 Ek] S$$/=0AC  ')#%'">A+y>Y N:&X(:6$/:h3F0I$J!"+Ax,A"B (6aH9LhW]o6]'^$77)7:h;NPQRSPS8S)T&+0996HJ^5_ef+g(#,Q!QY#7L&+ii0OUV&WO*.))%8LiYj+' (../Fq/IJ$*112I!2LM%N(  %yy)@aH;;'7G#%zzQR 00G  &< ==Gr;   FFF   rg  rg  rg  r2  )r2   r3   r4   r5   r3  rz   r   rP  r9   r6   r7   r   r   r   r   s   @r<   r?  r?     s    & 05.3'*(+%(&)= *.	=
 (,= "%= #&=  #= !$=B;* 7;15,1HU../H !!2!23H E--.	H
 $D>H 
u  (5+<+<"==	>Hr;   r?  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )CanineIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r   r   r   r  r   intermediate_sizer8  
isinstancer   strr   intermediate_act_fnr   s     r<   r   zCanineIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r;   r0   r   c                 J    | j                  |      }| j                  |      }|S r   )r8  rn  r   r0   s     r<   r   zCanineIntermediate.forward  s&    

=100?r;   )r2   r3   r4   r   r6   r7   r   r   r   s   @r<   ri  ri    s'    9U%6%6 5;L;L r;   ri  c                   t     e Zd Z fdZdeej                     dej                  dej                  fdZ xZS )CanineOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r7  )r   r   r   r  rk  r   r8  rT   r   r   r   r   r   s     r<   r   zCanineOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r;   r0   r9  r   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r;  r<  s      r<   r   zCanineOutput.forward  s7    

=1]3}|'CDr;   r=  r   s   @r<   rr  rr    s:    >U5+<+<%= UM^M^ chctct r;   rr  c                        e Zd Z fdZ	 	 	 d	deej                     deej                     deej                     dee   deej                  eej                     f   f
dZ	d Z
 xZS )
CanineLayerc	           
          t         	|           |j                  | _        d| _        t	        ||||||||      | _        t        |      | _        t        |      | _	        y Nr   )
r   r   chunk_size_feed_forwardseq_len_dimr?  	attentionri  intermediaterr  rG  rK  s
            r<   r   zCanineLayer.__init__  se     	'-'E'E$(+)#$!"	
 /v6"6*r;   r0   r  r  r  r   c                     | j                  ||||      }|d   }|dd  }t        | j                  | j                  | j                  |      }|f|z   }|S )N)r  r   r   )r{  r   feed_forward_chunkry  rz  )	r   r0   r  r  r  self_attention_outputsrS  r1  layer_outputs	            r<   r   zCanineLayer.forward  sv     "&/	 "0 "
 2!4(,0##T%A%A4CSCSUe
  /G+r;   c                 L    | j                  |      }| j                  ||      }|S r   )r|  rG  )r   rS  intermediate_outputr  s       r<   r~  zCanineLayer.feed_forward_chunk  s,    "//0@A{{#68HIr;   r2  )r2   r3   r4   r   r9   r6   r7   r   r3  r   r~  r   r   s   @r<   rv  rv    s    +< 7;15,1U../ !!2!23 E--.	
 $D> 
u  (5+<+<"==	>0r;   rv  c                        e Zd Z	 	 	 	 	 	 	 d
 fd	Z	 	 	 	 	 ddeej                     deej                     deej                     dee   dee   dee   de	ee
f   fd	Z xZS )CanineEncoderc	                     t         
|           || _        t        j                  t        |j                        D 	cg c]  }	t        ||||||||       c}	      | _        d| _	        y c c}	w )NF)
r   r   r   r   
ModuleListr{   num_hidden_layersrv  layergradient_checkpointing)r   r   rJ  r@  rA  rB  rC  rD  rE  r"  r   s             r<   r   zCanineEncoder.__init__  sx     	]] v778  31+,)*	

 ',#s   A*r0   r  r  r  output_hidden_statesreturn_dictr   c                    |rdnd }|rdnd }t        | j                        D ]2  \  }	}
|r||fz   }|||	   nd } |
||||      }|d   }|s*||d   fz   }4 |r||fz   }|st        d |||fD              S t        |||      S )Nr:   r   r   c              3   &   K   | ]	  }||  y wr   r:   rH   vs     r<   rJ   z(CanineEncoder.forward.<locals>.<genexpr>!  s     mq_`_lm   )r.   r0   r1   )r   r  r9   r   )r   r0   r  r  r  r  r  all_hidden_statesall_self_attentionsr   layer_modulelayer_head_masklayer_outputss                r<   r   zCanineEncoder.forward  s     #7BD$5b4(4 
	POA|#$58H$H!.7.CilO(YjkM)!,M &9]1=M<O&O#
	P   1]4D Dm]4EGZ$[mmm++*
 	
r;   rf  )NNFFT)r2   r3   r4   r   r9   r6   r7   r   r3  r   r   r   r   r   s   @r<   r  r    s     (-&+ #!$!",B 7;15,1/4&*!
U../!
 !!2!23!
 E--.	!

 $D>!
 'tn!
 d^!
 
uo%	&!
r;   r  c                   \     e Zd Z fdZdeej                     dej                  fdZ xZS )CaninePoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )r   r   r   r  r   r8  Tanhr   r   s     r<   r   zCaninePooler.__init__*  s9    YYv1163E3EF
'')r;   r0   r   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )r8  r   )r   r0   first_token_tensorpooled_outputs       r<   r   zCaninePooler.forward/  s6     +1a40

#566r;   r=  r   s   @r<   r  r  )  s,    $
U5+<+<%= %BSBS r;   r  c                   \     e Zd Z fdZdeej                     dej                  fdZ xZS )CaninePredictionHeadTransformc                 h   t         |           t        j                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _
        n|j                  | _
        t        j                  |j                  |j                        | _        y r7  )r   r   r   r  r   r8  rl  r   rm  r   transform_act_fnrT   r   r   s     r<   r   z&CaninePredictionHeadTransform.__init__9  s{    YYv1163E3EF
f''-$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr;   r0   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r8  r  rT   rp  s     r<   r   z%CaninePredictionHeadTransform.forwardB  s4    

=1--m<}5r;   r=  r   s   @r<   r  r  8  s-    UU5+<+<%= %BSBS r;   r  c                   \     e Zd Z fdZdeej                     dej                  fdZ xZS )CanineLMPredictionHeadc                 H   t         |           t        |      | _        t	        j
                  |j                  |j                  d      | _        t	        j                  t        j                  |j                              | _        | j                  | j                  _        y )NF)r]   )r   r   r  	transformr   r  r   
vocab_sizedecoder	Parameterr6   r   r]   r   s     r<   r   zCanineLMPredictionHead.__init__J  sm    6v> yy!3!3V5F5FUSLLV->->!?@	 !IIr;   r0   r   c                 J    | j                  |      }| j                  |      }|S r   )r  r  rp  s     r<   r   zCanineLMPredictionHead.forwardW  s$    }5]3r;   r=  r   s   @r<   r  r  I  s,    &U5+<+<%= %BSBS r;   r  c                   b     e Zd Z fdZdeej                     deej                     fdZ xZS )CanineOnlyMLMHeadc                 B    t         |           t        |      | _        y r   )r   r   r  predictionsr   s     r<   r   zCanineOnlyMLMHead.__init__^  s    1&9r;   sequence_outputr   c                 (    | j                  |      }|S r   )r  )r   r  prediction_scoress      r<   r   zCanineOnlyMLMHead.forwardb  s     !,,_=  r;   )	r2   r3   r4   r   r9   r6   r   r   r   r   s   @r<   r  r  ]  s1    :!u||,! 
u||	!r;   r  c                   *    e Zd ZU eed<   eZdZdZd Z	y)CaninePreTrainedModelr   canineTc                 "   t        |t        j                  t        j                  f      rm|j                  j
                  j                  d| j                  j                         |j                  %|j                  j
                  j                          yyt        |t        j                        rz|j                  j
                  j                  d| j                  j                         |j                  2|j                  j
                  |j                     j                          yyt        |t        j                        rJ|j                  j
                  j                          |j                  j
                  j                  d       yy)zInitialize the weightsg        )meanstdNr  )rl  r   r  r   rZ   r   normal_r   initializer_ranger]   zero_r   padding_idxrT   fill_)r   modules     r<   _init_weightsz#CaninePreTrainedModel._init_weightsq  s   fryy"))45 MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .-KK""$MM$$S) .r;   N)
r2   r3   r4   r   r8   r   load_tf_weightsbase_model_prefixsupports_gradient_checkpointingr  r:   r;   r<   r  r  j  s    /O &*#*r;   r  c                       e Zd Zd fd	Zd Zd Zdej                  defdZ	dej                  ded	ej                  fd
Z
e	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     dee   dee   dee   d	eeef   fd       Z xZS )CanineModelc           
         t         |   |       || _        t        j                  |      }d|_        t        |      | _        t        |ddd|j                  |j                  |j                  |j                        | _
        t        |      | _        t        |      | _        t        |      | _        t        |      | _        |rt#        |      nd| _        | j'                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        r   TF)rJ  r@  rA  rB  rC  rD  rE  N)r   r   r   copydeepcopyr  r   char_embeddingsr  local_transformer_striderP   r   rQ   rL   r   rV   rS   r  pooler	post_init)r   r   add_pooling_layershallow_configr   s       r<   r   zCanineModel.__init__  s    
 	 v.+,(/7$1,1*/$*$C$C%+%D%D"("A"A#)#B#B	%
! #8"?$V,(0"/"?.?l6*T 	r;   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrL   r  r{  rP  )r   heads_to_pruner  rN  s       r<   _prune_headszCanineModel._prune_heads  sE    
 +002 	CLE5LLu%//;;EB	Cr;   c                    |j                   d   |j                   d   }}|j                   d   }t        j                  ||d|f      j                         }t        j                  ||dft        j
                  |j                        }||z  }|S )aP  
        Create 3D attention mask from a 2D tensor mask.

        Args:
            from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
            to_mask: int32 Tensor of shape [batch_size, to_seq_length].

        Returns:
            float Tensor of shape [batch_size, from_seq_length, to_seq_length].
        r   r   )r   r   r   )r}   r6   reshaper  onesfloat32r   )r   r  to_maskr!  rT  rU  broadcast_onesmasks           r<   )_create_3d_attention_mask_from_input_maskz5CanineModel._create_3d_attention_mask_from_input_mask  s     '2&7&7&:K<M<Ma<PO
a(--*a)GHNNP
 *oq)IQVQ^Q^gnguguv 'r;   char_attention_maskr   c                     |j                   \  }}t        j                  ||d|f      }t        j                  j	                  ||      |j                               }t        j                  |d      }|S )z[Downsample 2D character attention mask to 2D molecule attention mask using MaxPool1d layer.r   )r   r   r   r   )r}   r6   r  r   	MaxPool1dr  squeeze)r   r  r   r!  char_seq_lenpoolable_char_maskpooled_molecule_maskmolecule_attention_masks           r<   _downsample_attention_maskz&CanineModel._downsample_attention_mask  sw     $7#<#< 
L"]]+>QP\@]^  %xx11>OXi1j$$& 

 #(--0D""M&&r;   	moleculeschar_seq_lengthr   c                    | j                   j                  }|ddddddf   }t        j                  ||d      }|ddddddf   }||z  }t        j                  |||z   d      }t        j                  ||gd      S )zDRepeats molecules to make them the same length as the char sequence.Nr   rR   )repeatsr   r   r   )r   r   r6   repeat_interleaver   )	r   r  r  ratemolecules_without_extra_clsrepeatedlast_moleculeremainder_lengthremainder_repeateds	            r<   _repeat_moleculeszCanineModel._repeat_molecules  s     {{,,&/12q&9#**+FPTZ\] "!RS!),*T1"44$t+	
 yy($67R@@r;   r   r  r   r   r  r   r  r  r  c
                 B   ||n| j                   j                  }||n| j                   j                  }|rdnd }
|rdnd }|	|	n| j                   j                  }	||t	        d      |#| j                  ||       |j                         }n!||j                         d d }nt	        d      |\  }}||j                  n|j                  }|t        j                  ||f|      }|&t        j                  |t        j                  |      }| j                  ||      }| j                  || j                   j                        }| j                  |||j                  d   f      }| j!                  || j                   j"                        }| j%                  ||||      }| j'                  ||n||      }| j)                  ||||	      }|j*                  }| j-                  |      }| j/                  ||||||	
      }|d   }| j0                  | j1                  |      nd }| j3                  ||d         }t        j4                  ||gd      }| j7                  |      }| j9                  ||||	      }|j*                  }|r2|	r|j:                  n|d   }|
|j:                  z   |z   |j:                  z   }
|r2|	r|j<                  n|d   } ||j<                  z   | z   |j<                  z   }|	s||f}!|!t?        d |
|fD              z  }!|!S tA        |||
|      S )Nr:   zDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embeds)r   r   )r   )r   r   r   r   )r  r  r  )r  r  r  r  r  r   )r  r   r   c              3   &   K   | ]	  }||  y wr   r:   r  s     r<   rJ   z&CanineModel.forward.<locals>.<genexpr>  s     a!STS`Aar  )r.   r/   r0   r1   )!r   r  r  use_return_dictr~   %warn_if_padding_and_no_attention_maskr   r   r6   r  r   r   get_extended_attention_maskr  r   r}   get_head_maskr  r  r  rP   r.   rQ   rL   r  r  r   rV   rS   r0   r1   r9   r-   )"r   r   r  r   r   r  r   r  r  r  r  r  r   r!  r   r   extended_attention_maskr   extended_molecule_attention_maskinput_char_embeddingsr  init_chars_encoder_outputsinput_char_encodinginit_molecule_encodingencoder_outputsmolecule_sequence_outputr  repeated_moleculesconcatr  final_chars_encoder_outputsdeep_encoder_hidden_statesdeep_encoder_self_attentionsrG  s"                                     r<   r   zCanineModel.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 #7BD$5b4%0%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZ*j)A6RN!"[[EJJvVN 150P0PQ_al0m"&"A"Adkk.K.K #B #
 :>9Y9Y#j2I2O2OPR2S%T:
( &&y$++2O2OP	 !% 4 4%)'	 !5 !
 #LL".IM>
 &*%>%>!./!5	 &? &
" 9JJ  "&!8!89L!M ,,";/!5# ' 
 $31#5 AEAX$<=^b "334L^ijl^m3n /1CD"M //&1 '+&=&=2/!5	 '> '
# 6GGJU)F)F[jkl[m&!,::;,- .;;<  IT?+E+EZijlZm(#,778./ .889   %}5Fea(9;N'OaaaFM+-'+*	
 	
r;   )T)	NNNNNNNNN)r2   r3   r4   r   r  r  r6   r   rz   r  r  r   r   r   r7   r3  r   r9   r-   r   r   r   s   @r<   r  r    sM    DC6'ell '_b '"A5<< A# ARWR^R^ A2  156:59371559,0/3&*\
E,,-\
 !!2!23\
 !!1!12	\

 u//0\
 E--.\
   1 12\
 $D>\
 'tn\
 d^\
 
u22	3\
 \
r;   r  z
    CANINE Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                   D    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee	   d
ee	   dee	   de
eef   fd       Z xZS )CanineForSequenceClassificationc                 ,   t         |   |       |j                  | _        t        |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y r   r   r   
num_labelsr  r  r   r   r   r   r  r   
classifierr  r   s     r<   r   z(CanineForSequenceClassification.__init__  i      ++!&)zz&"<"<=))F$6$68I8IJ 	r;   r   r  r   r   r  r   labelsr  r  r  r   c                 @   |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }| j	                  |      }d}|| j                   j
                  | j                  dk(  rd| j                   _        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j
                  dk(  rIt               }| j                  dk(  r& ||j                         |j                               }n |||      }n| j                   j
                  dk(  r=t               } ||j                  d| j                        |j                  d            }n,| j                   j
                  dk(  rt               } |||      }|
s|f|dd z   }||f|z   S |S t!        |||j"                  |j$                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   r   r  r   r  r  r  r   
regressionsingle_label_classificationmulti_label_classificationr   r_   losslogitsr0   r1   )r   r  r  r   r  problem_typer  r   r6   r   rz   r	   r  r   r  r   r   r0   r1   )r   r   r  r   r   r  r   r  r  r  r  r1  r  r  r  loss_fctrG  s                    r<   r   z'CanineForSequenceClassification.forward  s   ( &1%<k$++B]B]++))%'/!5#  

  
]3/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r;   
NNNNNNNNNN)r2   r3   r4   r   r   r   r6   r   r7   r3  r   r9   r   r   r   r   s   @r<   r  r    s   	  156:59371559-1,0/3&*E
E,,-E
 !!2!23E
 !!1!12	E

 u//0E
 E--.E
   1 12E
 ))*E
 $D>E
 'tnE
 d^E
 
u..	/E
 E
r;   r  c                   D    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee	   d
ee	   dee	   de
eef   fd       Z xZS )CanineForMultipleChoicec                     t         |   |       t        |      | _        t	        j
                  |j                        | _        t	        j                  |j                  d      | _
        | j                          y rx  )r   r   r  r  r   r   r   r   r  r   r  r  r   s     r<   r   z CanineForMultipleChoice.__init__  sV     !&)zz&"<"<=))F$6$6: 	r;   r   r  r   r   r  r   r  r  r  r  r   c                 L   |
|
n| j                   j                  }
||j                  d   n|j                  d   }|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|1|j                  d|j	                  d      |j	                  d            nd}| j                  ||||||||	|
	      }|d   }| j                  |      }| j                  |      }|j                  d|      }d}|t               } |||      }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r   rR   r	  r_   r  )r   r  r}   r  r   r  r   r  r   r   r0   r1   )r   r   r  r   r   r  r   r  r  r  r  num_choicesr1  r  r  reshaped_logitsr  r  rG  s                      r<   r   zCanineForMultipleChoice.forward  s   X &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 ++))%'/!5#  

  
]3/ ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r;   r  )r2   r3   r4   r   r   r   r6   r   r7   r3  r   r9   r   r   r   r   s   @r<   r  r    s     156:59371559-1,0/3&*X
E,,-X
 !!2!23X
 !!1!12	X

 u//0X
 E--.X
   1 12X
 ))*X
 $D>X
 'tnX
 d^X
 
u//	0X
 X
r;   r  c                   D    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee	   d
ee	   dee	   de
eef   fd       Z xZS )CanineForTokenClassificationc                 ,   t         |   |       |j                  | _        t        |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y r   r  r   s     r<   r   z%CanineForTokenClassification.__init__X  r  r;   r   r  r   r   r  r   r  r  r  r  r   c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }| j	                  |      }d}|<t               } ||j                  d| j                        |j                  d            }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, CanineForTokenClassification
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/canine-s")
        >>> model = CanineForTokenClassification.from_pretrained("google/canine-s")

        >>> inputs = tokenizer(
        ...     "HuggingFace is a company based in Paris and New York", add_special_tokens=False, return_tensors="pt"
        ... )

        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_token_class_ids = logits.argmax(-1)

        >>> # Note that tokens are classified rather then input words which means that
        >>> # there might be more predicted token classes than words.
        >>> # Multiple token classes might account for the same word
        >>> predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
        >>> predicted_tokens_classes  # doctest: +SKIP
        ```

        ```python
        >>> labels = predicted_token_class_ids
        >>> loss = model(**inputs, labels=labels).loss
        >>> round(loss.item(), 2)  # doctest: +SKIP
        ```Nr	  r   r   r_   r  )r   r  r  r   r  r   r  r  r   r0   r1   )r   r   r  r   r   r  r   r  r  r  r  r1  r  r  r  r  rG  s                    r<   r   z$CanineForTokenClassification.forwardc  s    ` &1%<k$++B]B]++))%'/!5#  

 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
r;   r  )r2   r3   r4   r   r   r   r6   r   r7   r3  r   r9   r   r   r   r   s   @r<   r  r  V  s   	  156:59371559-1,0/3&*P
E,,-P
 !!2!23P
 !!1!12	P

 u//0P
 E--.P
   1 12P
 ))*P
 $D>P
 'tnP
 d^P
 
u++	,P
 P
r;   r  c                   d    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
ee	   dee	   dee	   de
eef   fd       Z xZS )CanineForQuestionAnsweringc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y r   )
r   r   r  r  r  r   r  r   
qa_outputsr  r   s     r<   r   z#CanineForQuestionAnswering.__init__  sS      ++!&)))F$6$68I8IJ 	r;   r   r  r   r   r  r   start_positionsend_positionsr  r  r  r   c                    ||n| j                   j                  }| j                  |||||||	|
|	      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      }|j                  d      }d }||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|       |j                  d|       t        |      } |||      } |||      }||z   dz  }|s||f|dd  z   }||f|z   S |S t        ||||j                  |j                        S )	Nr	  r   r   r   r   )ignore_indexr_   )r  start_logits
end_logitsr0   r1   )r   r  r  r   rr   r  ry   r   clamp_r   r   r0   r1   )r   r   r  r   r   r  r   r!  r"  r  r  r  r1  r  r  r%  r&  
total_lossignored_indexr  
start_lossend_lossrG  s                          r<   r   z"CanineForQuestionAnswering.forward  s    &1%<k$++B]B]++))%'/!5#  

 "!*1#)<<r<#: j#++B/''+

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M""1m4  M2']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r;   )NNNNNNNNNNN)r2   r3   r4   r   r   r   r6   r   r7   r3  r   r9   r   r   r   r   s   @r<   r  r    s$     156:593715596:48,0/3&*>
E,,->
 !!2!23>
 !!1!12	>

 u//0>
 E--.>
   1 12>
 "%"2"23>
   0 01>
 $D>>
 'tn>
 d^>
 
u22	3>
 >
r;   r  )r  r  r  r  rv  r  r  r   )Br5   r  r  ri   dataclassesr   typingr   r   r6   torch.utils.checkpointr   torch.nnr   r   r	   activationsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   configuration_caniner   
get_loggerr2   rg   r   r-   r   Moduler   r   r   r   r5  r?  ri  rr  rv  r  r  r  r  r  r  r  r  r  r  r  __all__r:   r;   r<   <module>r:     sQ      	 ! "    A A ! 9  . l l , . 
		H	% U :; : ::^Bbryy bJ+BII +\7RYY 7tg")) gTryy  Lbii L^ 299 7, 7t@
BII @
F299 BII "RYY (
!		 
! *O * *. M
' M
 M
` R
&; R
R
j d
3 d
 d
N ]
#8 ]
 ]
@ J
!6 J
 J
Z	r;   