
    rh[                        d dl mZmZmZ d dlZd dlmZ ddlmZ ddl	m
Z
mZ ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZmZ ddlmZ ddl m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z'  G d dejP                        Z) G d dejP                        Z*dejV                  de,dejV                  fdZ-	 d2dejP                  dejV                  dejV                  dejV                  deejV                     de.d e.d!ee!   fd"Z/d# Z0d3d$Z1 G d% d&ejP                        Z2 G d' d(ejP                        Z3 G d) d*e      Z4e" G d+ d,e             Z5e" G d- d.e5             Z6e" G d/ d0e5e             Z7g d1Z8y)4    )CallableOptionalUnionN   )ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)check_model_inputs   )Cohere2Configc                   ^     e Zd Zddef fdZ ej                         ed               Z xZ	S )Cohere2RotaryEmbeddingconfigc                    t         |           t        |d      rUt        |j                  t
              r;|j                  j                  d|j                  j                  d            | _        nd| _        |j                  | _	        |j                  | _
        || _        t        | j                     | _        | j                  | j                  |      \  }| _        | j                  d|d       | j                   | _        y )Nrope_scaling	rope_typetypedefaultinv_freqF)
persistent)super__init__hasattr
isinstancer    dictgetr!   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr   r   rope_init_fnattention_scalingregister_bufferr$   original_inv_freq)selfr   devicer$   	__class__s       /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/cohere2/modeling_cohere2.pyr'   zCohere2RotaryEmbedding.__init__+   s    6>*z&:M:Mt/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q($(ZeD!%    c                 .   | j                   d d d d f   j                         j                  |j                  d   dd      }|d d d d d f   j                         }t	        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        j                  |d      5  |j                         |j                         z  j                  dd      }t        j                  |dd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j                  |j                   
      	j                  |j                   
      fS # 1 sw Y   AxY w)Nr   r   mpscpuF)device_typeenabled   dimdtype)r$   floatexpandshaper)   r4   r"   strtorchautocast	transposerepeat_interleavecosr0   sintorB   )
r3   xposition_idsinv_freq_expandedposition_ids_expandedr<   freqsembrK   rL   s
             r6   forwardzCohere2RotaryEmbedding.forward<   sD    !MM$4-8>>@GGHZHZ[\H]_acde ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk^^UC 	5&,,.1F1L1L1NNYYZ[]^_E))%;C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s   BFFN)
__name__
__module____qualname__r   r'   rG   no_gradr   rT   __classcell__r5   s   @r6   r   r   *   s3    /} /" U]]_<  <r7   r   c                   &     e Zd Zd fd	Zd Z xZS )Cohere2LayerNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)zcThe hidden size can be a tuple or an int. The tuple is used for QKNorm to normalize across head_dimN)r&   r'   nn	ParameterrG   onesweightvariance_epsilon)r3   hidden_sizeepsbiasr5   s       r6   r'   zCohere2LayerNorm.__init__M   s/    ll5::k#:; #r7   c                    |j                   }|j                  t        j                        }|j	                  dd      }||z
  j                  d      j	                  dd      }||z
  t        j                  || j                  z         z  }| j                  j                  t        j                        |z  }|j                  |      S )Nr9   T)keepdimr>   )	rB   rM   rG   float32meanpowrsqrtrc   rb   )r3   hidden_statesinput_dtyperj   variances        r6   rT   zCohere2LayerNorm.forwardS   s    #))%((7!!"d!3!D(--a055b$5G&-XH]H]=]1^^u}}5E,,r7   )Ngh㈵>FrV   rW   rX   r'   rT   rZ   r[   s   @r6   r]   r]   L   s    $-r7   r]   rm   n_repreturnc                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rE   rD   reshape)rm   rq   batchnum_key_value_headsslenhead_dims         r6   	repeat_kvry   ]   so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr7   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr>   r   r9   )r@   rB   )ptrainingr   )ry   num_key_value_groupsrG   matmulrI   rE   r_   
functionalsoftmaxri   rM   rB   r   r   
contiguous)rz   r{   r|   r}   r~   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r6   eager_attention_forwardr   i   s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r7   c                     | dd d df   }| ddd df   }t        j                  | |gd      j                  d      }|S )N.r>   r   r9   r?   r   )rG   stackflatten)rN   x1x2rot_xs       r6   rotate_halfr      sL    	
3!8B	
319BKK"b	r*2226ELr7   c                 6   | j                   }| j                         } |j                         }|j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }|j	                  |      |j	                  |      fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    rA   )rB   rC   	unsqueezer   rM   )	qkrK   rL   rO   unsqueeze_dimrB   q_embedk_embeds	            r6   apply_rotary_pos_embr      s    ( GGE		A		A
--
&C
--
&C3w;q>C/0G3w;q>C/0G::E:"GJJUJ$;;;r7   c                   >    e Zd ZdZddedee   f fdZ	 	 ddej                  de
ej                  ej                  f   deej                     dee   d	eej                     d
ee   de
ej                  eej                     ee
ej                        f   fdZ xZS )Cohere2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperr   	layer_idxc                    t         |           || _        || _        t	        |d|j
                  |j                  z        | _        |j                  |j                  z  | _	        | j                  dz  | _
        |j                  | _        d| _        |j                  |   dk(  r|j                  nd | _        t        j                   |j
                  |j                  | j                  z  |j"                        | _        t        j                   |j
                  |j                  | j                  z  |j"                        | _        t        j                   |j
                  |j                  | j                  z  |j"                        | _        t        j                   |j                  | j                  z  |j
                  |j"                        | _        y )Nrx   g      Tsliding_attentionrf   )r&   r'   r   r   getattrrd   num_attention_headsrx   rv   r   r   attention_dropout	is_causallayer_typessliding_windowr_   Linearattention_biasq_projk_projv_projo_projr3   r   r   r5   s      r6   r'   zCohere2Attention.__init__   sq   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!97=7I7I)7TXk7kf33quii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
r7   rm   position_embeddingsr~   past_key_valuecache_positionr   rr   c                 b   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }|\  }}| j                  t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        }| j                  j                  dk7  rt        | j                  j                     } || |	|
||f| j                  sdn| j                   | j"                  | j                  d|\  }} |j$                  g |d j'                         }| j)                  |      }||fS )Nr9   r   r>   )rL   rK   r   eager        )r   r   r   )rE   rx   r   viewrI   r   r   r   r   updater   r   r   _attn_implementationr   r   r   r   rt   r   r   )r3   rm   r   r~   r   r   r   input_shapehidden_shapequery_statesr   r   rK   rL   cache_kwargsattention_interfacer   r   s                     r6   rT   zCohere2Attention.forward   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S*';L*VY[^'_$L*%#&snUL'5'<'<ZW[WeWegs't$J(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HLL..
%
 
%
!\ *k));;;;FFHkk+.L((r7   rU   )NN)rV   rW   rX   __doc__r   r   intr'   rG   Tensortupler   
LongTensorr   r   rT   rZ   r[   s   @r6   r   r      s    G
} 
# 
: +/59*)||*) #5<<#=>*) !.	*)
 !*) !!1!12*) -.*) 
u||Xell3XeELL>Q5RR	S*)r7   r   c                   $     e Zd Z fdZd Z xZS )
Cohere2MLPc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _	        t        |j                     | _        y NFr   )r&   r'   r   rd   intermediate_sizer_   r   	gate_projup_proj	down_projr   
hidden_actact_fnr3   r   r5   s     r6   r'   zCohere2MLP.__init__   s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV../r7   c                     | j                  | j                  | j                  |            | j                  |      z        }|S rU   )r   r   r   r   )r3   rN   r   s      r6   rT   zCohere2MLP.forward   s6    NN4;;t~~a/@#ADLLQRO#ST	r7   rp   r[   s   @r6   r   r      s    0r7   r   c                   <    e Zd Zdedef fdZ	 	 	 	 ddej                  deej                  ej                  f   de	ej                     de	e
   de	e   d	e	ej                     d
ee   deej                  e	eej                  ej                  f      f   fdZ xZS )Cohere2DecoderLayerr   r   c                     t         |           |j                  | _        t        ||      | _        t        |      | _        t        |j                  |j                        | _	        |j                  |   | _        y )N)r   r   rd   re   )r&   r'   rd   r   	self_attnr   mlpr]   layer_norm_epsinput_layernormr   attention_typer   s      r6   r'   zCohere2DecoderLayer.__init__  se    !--)9Mf%/V=O=OV\VkVkl$00;r7   rm   r   r~   r   	use_cacher   r   rr   c           
          |}| j                  |      } | j                  d||||||d|\  }	}
| j                  |      }||	z   |z   }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        )rm   r   r~   r   r   r    )r   r   r   )r3   rm   r   r~   r   r   r   r   residualhidden_states_attention_hidden_states_mlps               r6   rT   zCohere2DecoderLayer.forward
  sx    : !,,];%3T^^ &
' 3)))&
 &
" !HH]3 #::=NNr7   )NNFN)rV   rW   rX   r   r   r'   rG   r   r   r   r   boolr   r   r   FloatTensorrT   rZ   r[   s   @r6   r   r     s    <} < < 26*.$)59+||+ #5<<#=>+ !.	+
 !+ D>+ !!1!12+ -.+ 
u  (51B1BEDUDU1U+V"WW	X+r7   r   c                   J    e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZdZeedZy)Cohere2PreTrainedModelr   modelTr   past_key_values)rm   
attentionsN)rV   rW   rX   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   r   _can_record_outputsr   r7   r6   r   r   8  sQ    &*#./#4"5N!"&,&r7   r   c                       e Zd Zdef fdZee	 	 	 	 	 	 	 ddeej                     deej                     deej                     dee   deej                     dee   d	eej                     d
ee   defd              Z xZS )Cohere2Modelr   c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                        | _        t#        |      | _        d| _        | j)                          y c c}w )Nr   )r   F)r&   r'   pad_token_idpadding_idx
vocab_sizer_   	Embeddingrd   embed_tokens
ModuleListrangenum_hidden_layersr   layersr]   r   normr   
rotary_embgradient_checkpointing	post_initr   s      r6   r'   zCohere2Model.__init__M  s     !.. ++LL):):F<N<NPTP`P`ammEJ6KcKcEde	 3e
 %&2D2D6K`K`a	0?&+# 	 fs   D	input_idsr~   rO   r   inputs_embedsr   r   r   rr   c           
      p   |d u |d uz  rt        d      || j                  |      }|r|| j                  s
t               }|F||j	                         nd}	t        j                  |	|	|j                  d   z   |j                        }||j                  d      }t        |x}
t              s*| j                  |||||d}t        d	i |t        d	i |d}
|}| j                  ||      }| j                   D ]  } ||f||
|j"                     |||d|}  | j%                  |      }t'        ||      S )
Nz:You must specify exactly one of input_ids or inputs_embedsr   r   )r4   )r   input_embedsr~   r   r   rO   )full_attentionr   )r   r~   r   r   r   )last_hidden_stater   r   )
ValueErrorr   r   r	   get_seq_lengthrG   arangerE   r4   r   r)   r*   r   r   r   r  r   r   r   r   )r3   r  r~   rO   r   r  r   r   r   past_seen_tokenscausal_mask_mappingmask_kwargsrm   r   decoder_layers                  r6   rT   zCohere2Model.forward]  s    -t";<YZZ  --i8M0*nO!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L?-F++ -"0"0#2 ,K #5"C{"C%F%U%U#
 &"oom\J![[ 		M)$72=3O3OP.#- M		 		-0&++
 	
r7   )NNNNNNN)rV   rW   rX   r   r'   r   r   r   rG   r   r   r   r   r   r   r   r   rT   rZ   r[   s   @r6   r   r   K  s    }    151537+/59$(59<
E,,-<
 !.<
 u//0	<

 "%<
   1 12<
 D><
 !!1!12<
 +,<
 
!<
  <
r7   r   c                       e Zd ZdgZddiZddgdgfiZ fdZd Zd Ze	e
	 	 	 	 	 	 	 	 	 	 	 dd	eej                     d
eej                     deej                     deeeeej$                     f      deej$                     deej                     dee   dee   dee   deej                     deeej                  f   dee   defd              Z xZS )Cohere2ForCausalLMzlm_head.weightlm_headcolwise_reprm   logitsc                 ,   t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        |j                  | _	        |j                  | _
        | j                          y r   )r&   r'   r   r   r   r_   r   rd   r  logit_scaletie_word_embeddingsr  r   s     r6   r'   zCohere2ForCausalLM.__init__  sq     !&)
 ++yy!3!3V5F5FUS!--#)#=#=  	r7   c                     || _         y rU   r   )r3   decoders     r6   set_decoderzCohere2ForCausalLM.set_decoder  s	    
r7   c                     | j                   S rU   r  )r3   s    r6   get_decoderzCohere2ForCausalLM.get_decoder  s    zzr7   r  r~   rO   r   r  labelsr   output_attentionsoutput_hidden_statesr   logits_to_keepr   rr   c                    ||n| j                   j                  }|	|	n| j                   j                  }	 | j                  d||||||||	|
d	|}|j                  }t        |t              rt        | d      n|}| j                  |dd|ddf         }|| j                  z  }d}|* | j                  d||| j                   j                  d|}t        |||j                  |j                  |j                        S )a~  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >> from transformers import AutoTokenizer, Cohere2ForCausalLM

        >> model = Cohere2ForCausalLM.from_pretrained("Cohere2ForAI/c4ai-command-r-v01")
        >> tokenizer = AutoTokenizer.from_pretrained("Cohere2ForAI/c4ai-command-r-v01")

        >> prompt = "Hey, are you conscious? Can you talk to me?"
        >> inputs = tokenizer(prompt, return_tensors="pt")

        >> # Generate
        >> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)	r  r~   rO   r   r  r   r   r!  r   )r  r  r   )lossr  r   rm   r   r   )r   r   r!  r   r	  r)   r   slicer  r  loss_functionr   r   r   rm   r   )r3   r  r~   rO   r   r  r  r   r   r!  r   r"  r   outputsrm   slice_indicesr  r$  s                     r6   rT   zCohere2ForCausalLM.forward  s+   N 2C1N-TXT_T_TqTq$8$D $++JjJj 	
 ,64:: ,
)%+'/!5),
 ,
  118B>SV8W~ot4]kmA}a,?@A$***%4%%pVFt{{OeOepiopD%#33!//))
 	
r7   )NNNNNNNNNNr   )rV   rW   rX   _tied_weights_keys_tp_plan_pp_planr'   r  r  r   r   r   rG   r   r   r   r   listr   r   r   r   r   r   rT   rZ   r[   s   @r6   r  r    s   *+=)H_-z:;H	  151537KO59-1$(,0/35934H
E,,-H
 !.H
 u//0	H

 "%tE4E4E/F(F"GHH
   1 12H
 ))*H
 D>H
 $D>H
 'tnH
 !!1!12H
 c5<</0H
 +,H
 
 H
  H
r7   r  )r  r   r   )r   )Nr   )9typingr   r   r   rG   torch.nnr_   activationsr   cache_utilsr   r	   
generationr
   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   configuration_cohere2r   Moduler   r]   r   r   ry   rC   r   r   r   r   r   r   r   r   r  __all__r   r7   r6   <module>r>     s  , - ,   ! . ) R B 9 O K F & I I / 0<RYY <D-ryy -"	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % % '(%4<<E)ryy E)P  44 4n _  $ O
) O
 O
d `
/ `
 `
F Kr7   