
    rht                     |   d dl mZmZ d dlZd dlmZ d dlmc mZ d dl	Zddl
mZ ddlmZ ddlmZ ddlmZmZmZmZmZmZmZmZ d	d
lmZ  ej6                  e      Z G d dej<                        Z G d de      Z ddZ! G d de      Z" G d de      Z# G d de      Z$ G d de      Z% G d de      Z&g dZ'y)    )CallableOptionalN   )Cache)ALL_ATTENTION_FUNCTIONS)logging   )LlamaAttentionLlamaDecoderLayerLlamaForCausalLMLlamaMLP
LlamaModelLlamaRotaryEmbeddingeager_attention_forwardrotate_half   )
OlmoConfigc                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )OlmoLayerNormz/LayerNorm but with no learnable weight or bias.hidden_sizereturnNc                 2    t         |           |f| _        y N)super__init__normalized_shape)selfr   	__class__s     x/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/olmo/modular_olmo.pyr   zOlmoLayerNorm.__init__   s    !,    hidden_statesc                     |j                   }t        j                  |j                  t        j
                        | j                  d d d      j                  |      S )N)dtypegh㈵>)eps)r#   F
layer_normtotorchfloat32r   )r   r!   
orig_dtypes      r   forwardzOlmoLayerNorm.forward"   sO    "((
||M,,5==,A4CXCXZ^`djnorr
 	
r    )
__name__
__module____qualname____doc__intr   r(   Tensorr+   __classcell__r   s   @r   r   r      s4    9/C /D /
U\\ 
ell 
r    r   c                        e Zd Z fdZ xZS )OlmoMLPc                 J   t         |   |       t        j                  | j                  | j
                  d      | _        t        j                  | j                  | j
                  d      | _        t        j                  | j
                  | j                  d      | _        y )NF)bias)	r   r   nnLinearr   intermediate_size	gate_projup_proj	down_proj)r   configr   s     r   r   zOlmoMLP.__init__*   ss     4#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXr    )r,   r-   r.   r   r2   r3   s   @r   r5   r5   )   s    Y Yr    r5   c                 
   | j                   |j                   }}|j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }	|j                  |      |	j                  |      fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )r#   	unsqueezer   r'   )
qkcossinposition_idsunsqueeze_dimq_typek_typeq_embedk_embeds
             r   apply_rotary_pos_embrK   1   s|    ( WWaggFF
--
&C
--
&C3w;q>C/0G3w;q>C/0G::fwzz&111r    c                      e Zd Z	 	 d	dej                  deej                  ej                  f   deej                     dee   deej                     deej                  eej                     eeej                        f   fdZ	y)
OlmoAttentionNr!   position_embeddingsattention_maskpast_key_valuecache_positionr   c                    |j                   d d }g |d| j                  }| j                  |      }	| j                  |      }
| j	                  |      }| j
                  j                  |	j                  | j
                  j                   | j
                  j                         |
j                  | j
                  j                   | j
                  j                         |j                  | j
                  j                   | j
                  j                         |	j                  |      j                  dd      }	|
j                  |      j                  dd      }
|j                  |      j                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        }| j
                  j                  dk7  rt        | j
                  j                     } || |	|
||f| j                   sdn| j"                  | j$                  d|\  }} |j&                  g |d j)                         }| j+                  |      }||fS )	N)minmaxr   r	   )rD   rC   rQ   eagerg        )dropoutscaling)shapehead_dimq_projk_projv_projr>   clip_qkvclamp_view	transposerK   update	layer_idxr   _attn_implementationr   trainingattention_dropoutrX   reshape
contiguouso_proj)r   r!   rN   rO   rP   rQ   kwargsinput_shapehidden_shapequery_states
key_statesvalue_statesrC   rD   cache_kwargsattention_interfaceattn_outputattn_weightss                     r   r+   zOlmoAttention.forwardN   sB    $))#2.88b8$--8{{=1[[/
{{=1;;+T[[%9%9$9t{{?S?ST4;;#7#7"7T[[=Q=QRT[[%9%9$9t{{?S?ST#((6@@AF__\2<<QB
#((6@@AF&S#7jRUWZ#[ j%#&snUL'5'<'<ZW[WeWegs't$J(?;;++w6"9$++:Z:Z"[$7	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r    )NN)
r,   r-   r.   r(   r1   tupler   r   
LongTensorr+    r    r   rM   rM   M   s     +/592)||2) #5<<#=>2) !.	2)
 !2) !!1!122) 
u||Xell3XeELL>Q5RR	S2)r    rM   c                   (     e Zd Zdedef fdZ xZS )OlmoDecoderLayerr>   rc   c                     t         |   ||       t        |j                        | _        t        |j                        | _        t        ||      | _        y )N)r>   rc   )r   r   r   r   input_layernormpost_attention_layernormrM   	self_attnr   r>   rc   r   s      r   r   zOlmoDecoderLayer.__init__   sF    +,V-?-?@(5f6H6H(I%&f	Jr    )r,   r-   r.   r   r0   r   r2   r3   s   @r   rx   rx      s    Kz Kc K Kr    rx   c                       e Zd Zd Zy)OlmoRotaryEmbeddingc                    | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        j                  |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	||	fcd d d        S # 1 sw Y   y xY w)
Nr   rS   r   mpscpuF)device_typeenabledr	   )dim)inv_freqfloatexpandrY   r'   device
isinstancetypestrr(   autocastra   catrC   attention_scalingrD   )
r   xrE   inv_freq_expandedposition_ids_expandedr   freqsembrC   rD   s
             r   r+   zOlmoRotaryEmbedding.forward   s0    MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk^^UC 	&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C8	 	 	s    BE22E;N)r,   r-   r.   r+   rv   r    r   r   r      s    
r    r   c                   $     e Zd Zdef fdZ xZS )	OlmoModelr>   c           	          t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        t        |j                        | _
        y c c}w r   )r   r   r8   
ModuleListrangenum_hidden_layersrx   layersr   r   normr}   s      r   r   zOlmoModel.__init__   s[     mmBGH`H`BabYfi0b
 "&"4"45	 cs   A1)r,   r-   r.   r   r   r2   r3   s   @r   r   r      s    6z 6 6r    r   c                       e Zd Zy)OlmoForCausalLMN)r,   r-   r.   rv   r    r   r   r      s    r    r   )r   r   OlmoPreTrainedModel)Nr   )(typingr   r   r(   torch.nnr8   torch.nn.functional
functionalr%   torch.utils.checkpointcache_utilsr   modeling_utilsr   utilsr   llama.modeling_llamar
   r   r   r   r   r   r   r   configuration_olmor   
get_loggerr,   loggerModuler   r5   rK   rM   rx   r   r   r   __all__rv   r    r   <module>r      s    %        5 	 	 	 + 
		H	%
BII 
Yh Y283)N 3)lK( K. 6
 6	& 	r    