
    rh}                        d dl mZmZmZmZ d dlZd dlmc mZ	 d dlmZ ddl
mZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZmZ ddlmZ ddl m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z)  e'       r	d dl*m+Z+m,Z, nd\  Z+Z, ed       G d dejZ                               Z. G d dejZ                        Z/ G d dejZ                        Z0 G d de      Z1d Z2d;dZ3d ejh                  d!e5d"ejh                  fd#Z6	 d<d$ejZ                  d%ejh                  d&ejh                  d'ejh                  d(eejh                     d)e7d*e7d+ee!   fd,Z8 G d- d.ejZ                        Z9d/ Z:e+e,fZ; e<e;      Z= G d0 d1ejZ                        Z> G d2 d3e      Z?e" G d4 d5e             Z@e" G d6 d7e@             ZAe" G d8 d9e@e             ZBg d:ZCy)=    )AnyCallableOptionalUnionN)nn   )CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)check_model_inputs)is_causal_conv1d_available   )
Lfm2Config)causal_conv1d_fncausal_conv1d_updateNNRMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )Lfm2RMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z:
        Lfm2RMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      y/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.pyr%   zLfm2RMSNorm.__init__1   s1     	ll5::k#:; #    c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )N   T)keepdim)	dtypetor'   float32powmeanrsqrtr*   r)   )r+   hidden_statesinput_dtypevariances       r/   forwardzLfm2RMSNorm.forward9   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r0   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler)   shaper*   r+   s    r/   
extra_reprzLfm2RMSNorm.extra_repr@   s*    ))*+6$2G2G1HIIr0   )gư>)__name__
__module____qualname__r%   r>   rC   __classcell__r.   s   @r/   r"   r"   /   s    $;Jr0   r"   c                   ^     e Zd Zddef fdZ ej                         ed               Z xZ	S )Lfm2RotaryEmbeddingconfigc                    t         |           t        |d      rUt        |j                  t
              r;|j                  j                  d|j                  j                  d            | _        nd| _        |j                  | _	        |j                  | _
        || _        t        | j                     | _        | j                  | j                  |      \  }| _        | j                  d|d       | j                   | _        y )Nrope_scaling	rope_typetypedefaultinv_freqF)
persistent)r$   r%   hasattr
isinstancerM   dictgetrN   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrK   r   rope_init_fnattention_scalingregister_bufferrQ   original_inv_freq)r+   rK   devicerQ   r.   s       r/   r%   zLfm2RotaryEmbedding.__init__E   s    6>*z&:M:Mt/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q($(ZeD!%r0   c                 b   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        j                  |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   r3   r   mpscpuF)device_typeenabledr2   dim)r5   )rQ   floatexpandrA   r6   r^   rT   rO   strr'   autocast	transposecatcosr[   sinr5   )
r+   xposition_idsinv_freq_expandedposition_ids_expandedrb   freqsembrl   rm   s
             r/   r>   zLfm2RotaryEmbedding.forwardV   sV    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk^^UC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s    BF%%F.N)
rD   rE   rF   r   r%   r'   no_gradr   r>   rG   rH   s   @r/   rJ   rJ   D   s3    /z /" U]]_<  <r0   rJ   c                   *     e Zd Zdef fdZd Z xZS )Lfm2MLPrK   c                    t         |           |j                  }|j                  rat	        d|z  dz        }|j
                  Dt	        |j
                  |z        }|j                  ||j                  z   dz
  |j                  z  z  }t        j                  |j                  |d      | _
        t        j                  |j                  |d      | _        t        j                  ||j                  d      | _        y )Nr2   r   r   Fbias)r$   r%   intermediate_sizeblock_auto_adjust_ff_dimintblock_ffn_dim_multiplierblock_multiple_ofr   Linearr,   w1w3w2)r+   rK   r{   r.   s      r/   r%   zLfm2MLP.__init__g   s    "44** #A(9$9A$= >..:$'(G(GJ[([$\!$*$<$<&)A)AAAE&JbJbb%! ))F..0AN))F..0AN))-v/A/ANr0   c                     | j                  t        j                  | j                  |            | j	                  |      z        S rt   )r   Fsilur   r   )r+   rn   s     r/   r>   zLfm2MLP.forwardv   s/    wwqvvdggaj)DGGAJ677r0   )rD   rE   rF   r   r%   r>   rG   rH   s   @r/   rw   rw   f   s    Oz O8r0   rw   c                   Z   e Zd ZdZdZdZdZdZej                  dfde
dedej                  deej                  edf   fdZ	 dd	ej"                  d
ej"                  dedeeeef      deej"                  ej"                  f   f
dZdej.                  fdZddee   defdZdej"                  dedeeef   fdZdefdZdedeej"                  ej"                  f   fdZdeeej"                     eej"                     f   fdZeddeeeej>                           ddfd       Z d Z!y)Lfm2HybridConvCachea  
    Attention and conv cache for Lfm2.

    It stores the Key and Value states as a list of tensors, one for each layer.
    Attention layer cache shape: `[batch_size, num_heads, seq_len, head_dim]`.
    Conv layer cache shape: `[batch_size, hidden_size, L_cache-1]`.
    NFrK   max_batch_sizer5   r^   c                 ,   g | _         g | _        || _        |j                  | _        | j                  j	                  d      | _        |j                  | _        || _        g | _        |t        j                  |      nd }t        |j                        D ]~  }t        j                  | j                  |j                  | j                  | j                  |      }t        j                  j!                  |       | j                  j#                  |        y )Nfull_attention)r5   r^   )	key_cachevalue_cacher   layer_typesindexfirst_attention_layerconv_L_cache_dtype
conv_cacher'   r^   rangenum_hidden_layerszerosr,   _dynamomark_static_addressappend)r+   rK   r   r5   r^   _
conv_states          r/   r%   zLfm2HybridConvCache.__init__   s     ,!--%)%5%5%;%;<L%M""//.0)/);f%v//0 		/A##""!!kkJ MM--j9OO"":.		/r0   
key_statesvalue_states	layer_idxcache_kwargsreturnc                 &   |qt        | j                        |k  rt        t        | j                        |      D ]^  }| j                  j                  t	        j
                  g              | j                  j                  t	        j
                  g              ` | j                  j                  |       | j                  j                  |       n| j                  |   j                         s|| j                  |<   || j                  |<   nft	        j                  | j                  |   |gd      | j                  |<   t	        j                  | j                  |   |gd      | j                  |<   | j                  |   | j                  |   fS )a  
        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.

        Parameters:
            key_states (`torch.Tensor`):
                The new key states to cache.
            value_states (`torch.Tensor`):
                The new value states to cache.
            layer_idx (`int`):
                The index of the layer to cache the states for.
            cache_kwargs (`Dict[str, Any]`, `optional`):
                Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`.

        Return:
            A tuple containing the updated key and value states.
        rd   )	lenr   r   r   r'   tensorr   numelrk   )r+   r   r   r   r   r   s         r/   updatezLfm2HybridConvCache.update   sB   0 !4>>"i/s4>>2I> >ANN))%,,r*:;$$++ELL,<=> %%j1  ''5NN9-335,6y).:  +,1IIt~~i7PR\6]ce,fy).3ii9I9I)9TVb8cik.l  +~~i($*:*:9*EEEr0   beam_idxc                 D   t        t        | j                              D ]  }| j                  |   j                  }| j                  |   j	                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j	                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j	                  d|j                  |            | j                  |<    y)zDReorders the cache for beam search, given the selected beam indices.r   N)r   r   r   r^   index_selectr6   r   r   )r+   r   r   r^   s       r/   reorder_cachez!Lfm2HybridConvCache.reorder_cache   s    s4>>23 	iI^^I.55F(,y(A(N(NqRZR]R]^dRe(fDNN9%%%i077F*.*:*:9*E*R*RSTV^VaVabhVi*jDY'__Y/66F)-)C)P)PQRT\T_T_`fTg)hDOOI&	ir0   c                     | j                   |   dk7  r| j                  n|}t        | j                        |k  s | j                  |   j	                         dk(  ry| j                  |   j
                  d   S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   r   r   )r   r   r   r   r   rA   r+   r   s     r/   get_seq_lengthz"Lfm2HybridConvCache.get_seq_length   sm     372B2B92MQa2aD..gp	t~~)+t~~i/H/N/N/PTU/U~~i(..r22r0   cache_positionc                 V    d}|j                   d   }| j                         }||z   }||fS )aB  
        Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
        the given layer at `layer_idx`.
        The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns (i.e. sliding_window, chunk_size),
        for each layer.
        r   )rA   r   )r+   r   r   full_mask_kv_offsetquery_lengthpast_seen_tokens	kv_lengths          r/   get_mask_sizesz"Lfm2HybridConvCache.get_mask_sizes   s@      %++A...0 #33	---r0   
max_lengthc                    |dk  r| j                         t        |      z
  }| j                         |k  ryt        t        | j                              D ]l  }| j                  |   j                         s!| j                  |   dd|ddf   | j                  |<   | j                  |   dd|ddf   | j                  |<   n y)z"Crop the cache to the given lengthr   N.)r   absr   r   r   r   r   )r+   r   idxs      r/   cropzLfm2HybridConvCache.crop   s    >,,.Z@J J.T^^,- 	SC~~c"((*&*nnS&9#{
{A:M&Ns#(,(8(8(=c;J;PQ>Q(R  %	Sr0   c                 >    | j                   |   | j                  |   fS rt   )r   r   r   s     r/   __getitem__zLfm2HybridConvCache.__getitem__   s!    ~~i($*:*:9*EEEr0   c                     t        d      Nz<Lfm2HybridConvCache does not have a legacy cache equivalent.NotImplementedErrorrB   s    r/   to_legacy_cachez#Lfm2HybridConvCache.to_legacy_cache  s    !"`aar0   past_key_valuesr
   c                     t        d      r   r   )clsr   s     r/   from_legacy_cachez%Lfm2HybridConvCache.from_legacy_cache  s    !"`aar0   c                     t        t        | j                              D ]  }| j                  |   j                          ! y rt   )r   r   r   zero_r   s     r/   resetzLfm2HybridConvCache.reset  s4    s4??34 	/IOOI&,,.	/r0   rt   )r   )"rD   rE   rF   __doc__r   is_compileabler   r   r'   r7   r   r}   r5   r   r^   rh   r%   Tensorr   rU   r   r@   r   
LongTensorr   r   r   r   r   r   classmethodFloatTensorr   r    r0   r/   r   r   z   s    NNIK #]]15// / {{	/
 ellC-./D 26)FLL)F ll)F 	)F
 tCH~.)F 
u||U\\)	*)FV	ie&6&6 	i3 3c 3.U\\ .c .eTWY\T\o .Ss SFS FU5<<3M-N FbuU\\':E%,,<O'O!P b buUEVEV?W9X0Y bes b b/r0   r   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..Nr3   r2   rd   )rA   r'   rk   )rn   x1x2s      r/   rotate_halfr     sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r0   c                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkrl   rm   ro   unsqueeze_dimq_embedk_embeds           r/   apply_rotary_pos_embr     sY    ( --
&C
--
&C3w;q>C/0G3w;q>C/0GGr0   r;   n_repr   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rA   rg   reshape)r;   r   batchnum_key_value_headsslenhead_dims         r/   	repeat_kvr   0  so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr0   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr2   r   r   r3   )re   r5   )ptrainingr   )r   num_key_value_groupsr'   matmulrj   rA   r   
functionalsoftmaxr7   r6   r5   r   r   
contiguous)r   r   r   r   r   r   r   r   r   r   attn_weightscausal_maskattn_outputs                r/   eager_attention_forwardr   <  s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r0   c                   ,    e Zd ZdZdedef fdZ	 	 ddej                  de	ej                  ej                  f   de
ej                     de
e   d	e
ej                     d
e	ej                  e
ej                     e
e	ej                        f   fdZ xZS )Lfm2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperrK   r   c                    t         |           || _        || _        t	        |d|j
                  |j                  z        | _        |j                  |j                  z  | _	        | j                  dz  | _
        d| _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j                  | j                  z  |j
                  d      | _        t%        | j                  |j&                        | _        t%        | j                  |j&                        | _        y )Nr   g      TFry   r-   )r$   r%   rK   r   getattrr,   num_attention_headsr   r   r   r   	is_causalr   r   q_projk_projv_projout_projr"   norm_epsq_layernormk_layernormr+   rK   r   r.   s      r/   r%   zLfm2Attention.__init__Y  sL   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*ii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejk		&"<"<t}}"LfN`N`glm&t}}&//J&t}}&//Jr0   r;   position_embeddingsr   past_key_valuer   r   c                 4   |j                   d d }g |d| j                  }| j                   | j                  |      j                  |       j                  dd      }	| j                   | j                  |      j                  |       j                  dd      }
 | j                  |      j                  | j                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        }| j                  j                  dk7  rt        | j                  j                     } || |	|
||fd| j                   d|\  }} |j"                  g |d j%                         }| j'                  |      }||fS )Nr3   r   r2   )rm   rl   r   eager        )r   r   )rA   r   r  r   viewrj   r  r   r   r   r   r   r   rK   _attn_implementationr   r   r   r   r  )r+   r;   r  r   r  r   r   input_shapehidden_shapequery_statesr   r   rl   rm   r   attention_interfacer   r   outputs                      r/   r>   zLfm2Attention.forwardh  s    $))#2.88b8$--8''(GM(B(G(G(VWaabcefg%%&Edkk-&@&E&E|&TU__`acde
6t{{=166EOOPQSTU&S#7jRUWZ#[ j%#&snUL'5'<'<ZW[WeWegs't$J(?;;++w6"9$++:Z:Z"[$7	%
 LL	%
 	%
!\ *k));;;;FFH{+|##r0   r   )rD   rE   rF   r   r   r}   r%   r'   r   r@   r   r   r   r>   rG   rH   s   @r/   r   r   V  s    GKz Kc K( 9=59'$||'$ #5<<#=>'$ !.	'$
 !!45'$ !!1!12'$ 
u||Xell3XeELL>Q5RR	S'$r0   r   c                     |N|j                   d   dkD  r<|j                   d   dkD  r*| j                  }| |dddddf   z  j                  |      } | S )zm
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    Nr   r   )rA   r5   r6   )r;   r   r5   s      r/   apply_mask_to_padding_statesr    sa     !n&:&:1&=&AnFZFZ[\F]`aFa##&1d
)CCGGNr0   c            
       r    e Zd Zdedef fdZ	 	 	 ddej                  dee	   deej                     deej                     fdZ	 	 	 ddej                  dee	   deej                     deej                     fd	Z	 	 	 dd
ej                  dee	   deej                     deej                     fdZ xZS )Lfm2ShortConvrK   r   c           	      2   t         |           || _        || _        |j                  | _        |j                  | _        t        j                  |j                  |j                  | j
                  |j                  | j                  | j
                  dz
        | _        t        j                  |j                  d|j                  z  | j                        | _        t        j                  |j                  |j                  | j                        | _        y )Nr   )in_channelsout_channelskernel_sizegroupsrz   paddingr   ry   )r$   r%   rK   r   r   L_cache	conv_biasrz   r   Conv1dr,   convr   in_projr  r  s      r/   r%   zLfm2ShortConv.__init__  s    
 	"**$$	II**++%%LL1$
	 yy!3!3Q9K9K5KRVR[R[\		&"4"4f6H6HtyyYr0   rn   r  r   r   c                    t        ||      }| j                  |      j                  dd      }|j                  dd      \  }}}||z  }| j                  j
                  j                  | j                  j
                  j                  d      | j                  j
                  j                  d            }	|c|d   dkD  r[t        |j                  d      |j                  | j                     |	| j                  j                  d       }
|
j                  d      }
n|dt        j                  j!                  || j"                  |j$                  d   z
  df      }|j                  | j                     j'                  |       t)        ||	| j                  j                  d       }
||
z  }| j+                  |j                  dd      j-                               }|S )Nr3   r   r   rd   r   r2   )
activation)r  r   rj   chunkr  r)   r  sizer   squeezer   r   rz   r   r   r   padr  rA   copy_r   r  r   )r+   rn   r  r   r   BCxBCBxconv_weightsconv_outr   ys                r/   cuda_kernels_forwardz"Lfm2ShortConv.cuda_kernels_forward  s    )N;ll1o''B/))A2)&1aUyy'',,TYY-=-=-B-B1-EtyyGWGWG\G\]^G_`%.*;a*?+

2))$..9		H  ))"-H)]]..rDLL288B<4OQR3ST
))$..9??
K'L$))..UYZHLMM!++b"-88:;r0   c                    |j                   d   }t        ||      }| j                  |      j                  dd      }|j	                  dd      \  }}}||z  }	|5|d   dkD  r,|j
                  | j                     }
|j                  d| j                  dz
        }|
j                  dd      }
|	j                  |
j                  |
j                        |
d d d d |f<   |j
                  | j                     j                  |
       t        j                  |
j                  |	j                        | j                   j"                  d d dd d f   z  d      }| j$                  r|| j                   j$                  z  }|j'                  d      }n~|dt(        j*                  j-                  |	| j                  |	j                   d   z
  df      }
|j
                  | j                     j                  |
       | j!                  |	      d	d |f   }||z  }|j                  dd      j/                         }| j1                  |      }|S )
Nr   r3   r   r   rd   r   )shiftsdims)r^   r5   .)rA   r  r   rj   r#  r   r   clampr  rollr6   r^   r5   r'  r'   sumr  r)   rz   r   r   r   r&  r   r  )r+   rn   r  r   r   seqlenr(  r)  r*  r+  r   r-  r.  s                r/   slow_forwardzLfm2ShortConv.slow_forward  s    (N;ll1o''B/))A2)&1aU%.*;a*?'224>>BJ+11!T\\A5EFN#<J/1uuJ<M<MU_UeUeu/fJq!^+,%%dnn5;;JGyyryy!9DII<L<LQPQSTW<U!U[]^HyyDIINN*))"-H)]]..rDLL288B<4OQR3ST
))$..9??
Kyy}S'6'\2HLKKB**,MM!r0   r;   c                     t         rJd|j                  j                  v r2t        j                  j                         s| j                  ||||      S | j                  ||||      S )Ncuda)is_fast_path_availabler^   rO   r'   r   is_compilingr/  r7  )r+   r;   r  r   r   s        r/   r>   zLfm2ShortConv.forward   s\     "f0D0D0I0I&IRWR_R_RlRlRn,,]NN\jkk  P^__r0   )NNN)rD   rE   rF   r   r}   r%   r'   r   r   r   r   r/  r7  r>   rG   rH   s   @r/   r  r    s    ZZ Z2 9=5915 <<  !!45  !!1!12	 
 !. J 9=5915$<<$ !!45$ !!1!12	$
 !.$R 9=5915	`||	` !!45	` !!1!12		`
 !.	`r0   r  c                       e Zd Zdedef fdZ	 	 	 	 ddej                  deej                  ej                  f   de	ej                     de	ej                     de	eej                        d	e	ej                     d
ej                  fdZ xZS )Lfm2DecoderLayerrK   r   c                 f   t         |           |j                  |   dk(  | _        | j                  rt	        ||      | _        nt        ||      | _        t        |      | _	        t        |j                  |j                        | _        t        |j                  |j                        | _        y )Nr   r   )r$   r%   r   is_attention_layerr   	self_attnr  r  rw   feed_forwardr"   r,   r  operator_normffn_normr  s      r/   r%   zLfm2DecoderLayer.__init__  s    "("4"4Y"?CS"S""*69=DN%fi8DI#FO(););Q#F$6$6FOOLr0   r;   r  r   ro   r  r   r   c           
         |}| j                   r, | j                  d| j                  |      |||||d|\  }}	n$| j                  | j                  |      |||      }||z   }|| j	                  | j                  |            z   }|S )N)r;   r  r   ro   r  r   )r;   r  r   r   r   )r?  r@  rB  r  rA  rC  )
r+   r;   r  r   ro   r  r   r   residualr   s
             r/   r>   zLfm2DecoderLayer.forward  s     !""-t~~  "00?$7-)--   M1 !II"00?---	 & M &0%(9(9$--:V(WWr0   )NNNN)rD   rE   rF   r   r}   r%   r'   r   r@   r   r   r>   rG   rH   s   @r/   r=  r=    s    
Mz 
Mc 
M  26378<59|| #5<<#=> !.	
 u//0 !u||!45 !!1!12 
r0   r=  c                   J    e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZdZeedZy)	Lfm2PreTrainedModelrK   modelTr=  r   F)r;   
attentionsN)rD   rE   rF   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr=  r   _can_record_outputsr   r0   r/   rG  rG  ;  sQ    &*#+,#4"5N""&)#r0   rG  c                       e Zd Zdef fdZee	 	 	 	 	 	 	 ddeej                     deej                     deej                     dee   deej                     dee   d	eej                     d
ee   defd              Z xZS )	Lfm2ModelrK   c           	      ,   t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |      | _        d| _        t        |      | _        t%        |j                  |j&                        | _        | j+                          y c c}w )N)rK   Fr   )r$   r%   pad_token_idpadding_idx
vocab_sizer   	Embeddingr,   embed_tokens
ModuleListr   r   r=  layersrJ   
rotary_embgradient_checkpointingpos_embr"   r  embedding_norm	post_initr  s      r/   r%   zLfm2Model.__init__O  s     !.. ++LL):):F<N<NPTP`P`ammBGH`H`BabYfi0b
 .V<&+#*62)&*<*<&//R 	 cs   D	input_idsr   ro   r   inputs_embeds	use_cacher   r   r   c           
         |d u |d uz  rt        d      || j                  |      }|r>|<|j                  d   }	t        | j                  |	| j
                  | j                        }|F||j                         nd}
t        j                  |
|
|j                  d   z   |j                        }||j                  d      }t        | j                  |||||      }|}| j                  ||      }| j                  d | j                  j                   D ]  } ||f|||||d|} | j                  |      }t!        ||      S )	Nz:You must specify exactly one of input_ids or inputs_embedsr   )rK   r   r5   r^   r   )r^   )rK   input_embedsr   r   r   ro   )r   ro   r  r   r  )last_hidden_stater   )
ValueErrorr\  rA   r   rK   r5   r^   r   r'   aranger   r   ra  r^  r   rb  r   )r+   rd  r   ro   r   re  rf  r   r   
batch_sizer   r   r;   r  decoder_layers                  r/   r>   zLfm2Model.forward`  s    -t";<YZZ  --i8M0&,,Q/J1{{:TZZX\XcXcO !CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(;;&))+%
 &"ll=,G "[[)H4;;+H+HI 		M)*).-$7 M		 ++M:&++
 	
r0   )NNNNNNN)rD   rE   rF   r   r%   r   r   r   r'   r   r   r   r   boolr   r   r   r>   rG   rH   s   @r/   rV  rV  M  s    z "  1515379=59$(59=
E,,-=
 !.=
 u//0	=

 ""56=
   1 12=
 D>=
 !!1!12=
 +,=
 
!=
  =
r0   rV  c                   p    e Zd ZdgZddiZddgdgfiZ fdZd Zd Ze	e
	 	 	 	 	 	 	 	 	 dd	eej                     d
eej                     deej                     dee   deej                      deej                     dee   deej                     deeej                  f   dee   defd              Z xZS )Lfm2ForCausalLMzlm_head.weightlm_headcolwise_repr;   logitsc                     t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y )NFry   )
r$   r%   rV  rH  rZ  r   r   r,   rq  rc  )r+   rK   r.   s     r/   r%   zLfm2ForCausalLM.__init__  sU     v&
 ++yy!3!3V5F5FUS 	r0   c                     || _         y rt   rH  )r+   decoders     r/   set_decoderzLfm2ForCausalLM.set_decoder  s	    
r0   c                     | j                   S rt   rv  rB   s    r/   get_decoderzLfm2ForCausalLM.get_decoder  s    zzr0   rd  r   ro   r   re  labelsrf  r   logits_to_keepr   r   c
                 z    | j                   d|||||||d|
}|j                  }t        |	t              rt	        |	 d      n|	}| j                  |dd|ddf         }d}|* | j                  d||| j                  j                  d|
}t        |||j                  |j                  |j                        S )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Lfm2ForCausalLM

        >>> model = Lfm2ForCausalLM.from_pretrained("meta-lfm2/Lfm2-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-lfm2/Lfm2-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)rd  r   ro   r   re  rf  r   N)rs  r{  rZ  )lossrs  r   r;   rI  r   )rH  ri  rT   r}   slicerq  loss_functionrK   rZ  r   r   r;   rI  )r+   rd  r   ro   r   re  r{  rf  r   r|  r   outputsr;   slice_indicesrs  r~  s                   r/   r>   zLfm2ForCausalLM.forward  s    @ ,64:: 	,
)%+')	,
 	,
  118B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopD%#33!//))
 	
r0   )	NNNNNNNNr   )rD   rE   rF   _tied_weights_keys_tp_plan_pp_planr%   rx  rz  r   r   r   r'   r   r   r	   r   rn  r   r}   r   r   r   r>   rG   rH   s   @r/   rp  rp    s:   *+=)H_-z:;H  151537+/59-1$(59348
E,,-8
 !.8
 u//0	8

 "%8
   1 128
 ))*8
 D>8
 !!1!128
 c5<</08
 +,8
 
 8
  8
r0   rp  )rp  rV  rG  )Nr   )r
  )Dtypingr   r   r   r   r'   torch.nn.functionalr   r   r   cache_utilsr	   r
   
generationr   integrationsr   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   utils.import_utilsr   configuration_lfm2r   causal_conv1dr   r   Moduler"   rJ   rw   r   r   r   r   r}   r   rf   r   r   r  kernel_modulesallr:  r  r=  rG  rV  rp  __all__r   r0   r/   <module>r     s  ( 2 1     . ) 7 / 9 O K F & I I / < * DD-7** Y'J")) J (J(<")) <D8bii 8(Q/, Q/h(6	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % % '(%49$BII 9$x #$89^, h`BII h`V,1 ,^ /  " Q
# Q
 Q
h N
)? N
 N
b Br0   