
    rh                     r   d Z ddlZddlmZ ddlmZ ddlmZmZ ddl	Z	ddl
Z	ddl	mZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZmZ ddlmZ  ej6                  e      Zdad Z G d de	j@                  jB                        Z"d(dZ#d(dZ$ G d dejJ                        Z& G d dejJ                        Z' G d de      Z(e G d de             Z)e ed       G d de                    Z*e ed       G d  d!e                    Z+e G d" d#e)             Z, ed$       G d% d&e)e             Z-g d'Z.y))zPyTorch RWKV model.    N)	dataclass)Path)OptionalUnion)nn   )GenerationMixin)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringis_bitsandbytes_availableis_ninja_availableis_torch_cuda_availablelogging   )
RwkvConfigc                    ddl m} t        t              j	                         j
                  j
                  j
                  dz  dz  }dD cg c]  }||z  	 }}t        t        j                  | k(  ry t        j                  d|  d       dd	d
dddd|  g} |d|  |t        j                         t        j                  k(  |      a| t        _        y c c}w )Nr   )loadkernelsrwkv)z
wkv_op.cppzwkv_cuda.cuzwkv_cuda_bf16.cuz2Loading CUDA kernel for RWKV at context length of .z
-res-usagez--maxrregcount 60z--use_fast_mathz-O3z-Xptxas -O3z--extra-device-vectorizationz-DTmax=wkv_)namesourcesverboseextra_cuda_cflags)torch.utils.cpp_extensionr   r   __file__resolveparentrwkv_cuda_kernelmax_seq_lengthloggerinfor   get_verbosityDEBUG)context_lengthload_kernelkernel_folderfcuda_kernel_filesflagss         y/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/rwkv/modeling_rwkv.pyload_wkv_cuda_kernelr/   /   s    = N**,33::AAIMPVVM4efq*ff #(8(G(G>(Y
KKD^DTTUVW 	&
.!"E #N#$!&&(GMM9	 '5#/ gs   Cc                   0    e Zd Zedd       Zedd       Zy)RwkvLinearAttentionNc                    |j                         \  }}}	|t        j                  kD  r t        d| dt        j                   d      ||	z  t	        |	d      z  dk7  rt        d| d|	 dt	        |	d       d	      |j
                  | _        |j                  j                  d
k7  sK|j                  j                  d
k7  s2|j                  j                  d
k7  s|j                  j                  d
k7  rt        d      t        j                  |j                         j                                }|j
                  t        j                  k(  r0|j                         }|j                         }|j                         }|j                         }|j                         }|j                         }t        j                  |t        j                        }
|s||Vt        j                   ||	dt        j"                  |j                  t        j                        }|d d d d dfxx   dz  cc<   nBt        j$                  |D cg c]  }|j'                  d       c}d      j                         }|j
                  t        j(                  k(  rt        j*                  }nt        j,                  } ||||||
|       nI|j
                  t        j(                  k(  rt        j.                  nt        j0                  } ||||||
       | j3                  |||||
       |4t        j4                  |dd      D cg c]  }|j7                  d       }}|
j9                  | j                        |fS c c}w c c}w )NzCannot process a batch with z+ tokens at the same time, use a maximum of z with this model.    r   zThe product of batch size (z) and hidden size (z") needs to be a round multiple of r   cudazUCalling the CUDA kernel for wkv attention requires all tensors to be on CUDA devices.memory_formatr   )dtypedevicer6      籡*G)dim)sizer"   r#   
ValueErrorminr7   input_dtyper8   typetorchexpfloat
contiguousfloat16
empty_likecontiguous_formatzerosfloat32cat	unsqueezebfloat16forward_with_state_bf16forward_with_stateforward_bf16forwardsave_for_backwardchunksqueezeto)ctx
time_decay
time_firstkeyvaluestatereturn_state
batch_sizeseq_lenhidden_sizeoutputsforward_funcs                r.   rP   zRwkvLinearAttention.forwardP   s   +.88:(
G[%444.wi7b#2233DF  #c+r&::a?-j\9L[M Z";34A7 
 )) ""f,  %%/zz&(||  F*tuuii
 0 0 2 = = ?@@
99%#))+J))+CKKME**,
nn  "!!#U5L5LM5,}--::"'"9"9 aAg$&		5"Aa1;;q>"AqITTVyyENN*/GG/BBZeVUK<?II<W+88]m]u]uLZeVDj*c5&I+0;;uaQ+GHaQYYq\HEHyy)500 #B Is   ?M+4M0c                 <   | j                   }| j                  \  }}}}}t        j                  |t        j                  |t        j
                  k(  rt        j
                  nt        j                        }	t        j                  |t        j                        }
t        j                  |t        j                        }t        j                  |t        j                        }|t        j                  k(  r|j                         }|t        j
                  k(  rt        j                  nt        j                  } |||||||j                         |	|
||
       |	j                  |      |
j                  |      |j                  |      |j                  |      d d fS )N)r6   r7   r5   )r?   saved_tensorsrA   rF   rG   rL   rI   rE   rC   r"   backward_bf16backwardrD   rT   )rU   g_outputg_stater?   rV   rW   rX   rY   r_   g_time_decayg_time_firstg_keyg_valuebackward_funcs                 r.   re   zRwkvLinearAttention.backward   sJ    oo585F5F2
JUF''11$/5>>$A%..u}}

 ''
%BYBYZ  E4K4KL""58O8OP%--'~~'H:E:W(66]m]v]v!	
 OOK(OOK(HH[!JJ{#
 	
    NFN)__name__
__module____qualname__staticmethodrP   re    rm   r.   r1   r1   O   s)    <1 <1| %
 %
rm   r1   c                    |j                         \  }}}t        j                  |      }|t        j                  |d d df   t        j                        }	t        j                  |d d df   t        j                        }
t        j                  |d d df   t        j                        dz
  }n|\  }	}
}t        j                  |        } t        |      D ]  }|d d |f   j                         }|d d |f   }t        j                  |||z         }t        j                  ||z
        }t        j                  ||z   |z
        }||	z  ||z  z   }||
z  |z   }||z  j                  |j                        |d d |f<   t        j                  || z   |      }t        j                  || z   |z
        }t        j                  ||z
        }||	z  ||z  z   }	||
z  |z   }
|} |s||	|
|g}||fS )Nr   )r7   r:   )
r<   rA   
zeros_likerI   rB   rangerC   maximumrT   r7   )rV   rW   rX   rY   rZ   r[   _
seq_lengthr_   	num_state	den_state	max_statecurrent_indexcurrent_keycurrent_valuemax_for_outpute1e2	numeratordenominatormax_for_states                        r.   rwkv_linear_attention_cpur      s    xxzAz1c"F}$$SAYemmD	$$SAYemmD	$$SAYemmDtK	*/'	9i
 ))J''Jz* "!]*+113a./ y+
2JKYYy>12YY{Z/.@ANR-%77	9nr)$-$;#?#?#Mq-  i*&<kJYYy:-=>YY{]23NR-%77	NR'	!	%"( u(Iy15=rm   c                     t        d | |||fD              }|j                  d      dk(  }t        |s|rt        | |||||      S t        j                  | |||||      S )Nc              3   N   K   | ]  }|j                   j                  d k7    yw)r4   N)r8   r@   ).0ts     r.   	<genexpr>z(rwkv_linear_attention.<locals>.<genexpr>   s     Xa!((--6)Xs   #%r   rZ   r[   )anyr<   r"   r   r1   apply)rV   rW   rX   rY   rZ   r[   no_cuda	one_tokens           r.   rwkv_linear_attentionr      sm    XJ
CQV3WXXG q I7i(ZeSXgstt"((ZeUT`aarm   c                   0     e Zd Zd fd	ZddZddZ xZS )RwkvSelfAttentionc                 r   t         |           || _        t        d uxr t        j                  |j
                  k(  }t               r"t               r|s	 t        |j
                         || _        |j                  }|j                  |j                  n|}|| _        t        j                   t#        j$                  |            | _        t        j                   t#        j$                  |            | _        t        j                   t#        j$                  dd|            | _        t        j                   t#        j$                  dd|            | _        t        j                   t#        j$                  dd|            | _        t        j0                  d      | _        t        j4                  ||d      | _        t        j4                  ||d      | _        t        j4                  ||d      | _        t        j4                  ||d      | _        y # t        $ r t        j                  d       Y w xY w)Nz9Could not load the custom CUDA kernel for RWKV attention.r   r   r   r   Fbias)super__init__configr"   r#   r(   r   r   r/   	Exceptionr$   r%   layer_idr^   attention_hidden_sizer   	ParameterrA   emptyrV   rW   time_mix_keytime_mix_valuetime_mix_receptance	ZeroPad2d
time_shiftLinearrX   rY   
receptancer_   )selfr   r   kernel_loadedr^   r   	__class__s         r.   r   zRwkvSelfAttention.__init__   s   (4q9I9X9X\b\q\q9q$;$=mY$V%:%:; !((,2,H,H,TF((Ze 	 &;",,u{{3H'IJ,,u{{3H'IJLLQ;)GH ll5;;q![+IJ#%<<Aq+0N#O ,,}599[*?eLYY{,AN
))K1FUSii 5{O)  YWXYs   H H65H6c                 p   |j                  d      dk(  r||d   d d d d | j                  f   }n3| j                  |      }| |d   d d d d | j                  f   |d d df<   || j                  z  |d| j                  z
  z  z   }|| j                  z  |d| j                  z
  z  z   }|| j
                  z  |d| j
                  z
  z  z   }| j                  |      }| j                  |      }t        j                  | j                  |            }| |d d df   |d   d d d d | j                  f<   ||||fS Nr   r   r   )r<   r   r   r   r   r   rX   rY   rA   sigmoidr   )r   hiddenrZ   shiftedrX   rY   r   s          r.   extract_key_valuez#RwkvSelfAttention.extract_key_value  s<   ;;q>Q5#4Ahq!T]]23Goof-G  %aAt}})< =1t(((7a$:K:K6K+LL,,,w!d>Q>Q:Q/RRd666AH`H`D`9aa
hhsm

5!]]4??:#>?
,21b5ME!HQ4==()3u,,rm   c                      j                  ||      \  }}}}|t         fd|dd  D              nd }t         j                   j                  ||||      \  }}|T|d   |d   d d d d  j
                  f<   |d   |d   d d d d  j
                  f<   |d   |d   d d d d  j
                  f<    j                  ||z        |fS )	NrZ   c              3   J   K   | ]  }|d d d d j                   f     y wro   r   )r   r`   r   s     r.   r   z,RwkvSelfAttention.forward.<locals>.<genexpr>$  s!     FqAaDMM12Fs    #r9   r   r   r   r      )r   tupler   rV   rW   r   r_   )	r   r   rZ   	use_cacher   rX   rY   layer_stater   s	   `        r.   rP   zRwkvSelfAttention.forward"  s    (,(>(>vU(>(S%
CJOJ[eFE!"IFFae1OOOO"
k ",7NE!HQ4==(),7NE!HQ4==(),7NE!HQ4==(){{:,-u44rm   r   ro   rn   )rp   rq   rr   r   r   rP   __classcell__r   s   @r.   r   r      s    P<-&5rm   r   c                   (     e Zd Zd fd	ZddZ xZS )RwkvFeedForwardc                 B   t         |           || _        || _        |j                  }|j
                  |j
                  nd|j                  z  }t        j                  d      | _        t        j                  t        j                  dd|            | _        t        j                  t        j                  dd|            | _        t        j                  ||d      | _        t        j                  ||d      | _        t        j                  ||d      | _        y )Nr   r   r   Fr   )r   r   r   r   r^   intermediate_sizer   r   r   r   rA   r   r   r   r   rX   r   rY   )r   r   r   r^   r   r   s        r.   r   zRwkvFeedForward.__init__7  s     (((.(@(@(LF$$RSV\VhVhRh 	 ,,}5LLQ;)GH#%<<Aq+0N#O 99[*;%H))K5IYY0+EJ
rm   c                 z   |j                  d      dk(  r||d   d d d d | j                  f   }n3| j                  |      }| |d   d d d d | j                  f   |d d df<   || j                  z  |d| j                  z
  z  z   }|| j                  z  |d| j                  z
  z  z   }t        j                  t        j                  | j                  |                  }| j                  |      }t        j                  | j                  |            }| |d d df   |d   d d d d | j                  f<   ||z  |fS r   )r<   r   r   r   r   rA   squarerelurX   rY   r   r   )r   r   rZ   r   rX   r   rY   s          r.   rP   zRwkvFeedForward.forwardH  s)   ;;q>Q5#4Ahq!T]]23Goof-G  %aAt}})< =1t(((7a$:K:K6K+LLd666AH`H`D`9aa
ll5::dhhsm45

3]]4??:#>?
,21b5ME!HQ4==()E!5((rm   r   ro   rp   rq   rr   r   rP   r   r   s   @r.   r   r   6  s    K")rm   r   c                   &     e Zd Z fdZddZ xZS )	RwkvBlockc                    t         |           || _        || _        |dk(  r0t	        j
                  |j                  |j                        | _        t	        j
                  |j                  |j                        | _	        t	        j
                  |j                  |j                        | _
        t        ||      | _        t        ||      | _        y )Nr   )eps)r   r   r   r   r   	LayerNormr^   layer_norm_epsilonpre_lnln1ln2r   	attentionr   feed_forward)r   r   r   r   s      r.   r   zRwkvBlock.__init__]  s     q=,,v'9'9v?X?XYDK<< 2 28Q8QR<< 2 28Q8QR*68<+FH=rm   c                    | j                   dk(  r| j                  |      }| j                  | j                  |      ||      \  }}||z   }| j	                  | j                  |      |      \  }}||z   }||f}|r||fz  }|S |dz  }|S )Nr   )rZ   r   r   ro   )r   r   r   r   r   r   )r   r   rZ   r   output_attentionsr   r   outputss           r.   rP   zRwkvBlock.forwardk  s    ==A[[(F>>$((6*:%S\>]	5)#"//0@/Ne,&5/	|#G  wGrm   )NFFr   r   s   @r.   r   r   \  s    >rm   r   c                   R    e Zd ZU eed<   dZdgZddgZdZdZ	de
j                  fdZy	)
RwkvPreTrainedModelr   r   r   rV   rW   Tmodulec           	      
   t        |t              rY|j                  }|j                  j                  }|j                  j
                  }|j                  }||dz
  z  }d||z  z
  }t        j                  t        |      D cg c]  }||z  	 c}|j                  j                  |j                  j                        }	|	ddddf   }	t        |      D 
cg c]  }
dd|
|dz
  z  dd|z  z   z  z  z    }}
t        j                  ||j                  j                  |j                  j                        }t        j                  t        |      D cg c]  }|dz   d	z  dz
   c}|j                  j                  |j                  j                        d
z  }||j                  _        t        j                   |j                  t#        j$                  d      z  |z         |j                  _        t        j&                  |	|      |j                  _        t        j&                  |	|      d|z  z   |j(                  _        t        j&                  |	d
|z        |j*                  _        yt        |t,              r|j                  }|j                  j                  }|j                  j
                  }d||z  z
  }t        j                  t        |      D cg c]  }||z  	 c}|j                  j                  |j                  j                        }	|	ddddf   }	t        j&                  |	|      |j                  _        t        j&                  |	|      |j*                  _        yt        |t.        j0                        r|j2                  j                  j4                  }d}d}|j6                  $|j6                  j                  j9                          |d   |d   kD  rt#        j:                  |d   |d   z        }|d   | j                  j<                  k(  r|d   | j                  j
                  k(  rd
}||z  }t.        j>                  jA                  |j2                  |       yt        |t.        jB                        rt|j2                  j                  j4                  }dt#        j:                  tE        |d   |d               z  }t.        j>                  jA                  |j2                  |       yt        |t.        jF                        rJ|j2                  j                  jI                  d       |j6                  j                  j9                          yyc c}w c c}
w c c}w c c}w )zInitialize the weights.r   g      ?r7   r8   N   gffffff?g?r   g      ?g333333?r   )gaing-C6?)%
isinstancer   r   r   num_hidden_layersr^   r   rA   tensorrw   r   r7   r8   rV   rW   data	ones_likemathlogpowr   r   r   r   r   weightshaper   zero_sqrt
vocab_sizeinitorthogonal_	Embeddingmaxr   fill_)r   r   r   r   r^   r   ratio_0_to_1ratio_1_to_almost0itime_weighthdecay_speedzigzagr   r   scales                   r.   _init_weightsz!RwkvPreTrainedModel._init_weights  su   f/0H & ? ? --33K$*$@$@!#'81'<=L!$3D(D!E,,*/*<=Q[=))//**11K
 &dD!m4K 45 Q!4q89sS<EW?WXXXK   ,,{&:K:K:Q:QZ`ZkZkZrZrsK.34I.JKa!eq[1_K ++11!,,33
   &1F"%*__V5F5FRU5VY_5_%`F"',yy>P'QF$).;@R)SVY\hVh)hF!!&.3iiSK]E].^F&&+0H & ? ? --33K!$3D(D!E,,*/*<=Q[=))//**11K
 &dD!m4K',yy>P'QF$.3iiEW.XF&&+		*MM&&,,EDE{{&  &&(Qx%("yyqE!H!45Qx4;;111eAh$++BYBY6YEMDGGD9-MM&&,,E$))Ca%($;<<DGGD9-MM$$S)KK""$ .w > L* >s   UU$U)U.N)rp   rq   rr   r   __annotations__base_model_prefix_no_split_modules_keep_in_fp32_modulessupports_gradient_checkpointing_is_statefulr   Moduler   rt   rm   r.   r   r   ~  s>    $)<8&*#LI%BII I%rm   r   z+
    Class for the RWKV model outputs.
    )custom_introc                       e Zd ZU dZdZeej                     ed<   dZ	ee
ej                        ed<   dZeeej                  df      ed<   dZeeej                  df      ed<   y)
RwkvOutputa  
    state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.
    Nlast_hidden_staterZ   .hidden_states
attentions)rp   rq   rr   __doc__r   r   rA   FloatTensorr   rZ   listr  r   r  rt   rm   r.   r   r     sw     6:x 1 129/3E8D**+,3=AM8E%"3"3S"89:A:>Ju00#567>rm   r   zK
    Base class for causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                        ed<   dZeeej                  df      ed<   dZeeej                  df      ed<   y)	RwkvCausalLMOutputap  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.
    NlosslogitsrZ   .r  r  )rp   rq   rr   r  r  r   rA   r  r   r	  rZ   r  r  r   r  rt   rm   r.   r  r    s     )-D(5$$
%,*.FHU&&'./3E8D**+,3=AM8E%"3"3S"89:A:>Ju00#567>rm   r  c                       e Zd Z fdZd Zd Ze	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deeej                        dee   d	ee   d
ee   dee   deeef   fd       Zd Zd Z xZS )	RwkvModelc           	         t         |   |       t        j                  |j                  |j
                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        j                  |j
                        | _        d| _        d| _        | j!                          y c c}w )Nr   F)r   r   r   r   r   r^   
embeddings
ModuleListrw   r   r   blocksr   ln_outlayers_are_rescaledgradient_checkpointing	post_init)r   r   idxr   s      r.   r   zRwkvModel.__init__   s     ,,v'8'8&:L:LMmmPUV\VnVnPo$pYv%D$pqll6#5#56#( &+# 	 %qs   &C	c                     | j                   S ro   r  r   s    r.   get_input_embeddingszRwkvModel.get_input_embeddings  s    rm   c                     || _         y ro   r  r   new_embeddingss     r.   set_input_embeddingszRwkvModel.set_input_embeddings  s	    (rm   	input_idsattention_maskinputs_embedsrZ   r   r   output_hidden_statesreturn_dictreturnc	           	      *   ||n| j                   j                  }||n| j                   j                  }||n#| j                  s| j                   j                  nd}||n| j                   j
                  }|t        j                  d       | j                  | j                  k(  r| j                          ||t        d      ||t        d      || j                  |      }|r||j                  d      | j                   j                  | j                   j                  f}	t        d      D 
cg c]A  }
t!        j"                  |	|
dk  r|j$                  nt         j&                  |j(                  d	C }}
|d
xx   dz  cc<   | j*                  r%| j                  r|rt        j                  d       d}|}|rdnd}|rdnd}t-        | j.                        D ]o  \  }} |||||      \  }}}| j                  r=| j                   j0                  dkD  r$|dz   | j                   j0                  z  dk(  r|dz  }|r||fz   }|sj||fz   }q | j3                  |      }|r||fz   }|st5        d ||||fD              S t7        ||||      S c c}
w )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        state (tuple of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        use_cache (`bool`, *optional*):
            If set to `True`, the last state is returned and can be used to quickly generate the next logits.
        NFz<`attention_mask` was passed, but it is unused in this model.zDYou cannot specify both input_ids and inputs_embeds at the same timez5You have to specify either input_ids or inputs_embedsr      r   r   r   gꌠ9Y>)FzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...rt   )rZ   r   r   r9   c              3   &   K   | ]	  }||  y wro   rt   )r   xs     r.   r   z$RwkvModel.forward.<locals>.<genexpr>u  s     tqfgfsts   )r   rZ   r  r  )r   r   r   trainingr   use_return_dictr$   warning_oncer  _rescale_layersr=   r  r<   r^   r   rw   rA   rH   r7   rI   r8   r  	enumerater  rescale_everyr  r   r   )r   r  r  r  rZ   r   r   r   r!  r   r   r  all_self_attentionsall_hidden_statesr  blockr  s                    r.   rP   zRwkvModel.forward  s   > 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IZ^ZgZgT[[=R=Rmr	%0%<k$++B]B]% ^_==D444  " ]%>cdd=#8TUU  OOI6M"''*DKK,C,CT[[EbEbcE
 q	  a-"5"5U]][h[o[oE  !HH&&4==##p "	%$5b4"6BD#DKK0 	JJC/4UiSd0,M5*
 ((KK--11W 9 99Q> - 1#$58H$H! &9ZM&I#!	J$ M2 1]4D Dt]E;LNa$bttt++*	
 	
[s   5AJc           	         | j                   | j                   k(  ry | j                  j                  dkD  rt	        j
                         5  t        | j                        D ]  \  }}| j                  r|j                  j                  j                  j                  dt        || j                  j                  z        z         |j                  j                  j                  j                  dt        || j                  j                  z        z         t        |j                  j                  j                  d      r|j                  j                  j                  j                   j#                  dt        || j                  j                  z        z         |j                  j                  j                  j                   j#                  dt        || j                  j                  z        z         t        |j                  j                  j                  d      rN| j%                  |j                  j                  |       | j%                  |j                  j                  |       |j                  j                  j                  j#                  dt        || j                  j                  z        z         |j                  j                  j                  j#                  dt        || j                  j                  z        z          	 d d d        | j                   | _         y # 1 sw Y   xY w)Nr   r9   SCBquant_state)r  r'  r   r,  rA   no_gradr+  r  r   r_   r   mul_intr   rY   hasattrr1  div_ _bnb_4bit_dequantize_and_rescale)r   block_idr/  s      r.   r*  zRwkvModel._rescale_layers~  s`   ##DMM(9:;;$$q( r'0'= rOHe}}..55::1HPTP[P[PiPiDi@j;jk**0077<<Q#hRVR]R]RkRkFkBl=lm #5??#9#9#@#@%H!OO2299==BB1HX\XcXcXqXqLqHrCrs!..44;;??DDQ#hZ^ZeZeZsZsNsJtEtu$U__%;%;%B%BMR AA%//BXBXZbc AA%BTBTBZBZ\de!OO2299>>qCTXT_T_TmTmHmDn?no!..44;;@@c(VZVaVaVoVoJoFpApqrr" (,}}#4 #r rs   
KL77M c                    t               st        d      ddl}|j                  j	                  |j
                  j                  |j
                  j                        }|j                  dt        || j                  j                  z        z         |j                  j                  |j                  d      d      j                  |j                        }t!        |d|       y)	z
        Perform the dequantization and rescaling of the weights of a given layer. After that operation the layer will
        be quantized again.
        z/Please install bitsandbytes to use this method.r   Nr9   cpuF)requires_gradr   )r   ImportErrorbitsandbytes
functionaldequantize_4bitr   r   r2  r7  r5  r   r,  r   
Params4bitrT   r8   setattr)r   target_layerr9  bnbdequant_weightsquant_weights         r.   r8  z*RwkvModel._bnb_4bit_dequantize_and_rescale  s    
 )*OPP"..889L9L9Q9QS_SfSfSrSrsQ#h$++2K2K&K"LLM vv((););E)BRW(X[[\k\r\rsh5rm   )NNNNNNNN)rp   rq   rr   r   r  r  r   r   rA   
LongTensorr  r  boolr   r   r   rP   r*  r8  r   r   s   @r.   r  r    s    )  15595937$(,0/3&*g
E,,-g
 !!1!12g
   1 12	g

 U../0g
 D>g
 $D>g
 'tng
 d^g
 
uj 	!g
 g
R506rm   r  z
    The RWKV Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c                   0    e Zd ZdgZ fdZd Zd ZddZe	 	 	 	 	 	 	 	 	 dde	e
j                     de	e
j                     de	e
j                     d	e	ee
j                        d
e	e
j                     de	e   de	e   de	e   de	e   deeef   fd       Z xZS )RwkvForCausalLMzhead.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y )NFr   )
r   r   r  r   r   r   r^   r   headr  )r   r   r   s     r.   r   zRwkvForCausalLM.__init__  sH     f%	IIf00&2C2C%P	 	rm   c                     | j                   S ro   rL  r  s    r.   get_output_embeddingsz%RwkvForCausalLM.get_output_embeddings  s    yyrm   c                     || _         y ro   rN  r  s     r.   set_output_embeddingsz%RwkvForCausalLM.set_output_embeddings  s	    "	rm   c                 h    ||d d df   j                  d      }||d|i}nd|i}||d<   ||d<   |S )Nr   r  r  rZ   r   )rK   )r   r  rZ   r  r   kwargsmodel_inputss          r.   prepare_inputs_for_generationz-RwkvForCausalLM.prepare_inputs_for_generation  s]     !!R%(2226I $+];L'3L %W$-[!rm   r  r  r  rZ   labelsr   r   r   r!  r"  c
           	      t   |	|	n| j                   j                  }	| j                  |||||||	      }|d   }| j                  |      }d}|* | j                  ||fd| j                   j
                  i|
}|	s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                        S )aJ  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        state (tuple of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        use_cache (`bool`, *optional*):
            If set to `True`, the last state is returned and can be used to quickly generate the next logits.
        N)r  rZ   r   r   r   r!  r   r   r   )r  r	  rZ   r  r  )
r   r(  r   rL  loss_functionr   r  rZ   r  r  )r   r  r  r  rZ   rV  r   r   r   r!  rS  rwkv_outputsr  r	  r  r_   s                   r.   rP   zRwkvForCausalLM.forward  s    J &1%<k$++B]B]yy'/!5# ! 
 %Q=)%4%%  ;;11 	D Yab!11F)-)9TGf$EvE!$$&44#..
 	
rm   )NNN)	NNNNNNNNN)rp   rq   rr   _tied_weights_keysr   rO  rQ  rU  r   r   rA   rG  r  r  rH  r   r   r  rP   r   r   s   @r.   rJ  rJ    s    (#"  15595937-1$(,0/3&*F
E,,-F
 !!1!12F
   1 12	F

 U../0F
 ))*F
 D>F
 $D>F
 'tnF
 d^F
 
u((	)F
 F
rm   rJ  )rJ  r  r   rn   )/r  r   dataclassesr   pathlibr   typingr   r   rA   torch.utils.checkpointr   
generationr	   modeling_layersr
   modeling_utilsr   utilsr   r   r   r   r   r   configuration_rwkvr   
get_loggerrp   r$   r"   r/   autogradFunctionr1   r   r   r   r   r   r   r   r   r  r  rJ  __all__rt   rm   r.   <module>rh     s}      !  "    ) 9 -  + 
		H	%  5@g
%..11 g
T)XbC5		 C5L#)bii #)L* D Q%/ Q% Q%h 

? 
? 
? 
? ? ?$ j6# j6 j6Z i
)? i
i
X Brm   