
    rh;                        d dl mZ ddlmZmZmZmZ  e       rd dlZd dlm	Z	 d dl
Z
d dlmZ d dlmZ  e       rd dlmZ  ej$                  e      Ze
j*                  dej,                  fd       Zd!d	ej0                  d
edeej0                  ej0                  f   fdZe
j*                  dej,                  dej,                  dej,                  dej,                  fd       Zej:                  fdej0                  dej0                  dej0                  dej0                  d
ee   dej>                  dej0                  fdZ ejB                  dej:                  fdej0                  dej0                  dej0                  dej0                  d
eeeef      dej>                  dej0                  fd       Z" G d de	jF                        Z$	 	 	 	 	 d"dZ%	 	 d#d Z&y)$    )Optional   )is_accelerate_availableis_torch_accelerator_availableis_torch_availableloggingN)
functional)init_empty_weights
BLOCK_SIZEc                    t        j                  d      }||z  t        j                  d|      z   }t        j                  | |z         j	                  t         j
                        }t        j                  t        j                  |            dz  }||z  }|j	                  |j                  j                        }t        j                  ||z   |       t        j                  ||z   |       y )Nr   axisg      |@)tl
program_idarangeloadtofloat32maxabsdtype
element_tystore)	x_ptry_ptrs_ptrr   pidoffsxsys	            |/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/integrations/finegrained_fp8.pyact_quant_kernelr#   $   s    
--Q
Cbii:66D
  ,A
rvvayE!A	AA	U[[##$AHHUT\1HHUS[!    r   
block_sizereturnc                 f     j                         sJ  j                  d   |z  dk(  sJ t        j                   t        j                        }  j
                  g  j                         d d  j                  d      |z  dt        j                  i} fd}t        |    |||       ||fS )Nr   r   r   c                 T    t        j                  j                         | d         fS )Nr   )tritoncdivnumel)metar   s    r"   gridzact_quant.<locals>.grid6   s"    AGGItL'9:<<r$   )r   )	is_contiguousshapetorch
empty_likefloat8_e4m3fn	new_emptysizer   r#   )r   r%   r!   r    r/   s   `    r"   	act_quantr7   0   s    ??772;#q(((%"5"56ARQVVXcr]RAFF2J*$<REMMRA= T1az:a4Kr$   BLOCK_SIZE_MBLOCK_SIZE_NBLOCK_SIZE_KGROUP_SIZE_Mc                    t        j                  d      }t        j                  ||      }t        j                  ||      }||z  }||z  }||z  }t        ||z
  |      }|||z  z   }||z  |z  } ||z  t        j                  d|      z   |z  }!| |z  t        j                  d|      z   |z  }"t        j                  d|      }#| |!dddf   |
z  |#dddf   |z  z   z   }$||#dddf   |z  |"dddf   |z  z   z   }%||!|z  z   }&|"|z  }'||'|z  z   }(t        j
                  ||ft         j                        })t        dt        j                  ||            D ]  }*t        j                  |$|#dddf   ||*|z  z
  k  d      }+t        j                  |%|#dddf   ||*|z  z
  k  d      },|*|z  }-|-|	z  }.t        j                  |&|.|z  z         }/t        j                  |(|.|z  z         }0|)t        j                  |+|,      |/dddf   z  |0dddf   z  z  })|$||z  z  }$|%||z  z  }% |j                  j                  t         j                  k(  r |)j                  t         j                        }1nf|j                  j                  t         j                  k(  r |)j                  t         j                        }1n|)j                  t         j                        }1||z  t        j                  d|      z   }2| |z  t        j                  d|      z   }3|||2dddf   z  z   ||3dddf   z  z   }4|2dddf   |k  |3dddf   |k  z  }5t        j                  |4|1|5       y)zTriton-accelerated function used to perform linear operations (dot
    product) on input tensors `A` and `B` with block-wise quantization, and
    store the result in output tensor `C`.
    r   r   Nr)   g        )maskother)r=   )r   r   r,   minr   zerosr   ranger   dotr   r   bfloat16r   float16r   )6ABCAsBsMNKgroup_ngroup_k	stride_am	stride_ak	stride_bk	stride_bn	stride_cm	stride_cnstride_As_mstride_As_kstride_Bs_kstride_Bs_nr8   r9   r:   r;   r   	num_pid_m	num_pid_nnum_pid_in_groupgroup_idfirst_pid_mgroup_size_mpid_mpid_noffs_amoffs_bnoffs_ka_ptrsb_ptrsAs_ptrsoffs_bsnBs_ptrsaccumulatorkabk_startoffs_ksa_sb_scoffs_cmoffs_cnc_ptrsc_masks6                                                         r"   _w8a8_block_fp8_matmulrv   >   sb   J --Q
C<(I<(I#i/&&H\)Ky;.=L3-.E##4E|#bii<&@@AEG|#bii<&@@AEGYYq,'F'!T'"Y.a91LLMF&D/I-a0@90LLMF7[((G'!H8k))G((L,7rzzJK1bgga./ +GGFa1q<7G3G!GsSGGF41q<7G3G!GsSl"W$ggg+ 556ggg+ 556rvva|c!T'l2Sq\AA,**,**+ 	wwR[[(NN2;;'	
		rzz	)NN2::&NN2::&l"RYYq,%??Gl"RYYq,%??GWQW---	GD!G<L0LLFag"wtQw'7!';<FHHVQV$r$   rE   rF   rH   rI   output_dtypec                    t        |      dk(  sJ |d   |d   }}| j                  d   |j                  d   k(  sJ | j                  dd |j                  dd k(  r| j                         sJ t        j                  | j                  d   |      |j                  d   k(  sJ | j                         | j                  d   z  |j                  dk(  r|j                         r|j                  dk(  sJ |j                  \  }t        j                  |      |j                  d   k(  sJ t        j                  ||      |j                  d   k(  sJ | j                  dd fz   }	| j                  |	|      }
d}|k  r!t        j                        }t        |d      }|}||z  dk(  sJ |}fd	}t        |   | ||
|||||| j                  d
      | j                  d      |j                  d      |j                  d      |
j                  d
      |
j                  d      |j                  d
      |j                  d      |j                  d      |j                  d      |||d       |
S )a  This function performs matrix multiplication with block-wise
    quantization.
    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
    The output is returned in the specified `output_dtype`.
    Args:
        A: The input tensor, e.g., activation.
        B: The input tensor, e.g., weight.
        As: The per-token-group quantization scale for `A`.
        Bs: The per-block quantization scale for `B`.
        block_size: The block size for per-block quantization. It should
        be 2-dim, e.g., [128, 128].
        output_dytpe: The dtype of the returned tensor.
    Returns:
        torch.Tensor: The result of matmul.
    r   r      r(   Nr)         c                 l    t        j                  | d         t        j                  | d         z  fS )Nr8   r9   )r+   r,   )METArJ   rK   s    r"   r/   z*w8a8_block_fp8_matmul_triton.<locals>.grid   s1    AtN34v{{1d>FZ7[[]]r$      )r8   r9   r:   r;   )lenr1   r0   r+   r,   r-   ndimr5   next_power_of_2r   rv   stride)rE   rF   rH   rI   r%   rw   block_nblock_krL   C_shaperG   r8   r:   r9   r/   rJ   rK   s                  @@r"   w8a8_block_fp8_matmul_tritonr      sG   . z?a!!}jmWG772;!''"+%%%773B<288CR=(Q__->>>;;qwwr{G,<<<		QWWR[ A66Q;1??,A==77DAq;;q'"bhhqk111;;q'"bhhqk111ggcrlaT!G	G<0AL<--a0<,L\!Q&&&L^ 4 			

									
		"
		"
		!
		!!!!16 Hr$   input_qweight_qinput_scaleweight_scalec                 ~   | j                   dk(  r| j                  nd| j                  d   | j                  d   f\  }}}|j                  d   }	| j                  d|      }
|j                  |j                  d   d      }|	|d   z  }||d   z  }t        j                  ||z  |	ft        j
                  | j                        }t        |      D ]  }||d   z  }||d   z   }t        |      D ]  }||d   z  }||d   z   }|
dd||f   }|||||f   }|dd||dz   f   }|||f   }t        j                  ||j                         t        j                  dt        j
                  | j                        ||      |z  }|dd||fxx   |z  cc<     |j                  |||	      }|j                  |      S )a  
    Performs blocked matrix multiplication with FP8 quantized matrices.

    Args:
        input_q: Quantized input tensor with 1x128 block quantization
        weight_q: Quantized weight tensor with 128x128 block quantization
        input_scale: Scaling factors for input blocks
        weight_scale: Scaling factors for weight blocks
        block_size: Tuple of (M, N) for weight block dimensions
        output_dtype: Desired output dtype
       ry   r   r(   r   deviceN)scale_ascale_b	out_dtype)r   r1   viewr2   r@   r   r   rA   
_scaled_mmttensorr   )r   r   r   r   r%   rw   
batch_sizeseq_len
hidden_dimout_featuresinput_reshapedinput_scale_reshapednum_weight_blocks_mnum_weight_blocks_noutputim_startm_endjn_startn_endinput_blockweight_blockcurr_input_scalecurr_weight_scaleblock_results                             r"   w8a8_block_fp8_matmul_compiler      s   ( 8?||q7HgmmqRYR_R_`aRbdkdqdqrsdtNu#J>>!$L \\"j1N&++K,=,=a,@"E&*Q-7$
15[[*w.=U]][b[i[ijF&' 5jm#*Q-'*+ 	5A*Q-'Gjm+E )GEM)9:K#GEM75=$@AL  4Aq1q5yLA ,QT 2    NN$!LL%--W-* ##  1gem#$4$/	5	5: [[Wl;F99\""r$   c                        e Zd Zej                  Z	 	 	 	 	 d	dedededee	eef      f fdZ
dej                  dej                  fdZ xZS )
	FP8Linearin_featuresr   biasr%   c                    t         
|   ||       || _        || _        t        j
                  j                  t	        j                  ||t        j                  |            | _
        | j                  j                         dk(  rb||d   z   dz
  |d   z  }||d   z   dz
  |d   z  }	t        j                  t	        j                  ||	t        j                  |            | _        n| j                  dd        || _        || _        |r8t        j                  t	        j                  | j                              | _        y | j                  dd        y )Nr   ry   r   weight_scale_invr   )super__init__r   r   r2   nn	Parameteremptyr   r   weightelement_sizer   r   register_parameterr%   activation_schemer   )selfr   r   r   r   r%   r   r   scale_out_featuresscale_in_features	__class__s             r"   r   zFP8Linear.__init__)  s     	l3&(hh((\;V_VeVent)uv;;##%*".A">"BzRS}!T!,z!}!<q!@ZPQ] R$&LL.0A_ef%D! ##$6=$!2U[[1B1B%CDDI##FD1r$   inputr&   c           	         | j                   j                         dkD  r+t        j                  || j                   | j                        S t               r(t        j                  j                         j                  nd}t        t        |t        j                        }|j                  |j                        5  t        || j                  d         \  }}t        || j                   || j                   | j                  |j"                        }d d d        |j%                          | j                  | j                  z   }j'                  |j"                        S # 1 sw Y   PxY w)Nry   cuda)rw   r)   )r   r   Flinearr   r   r2   acceleratorcurrent_acceleratortypegetattrr   r   r7   r%   r   r   r   synchronizer   )r   r   device_typetorch_accelerator_moduleqinputscaler   s          r"   forwardzFP8Linear.forwardK  s   ;;##%)88E4;;		:: KiJj%++??AFFpvK'.uk5::'N$)00> 	 )%1C D5KK))OO!&	 %002yy$$))+995;;9//	 	s   7AEE%)FNNNdynamic)__name__
__module____qualname__r2   r4   r   intboolr   tupler   Tensorr   __classcell__)r   s   @r"   r   r   &  ss    E 04# 2 2  2 	 2 U38_- 2D0U\\ 0ell 0r$   r   c                   	 |g }| j                         D ]<  \  }}|j                  |       t        |t        j                        r||xs g vrdj                  |      	t        	fd|xs g D              st               5  t        |j                  |j                  |j                  du|j                  j                  |j                  j                  |j                  |j                         | j"                  |<   d}ddd       t%        t'        |j)                                     dkD  rt+        ||||||      \  }}|j-                  d       ? | |fS # 1 sw Y   ZxY w)	z%Replace Linear layers with FP8Linear.N.c              3   &   K   | ]  }|v  
 y w)N ).0keycurrent_key_name_strs     r"   	<genexpr>z+_replace_with_fp8_linear.<locals>.<genexpr>u  s     ]ss22]s   )r   r   r   r   r   r   r%   Tr   )has_been_replacedr(   )named_childrenappend
isinstancer   Linearjoinanyr
   r   r   r   r   r   r   r   r   weight_block_size_modulesr   listchildren_replace_with_fp8_linearpop)
modeltp_planmodules_to_not_convertcurrent_key_namequantization_configr   namemodule_r   s
            @r"   r   r   d  sW    ,,. !f%fbii(T:P:VTV-W#&88,<#= ]?U?[Y[]]') 
-+4$*$6$6%+%8%8#[[4%}}33$mm11*=*O*O#6#H#H,ENN4( )-%
- tFOO%&'!+#;& #"3$ A  	R ;!> ###3
- 
-s   A>EE#	c                     |dgn|}|j                   |j                  |j                          t        t        |            }t	        | | j
                  ||      \  } }|st        j                  d       | S )z:Helper function to replace model layers with FP8 versions.lm_head)r   r   r   zYou are loading your model using fp8 but no linear modules were found in your model. Please double check your model architecture.)r   extendr   setr   _tp_planloggerwarning)r   r   r   r   s       r"   replace_with_fp8_linearr     s     -C,Ji[Pf11=%%&9&P&PQ!#&<"=>75/	 E <	

 Lr$   )rz   )NNNNF)NN)'typingr   utilsr   r   r   r   r2   torch.nnr   r+   triton.languagelanguager   r	   r   
accelerater
   
get_loggerr   r   jit	constexprr#   r   r   r   r7   rv   r   r   r   r   compiler   r   r   r   r   r   r$   r"   <module>r     s$     h h  (- 
		H	% bll  
 
3 
u||U\\?Y9Z 
 Q%4 ,,5Q%6 ,,7Q%8 ,,9Q%: ,,;Q% Q%t !&M||M||M 	M 		M
 S	M ++M \\Mb  -1 %>#\\>#ll># ># ,,	>#
 sCx)># ++># \\># >#B;0		 ;0@ +$`  r$   