
from typing import TYPE_CHECKING, Any, Optional

from .base import HfQuantizer


if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel

from ..utils import (
    is_accelerate_available,
    is_torch_available,
    is_triton_available,
    is_triton_kernels_availalble,
    logging,
)
from .quantizers_utils import get_module_from_name


if is_torch_available():
    import torch

logger = logging.get_logger(__name__)


class Mxfp4HfQuantizer(HfQuantizer):
    """
    MXFP4 quantization, relying on triton >= 3.4.0 and triton_kernels for the quantized expert matmuls.
    """

    requires_parameters_quantization = True
    requires_calibration = False

    required_packages = ["accelerate"]

    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)
        self.quantization_config = quantization_config

    def validate_environment(self, *args, **kwargs):
        if not is_torch_available():
            raise ImportError(
                "Using mxfp4 quantization requires torch. "
                "Please install the latest version of torch (`pip install --upgrade torch`)"
            )
        if not torch.cuda.is_available():
            raise RuntimeError("Using MXFP4 quantized models requires a GPU")

        if not is_accelerate_available():
            raise ImportError("Using mxfp4 requires Accelerate: `pip install accelerate`")

        if self.quantization_config.dequantize:
            return

        compute_capability = torch.cuda.get_device_capability()
        major, minor = compute_capability

        if not is_triton_available("3.4.0") or not is_triton_kernels_availalble():
            if self.pre_quantized and not self.quantization_config.dequantize:
                logger.warning_once(
                    "MXFP4 quantization requires triton >= 3.4.0 and triton_kernels installed, we will default to "
                    "dequantizing the model to bf16"
                )
                self.quantization_config.dequantize = True
                return
            else:
                raise ValueError("MXFP4 quantization requires triton >= 3.4.0 and triton_kernels installed")

        if major < 9:
            raise ValueError(
                "MXFP4 quantized models are only supported on GPUs with compute capability >= 9.0 (e.g. H100, or B100)"
            )

        device_map = kwargs.get("device_map", None)
        if device_map is None:
            logger.warning_once(
                "You have loaded an FP4 model on CPU and have a CUDA device available, make sure to set "
                "your model on a GPU device in order to run your model. To remove this warning, pass device_map = 'cuda'."
            )
        elif device_map is not None:
            if (
                not self.pre_quantized
                and isinstance(device_map, dict)
                and ("cpu" in device_map.values() or "disk" in device_map.values())
            ):
                raise ValueError(
                    "You are attempting to load an FP4 model with a device_map that contains a CPU or disk device. "
                    "This is not supported when the model is quantized on the fly. "
                    "Please use a quantized checkpoint or remove the CPU or disk device from the device_map."
                )

    def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
        if torch_dtype is None:
            torch_dtype = torch.bfloat16
            logger.info(
                "Overriding torch_dtype=%s with `torch_dtype=torch.bfloat16` to enable model loading in fp4. "
                "Pass your own torch_dtype to specify the dtype of the remaining non-linear layers, or pass "
                "torch_dtype=torch.bfloat16 to remove this warning.",
                torch_dtype,
            )
        return torch_dtype

    def check_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        state_dict: dict[str, Any],
        **kwargs,
    ) -> bool:
        from ..integrations import Mxfp4GptOssExperts
        from ..models.gpt_oss.modeling_gpt_oss import GptOssExperts

        if self.quantization_config.dequantize and ("blocks" in param_name or "scales" in param_name):
            # "_blocks" and "_scales" have the same length, so this strips either suffix
            module, tensor_name = get_module_from_name(model, param_name[: -len("_blocks")])
        else:
            module, tensor_name = get_module_from_name(model, param_name)
        if isinstance(module, Mxfp4GptOssExperts) or (
            isinstance(module, GptOssExperts) and self.quantization_config.dequantize
        ):
            # biases are kept in high precision, only the expert weights are quantized
            if tensor_name in ["down_proj_bias", "gate_up_proj_bias"]:
                return False
            return True
        return False

    def create_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        target_device: "torch.device",
        state_dict: dict[str, Any],
        unexpected_keys: Optional[list[str]] = None,
        **kwargs,
    ):
        if is_triton_kernels_availalble() and is_triton_available("3.4.0"):
            from triton_kernels.matmul_ogs import FlexCtx, InFlexData, PrecisionConfig

        from ..integrations import Mxfp4GptOssExperts, dequantize, load_and_swizzle_mxfp4, quantize_to_mxfp4
        from ..models.gpt_oss.modeling_gpt_oss import GptOssExperts

        if not self.pre_quantized:
            # Quantizing on the fly: pad the bf16 expert weights to the shape expected by the
            # triton kernels, then quantize them into MXFP4 blocks + scales.
            module, _ = get_module_from_name(model, param_name)
            with torch.cuda.device(target_device):
                if isinstance(module, Mxfp4GptOssExperts):
                    if "gate_up_proj" in param_name:
                        right_pad = module.gate_up_proj_right_pad
                        bottom_pad = module.gate_up_proj_bottom_pad
                        loaded_weight = torch.nn.functional.pad(
                            param_value, (0, right_pad, 0, bottom_pad, 0, 0), mode="constant", value=0
                        )
                        triton_weight_tensor, weight_scale = quantize_to_mxfp4(loaded_weight)
                        module.gate_up_proj_precision_config = PrecisionConfig(
                            weight_scale=weight_scale, flex_ctx=FlexCtx(rhs_data=InFlexData())
                        )
                        module.gate_up_proj = triton_weight_tensor
                        module.gate_up_proj_blocks = torch.nn.Parameter(
                            triton_weight_tensor.storage.data, requires_grad=False
                        )
                    elif "down_proj" in param_name:
                        right_pad = module.down_proj_right_pad
                        bottom_pad = module.down_proj_bottom_pad
                        loaded_weight = torch.nn.functional.pad(
                            param_value, (0, right_pad, 0, bottom_pad, 0, 0), mode="constant", value=0
                        ).to(target_device)
                        triton_weight_tensor, weight_scale = quantize_to_mxfp4(loaded_weight)
                        module.down_proj_precision_config = PrecisionConfig(
                            weight_scale=weight_scale, flex_ctx=FlexCtx(rhs_data=InFlexData())
                        )
                        module.down_proj = triton_weight_tensor
                        module.down_proj_blocks = torch.nn.Parameter(
                            triton_weight_tensor.storage.data, requires_grad=False
                        )
        else:
            # Pre-quantized checkpoint: either swizzle the stored blocks/scales into the triton
            # layout, or dequantize them back to bf16 when requested.
            empty_param = kwargs.get("empty_param", None)
            casting_dtype = kwargs.get("casting_dtype", None)
            to_contiguous = kwargs.get("to_contiguous", None)
            rank = kwargs.get("rank", None)
            device_mesh = kwargs.get("device_mesh", None)

            if ("blocks" in param_name or "scales" in param_name) and self.quantization_config.dequantize:
                module, _ = get_module_from_name(model, param_name[: -len("_blocks")])
            else:
                module, _ = get_module_from_name(model, param_name)

            shard_kwargs = {
                "empty_param": empty_param,
                "casting_dtype": casting_dtype,
                "to_contiguous": to_contiguous,
                "rank": rank,
                "device_mesh": device_mesh,
                "model": model,
            }
            if isinstance(module, Mxfp4GptOssExperts) or (
                isinstance(module, GptOssExperts) and self.quantization_config.dequantize
            ):
                if self.quantization_config.dequantize:
                    dq_param_name = param_name[: -len("_blocks")]
                    dequantize(module, param_name, param_value, target_device, dq_param_name, **shard_kwargs)
                else:
                    load_and_swizzle_mxfp4(module, param_name, param_value, target_device, **shard_kwargs)

    def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
        if self.quantization_config.dequantize:
            self.remove_quantization_config(model)
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def update_expected_keys(
        self, model: "PreTrainedModel", expected_keys: list[str], checkpoint_keys: list[str]
    ) -> list[str]:
        # A quantized checkpoint stores each expert weight as a pair of `_blocks`/`_scales` tensors
        new_expected_keys = []
        for key in expected_keys:
            if key.endswith(".mlp.experts.gate_up_proj"):
                base = key[: -len("gate_up_proj")]
                new_expected_keys.append(base + "gate_up_proj_blocks")
                new_expected_keys.append(base + "gate_up_proj_scales")
            elif key.endswith(".mlp.experts.down_proj"):
                base = key[: -len("down_proj")]
                new_expected_keys.append(base + "down_proj_blocks")
                new_expected_keys.append(base + "down_proj_scales")
            else:
                new_expected_keys.append(key)
        return new_expected_keys

    def _process_model_before_weight_loading(
        self,
        model: "PreTrainedModel",
        keep_in_fp32_modules: Optional[list[str]] = None,
        **kwargs,
    ):
        from ..integrations import replace_with_mxfp4_linear

        self.modules_to_not_convert = self.get_modules_to_not_convert(
            model, self.quantization_config.modules_to_not_convert, keep_in_fp32_modules
        )

        use_kernels = kwargs.get("use_kernels", False)
        if use_kernels:
            logger.warning_once(
                "You are using full precision kernels, we will dequantize the model to bf16. "
                "To use the quantized model with quantization kernels, please set use_kernels=False"
            )
            self.quantization_config.dequantize = True

        config = kwargs.get("config", None)
        model = replace_with_mxfp4_linear(
            model,
            modules_to_not_convert=self.modules_to_not_convert,
            quantization_config=self.quantization_config,
            config=config,
        )
        model.config.quantization_config = self.quantization_config

    def update_missing_keys(self, model: "PreTrainedModel", missing_keys: list[str], prefix: str) -> list[str]:
        from ..integrations import Mxfp4GptOssExperts

        not_missing_keys = []
        for name, module in model.named_modules():
            if isinstance(module, Mxfp4GptOssExperts):
                for missing in missing_keys:
                    if (
                        (name in missing or name in f"{prefix}.{missing}")
                        and not missing.endswith(".weight")
                        and not missing.endswith(".bias")
                    ):
                        not_missing_keys.append(missing)
        return [k for k in missing_keys if k not in not_missing_keys]

    def update_tp_plan(self, config):
        if "GptOssConfig" in config.__class__.__name__:
            if getattr(config, "base_model_tp_plan", None) is not None:
                config.base_model_tp_plan.update(
                    {
                        "layers.*.mlp.experts.gate_up_proj_blocks": "grouped_gemm",
                        "layers.*.mlp.experts.gate_up_proj_scales": "grouped_gemm",
                        "layers.*.mlp.experts.down_proj_blocks": "grouped_gemm",
                        "layers.*.mlp.experts.down_proj_scales": "grouped_gemm",
                    }
                )
        return config

    def update_param_name(self, param_name: str) -> str:
        # When dequantizing, the checkpoint's blocks/scales map onto plain weight names
        if self.quantization_config.dequantize:
            if "_blocks" in param_name:
                return param_name.replace("_blocks", "")
            elif "_scales" in param_name:
                return param_name.replace("_scales", "")
        return param_name

    def is_serializable(self, safe_serialization=None):
        logger.warning_once("MXFP4 quantization is not serializable using safetensors for now")
        return False

    @property
    def is_trainable(self) -> bool:
        logger.warning_once(
            "MXFP4 quantization doesn't support training, please consider dequantizing the model first by passing "
            "quantization_config=Mxfp4Config(dequantize=True) to .from_pretrained()"
        )
        return False
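

# A minimal usage sketch (illustrative, not part of this module's API): this quantizer is
# selected automatically by `from_pretrained` when a checkpoint's config declares MXFP4
# quantization, as the gpt-oss checkpoints do. The model id below is an example.
#
#     from transformers import AutoModelForCausalLM, Mxfp4Config
#
#     # Run with the MXFP4 triton kernels (requires a GPU with compute capability >= 9.0,
#     # triton >= 3.4.0 and triton_kernels):
#     model = AutoModelForCausalLM.from_pretrained("openai/gpt-oss-20b", device_map="cuda")
#
#     # Or dequantize to bf16 on load (no triton requirement, but higher memory use) —
#     # the fallback path handled by `Mxfp4HfQuantizer` above:
#     model = AutoModelForCausalLM.from_pretrained(
#         "openai/gpt-oss-20b",
#         quantization_config=Mxfp4Config(dequantize=True),
#         torch_dtype="bfloat16",
#     )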