
from __future__ import annotations

import math
import operator
import os
import re
from functools import partial, reduce

import torch
import torch.distributed as dist
from torch import nn

from ..distributed import DistributedConfig
from ..utils import is_torch_greater_or_equal, logging
from ..utils.generic import GeneralInterface


logger = logging.get_logger(__name__)

# torch.distributed is not compiled into every build, so guard all uses of it.
_torch_distributed_available = torch.distributed.is_available()

if is_torch_greater_or_equal("2.5") and _torch_distributed_available:
    from torch.distributed.tensor import DTensor, Placement, Replicate, Shard


def initialize_tensor_parallelism(tp_plan, tp_size=None):
    r"""
    Sets up the device mesh and initializes the backend for tensor parallelism.
    This function is called when the model is loaded and the TP plan is set to 'auto'.
    """
    if tp_plan is None:
        return None, None, None, None

    if not is_torch_greater_or_equal("2.5"):
        raise OSError("Tensor parallel is only supported for `torch>=2.5`.")

    # Detect the accelerator on the machine. If no accelerator is available, it returns CPU.
    device_type = torch._C._get_accelerator().type
    current_device = getattr(torch, device_type)
    if not torch.distributed.is_initialized():
        try:
            rank = int(os.environ["RANK"])
            local_rank = int(os.environ["LOCAL_RANK"])
            world_size = int(os.environ["WORLD_SIZE"])

            backend_map = {"cuda": "nccl", "cpu": "gloo", "xpu": "xccl", "hpu": "hccl"}
            backend = backend_map.get(device_type)
            if device_type == "cpu" and int(os.environ.get("CCL_WORKER_COUNT", 0)):
                backend = "ccl"
            if device_type == "xpu" and not is_torch_greater_or_equal("2.8", accept_dev=True):
                backend = "ccl"

            torch.distributed.init_process_group(backend=backend, rank=rank, world_size=world_size)
            if device_type != "cpu":
                current_device.set_device(local_rank)
        except Exception as e:
            raise OSError(
                "We tried to initialize torch.distributed for you, but it failed. Make sure you init torch distributed "
                "in your script to use `tp_plan='auto'`."
            ) from e

    if device_type != "cpu":
        current_device.set_device(int(os.environ["LOCAL_RANK"]))

    index = current_device.current_device() if device_type != "cpu" else None
    tp_device = torch.device(device_type, index)

    # Silence stdout/stderr on non-primary ranks so that logs are only printed once.
    if index is not None and index > 0:
        import sys

        sys.stdout = open(os.devnull, "w")
        sys.stderr = open(os.devnull, "w")

    device_map = tp_device
    tp_size = tp_size if tp_size is not None else torch.distributed.get_world_size()
    device_mesh = torch.distributed.init_device_mesh(tp_device.type, (tp_size,))
    return tp_device, device_map, device_mesh, tp_size


def _blocks_to_block_sizes(total_size: int, blocks: int | list[int]) -> list[int]:
    """
    Convert block count or proportions to block sizes.

    This function accepts

    - The number of blocks (int), in which case the block size is
      total_size//blocks; or
    - A list of block sizes (list[int]).

    In the second case, if sum(blocks) < total_size, the ratios between
    the block sizes will be preserved. For instance, if blocks is
    [2, 1, 1] and total_size is 1024, the returned block sizes are
    [512, 256, 256].
    """
    if isinstance(blocks, list):
        total_blocks = sum(blocks)
        assert total_size % total_blocks == 0, f"Cannot split {total_size} in proportional blocks: {blocks}"
        part_size = total_size // total_blocks
        return [part_size * block for block in blocks]
    else:
        assert total_size % blocks == 0, f"Prepacked is not divisible by {blocks}"
        single_size = total_size // blocks
        return [single_size] * blocks


def _get_parameter_tp_plan(parameter_name: str, tp_plan: dict[str, str], is_weight=True) -> str | None:
    """
    Get the TP style for a parameter from the TP plan.

    The TP plan is a dictionary that maps parameter names to TP styles.
    The parameter name can be a generic name with wildcards (e.g. "*.weight") or a specific name (e.g. "layer_1.weight").

    The `is_weight` is important because for weights, we want to support `.weight` and `.bias` cases seamlessly, but
    not parent classes for `post_init` calls.
    """
    generic_param_name = re.sub(r"\d+", "*", parameter_name)
    if generic_param_name in tp_plan:
        return tp_plan[generic_param_name]
    elif "." in generic_param_name and generic_param_name.rsplit(".", 1)[0] in tp_plan and is_weight:
        return tp_plan[generic_param_name.rsplit(".", 1)[0]]
    return None


str_to_torch_dtype = {
    "BOOL": torch.bool,
    "U8": torch.uint8,
    "I8": torch.int8,
    "I16": torch.int16,
    "F16": torch.float16,
    "BF16": torch.bfloat16,
    "I32": torch.int32,
    "F32": torch.float32,
    "F64": torch.float64,
    "I64": torch.int64,
    "F8_E4M3": torch.float8_e4m3fn,
}


def get_packed_weights(param, empty_param, device_mesh, rank, dim):
    """
    When weights are packed (gate_up_proj), we need to make sure each shard gets its correct share.
    So if you have: gate_proj       ( 16, 5120, 8190)
    and             up_proj         ( 16, 5120, 8190)
    packed as       gate_up_proj    ( 16, 5120, 2 * 8190)
    And you shard along the last dimension, you need to interleave the gate and up values:

    Now, if we shard along the last dimension across TP_size (Tensor Parallelism size), we must interleave the values
    from gate and up projections correctly.

    Let's take TP_size = 4 for an example:

    Packed tensor `gate_up_proj`
    ---------------------------------------------------------------
    [ G0  G1  G2  G3 | G4  G5  G6  G7 | ... | U0  U1  U2  U3 | U4  U5  U6  U7 | ... ]
       Gate Slice 0     Gate Slice 1            Up Slice 0       Up Slice 1

    Explanation:
    - The first half of the tensor (left of the center) holds the gate_proj values.
    - The second half (right of the center) holds the up_proj values.
    - For TP=4, we divide each half into 4 slices. In this example, we show two slices for brevity.
    - Each shard receives one slice from the gate part and the corresponding slice from the up part.

    For instance:
    - Shard 0 gets: [ Gate Slice 0, Up Slice 0 ] = [ G0, G1, G2, G3, U0, U1, U2, U3 ]
    - Shard 1 gets: [ Gate Slice 1, Up Slice 1 ] = [ G4, G5, G6, G7, U4, U5, U6, U7 ]
    - ... and so on.

    This ensures that each shard receives an equal portion of both gate and up projections, maintaining consistency
    across tensor parallelism.
    """
    slice_ = param
    total_size = empty_param.shape[dim]
    world_size = device_mesh.size()
    block_sizes = _blocks_to_block_sizes(total_size=total_size, blocks=2)

    tensors_slices = []
    block_offset = 0
    for block_size in block_sizes:
        shard_block_size = block_size // world_size
        start = rank * shard_block_size
        stop = (rank + 1) * shard_block_size
        tensors_slices += range(block_offset + start, block_offset + stop)
        block_offset += block_size

    # `param` is usually a lazy checkpoint slice, hence `get_dtype()` and index-based materialization below.
    slice_dtype = slice_.get_dtype()
    # fp8 dtypes do not support fancy indexing on CPU, so upcast to float16 before slicing.
    casted = False
    if slice_dtype == "F8_E4M3" or slice_dtype == "F8_E5M2":
        slice_ = slice_[...].to(torch.float16)
        casted = True

    if dim == 0:
        tensor = slice_[tensors_slices, ...]
    elif dim == 1 or dim == -2:
        tensor = slice_[:, tensors_slices, ...]
    elif dim == 2 or dim == -1:
        tensor = slice_[..., tensors_slices]
    else:
        raise ValueError(f"Unsupported dim {dim}, only dim 0, 1 or 2 are supported")

    if casted:
        return tensor
    return tensor.to(str_to_torch_dtype[slice_dtype])


def repack_weights(
    packed_parameter: torch.Tensor,
    sharded_dim: int,
    world_size: int,
    num_blocks: int = 2,
) -> torch.Tensor:
    """
    Reorders a tensor that was reconstructed from sharded packed weights into its canonical packed format.

    For example, if a weight was packed (e.g., gate_proj and up_proj) and then sharded,
    DTensor.full_tensor() might produce an interleaved layout like [G0, U0, G1, U1, ...]
    along the sharded dimension. This function reorders it to [G0, G1, ..., U0, U1, ...].
    This is an inverse operation to get_packed_weights.

    Args:
        packed_parameter: The tensor reconstructed from DTensor (e.g., via .full_tensor().contiguous()).
        sharded_dim: The dimension index in the reconstructed tensor that was originally sharded.
        world_size: The tensor parallel world size.
        num_blocks: The number of projections that were packed together (e.g., 2 for gate_up_proj).

    Returns:
        The reordered tensor in canonical packed format.
    """
    if num_blocks != 2:
        raise ValueError(
            "Num blocks different from 2 is not supported yet. This is most likely a bug in your implementation "
            "as we only pack gate and up projections together."
        )

    actual_sharded_dim = sharded_dim if sharded_dim >= 0 else sharded_dim + packed_parameter.ndim
    total_size_on_sharded_dim = packed_parameter.shape[actual_sharded_dim]
    original_block_size_on_dim = total_size_on_sharded_dim // num_blocks
    shard_chunk_size = original_block_size_on_dim // world_size

    prefix_shape = packed_parameter.shape[:actual_sharded_dim]
    suffix_shape = packed_parameter.shape[actual_sharded_dim + 1 :]

    # Expose the interleaved [world_size, num_blocks, shard_chunk_size] structure on the sharded dimension.
    tensor_view = packed_parameter.view(
        *prefix_shape,
        world_size,
        num_blocks,
        shard_chunk_size,
        *suffix_shape,
    )

    # Swap the world_size and num_blocks axes so that all slices of a given projection become contiguous again.
    axis_ws_abs = len(prefix_shape)
    axis_npp_abs = len(prefix_shape) + 1
    permute_order = list(range(tensor_view.ndim))
    permute_order[axis_ws_abs], permute_order[axis_npp_abs] = permute_order[axis_npp_abs], permute_order[axis_ws_abs]
    tensor_permuted = tensor_view.permute(*permute_order)

    # Collapse back to the original shape, now in canonical packed order.
    final_ordered_tensor = tensor_permuted.reshape_as(packed_parameter)
    return final_ordered_tensor


def get_tensor_shard(param, empty_param, device_mesh, rank, dim):
    """
    Generalized tensor sharding across a multi-dimensional device mesh.
    Extract only the fraction of the parameter owned by the given `rank` when the parameter would have gone sharding
    at provided `dim`. Extraction follows the pytorch `Shard` placement so that sharding and materializing back to
    full tensor follows `Shard` semantics.
    `Shard` follows torch.chunk style sharding of the tensor. We demonstrate some cases below on how sharding happens
    including some edge cases such as some ranks having an empty tensor as shard. Below implementation is robust to
    all these cases.

    Case (1)
    empty_param                 (16, 5120, 8190)
    dim                         0
    device_mesh.size()          4
    rank 0 gets  (4, 5120, 8190)    (rows  0 ...  4)
    rank 1 gets  (4, 5120, 8190)    (rows  4 ...  8)
    rank 2 gets  (4, 5120, 8190)    (rows  8 ... 12)
    rank 3 gets  (4, 5120, 8190)    (rows 12 ... 16)

    Case (2)
    empty_param                 (16, 5120, 8190)
    dim                         0
    device_mesh.size()          14
    rank 0 gets  (2, 5120, 8190)    (rows  0 ...  2)
    rank 1 gets  (2, 5120, 8190)    (rows  2 ...  4)
    rank 2 gets  (2, 5120, 8190)    (rows  4 ...  6)
    rank 3 gets  (2, 5120, 8190)    (rows  6 ...  8)
    rank 4 gets  (2, 5120, 8190)    (rows  8 ... 10)
    rank 5 gets  (2, 5120, 8190)    (rows 10 ... 12)
    rank 6 gets  (2, 5120, 8190)    (rows 12 ... 14)
    rank 7 gets  (2, 5120, 8190)    (rows 14 ... 16)
    ranks 8 to 13 get an empty (0, 5120, 8190) shard

    Case (3)
    empty_param                 (16, 5120, 8190)
    dim                         0
    device_mesh.size()          3
    rank 0 gets  (6, 5120, 8190)    (rows  0 ...  6)
    rank 1 gets  (6, 5120, 8190)    (rows  6 ... 12)
    rank 2 gets  (4, 5120, 8190)    (rows 12 ... 16)

    In case (2), empty shards are returned with appropriate dimension to allow for operations to work smoothly.

    Args:
        param (torch.Tensor): The tensor to shard.
        empty_param (torch.Tensor): A tensor used for shape reference.
        device_mesh (torch.Tensor): Shape [d_0, ..., d_n] representing the mesh.
        rank (int): Global rank of the current process/device.
        dim (int): Dimension along which to shard the tensor.
    """
    param_dim = empty_param.dim()
    if dim < 0:
        dim = param_dim + dim
    if dim >= param_dim:
        raise ValueError(f"dim {dim} is out of bounds for tensor of dimension {param_dim}")

    mesh_shape = device_mesh.shape
    world_size = reduce(operator.mul, mesh_shape)
    if rank >= world_size:
        raise ValueError(f"Rank {rank} is out of bounds for mesh size {world_size}")

    shard_size = math.ceil(empty_param.shape[dim] / world_size)
    start = rank * shard_size
    end = min(start + shard_size, empty_param.shape[dim])

    # Construct the slicing index dynamically.
    slice_indices = [slice(None)] * param_dim
    if start < empty_param.shape[dim]:
        slice_indices[dim] = slice(start, end)
        return param[tuple(slice_indices)]
    # This rank owns nothing: return an empty shard with the right number of dimensions.
    dimensions = list(param.shape)
    dimensions[dim] = 0
    return torch.empty(tuple(dimensions), dtype=torch.int64)


def distribute_module(
    module: nn.Module,
    device_mesh=None,
    input_fn=None,
    output_fn=None,
) -> nn.Module:
    """
    Copy pasted from torch's function but we remove the communications (partitioning)
    as well as buffer registering that is similarly not efficient.
    """
    if len(module._forward_pre_hooks) == 0:
        if input_fn is not None:
            module.register_forward_pre_hook(lambda mod, inputs: input_fn(mod, inputs, device_mesh))
        if output_fn is not None:
            module.register_forward_hook(lambda mod, inputs, outputs: output_fn(mod, outputs, device_mesh))
    return module


class TensorParallelLayer:
    """
    General tensor parallel layer for transformers.
    """

    use_dtensor = True

    @staticmethod
    def _prepare_input_fn(input_layouts, desired_input_layouts, mod, inputs, device_mesh): ...

    @staticmethod
    def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh): ...

    def partition_tensor(self, param, empty_param, param_type, param_casting_dtype, to_contiguous, rank, device_mesh):
        raise NotImplementedError

    def prepare_module_tp(self, module: nn.Module, device_mesh) -> nn.Module:
        if self.use_dtensor:
            distribute_module(
                module,
                device_mesh,
                partial(self._prepare_input_fn, self.input_layouts, self.desired_input_layouts),
                partial(self._prepare_output_fn, self.output_layouts, self.use_local_output),
            )


class GatherParallel(TensorParallelLayer):
    """
    Simple class used to define the hooks to add to a layer when we just want to gather the outputs
    """

    def __init__(
        self,
        *,
        input_layouts: Placement | None = None,
        output_layouts: Placement | None = None,
        use_local_output: bool = True,
    ):
        super().__init__()
        self.input_layouts = (input_layouts or Replicate(),)
        self.output_layouts = output_layouts
        self.desired_input_layouts = (Replicate(),)
        self.use_local_output = use_local_output

    @staticmethod
    def _prepare_input_fn(input_layouts, desired_input_layouts, mod, inputs, device_mesh):
        mod.expert_parallel_group = device_mesh.get_group()
        if inputs and isinstance(inputs[0], DTensor):
            inputs = inputs[0].to_local()
        return inputs

    @staticmethod
    def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
        # This all-reduce cannot be async, otherwise the partial outputs would leak into the rest of the model.
        if isinstance(outputs, torch.Tensor):
            dist.all_reduce(outputs, op=dist.ReduceOp.SUM, async_op=False)
        else:
            dist.all_reduce(outputs[0], op=dist.ReduceOp.SUM, async_op=False)
        return outputs

    def prepare_module_tp(self, module: nn.Module, device_mesh) -> nn.Module:
        distribute_module(
            module,
            device_mesh,
            partial(self._prepare_input_fn, None, None),
            partial(self._prepare_output_fn, None, None),
        )


class IsolatedParallel(TensorParallelLayer):
    """
    This class is used to isolate computation in a TP layer from the rest of the world.
    Parameters need to be LOCAL, so not dtensors
    """

    @staticmethod
    def _prepare_input_fn(input_layouts, desired_input_layouts, mod, inputs, device_mesh=None):
        input_tensor = inputs[0]
        if isinstance(input_tensor, DTensor):
            input_tensor = input_tensor.to_local()
        return input_tensor

    @staticmethod
    def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh=None):
        return outputs

    def partition_tensor(self, param, empty_param, param_type, param_casting_dtype, to_contiguous, rank, device_mesh):
        param = param[...].to(param_casting_dtype)
        if to_contiguous:
            param = param.contiguous()
        param = param / device_mesh.size()
        return param

    def prepare_module_tp(self, module: nn.Module, device_mesh) -> nn.Module:
        distribute_module(
            module,
            device_mesh,
            partial(self._prepare_input_fn, None, None),
            partial(self._prepare_output_fn, None, None),
        )


class ReplicateParallel(TensorParallelLayer):
    """
    This class is used to replicate computation in a TP layer (used in SP regions when we don't use sequence
    parallelism for example)
    """

    def __init__(self, *, use_dtensor=True, use_local_output=True):
        super().__init__()
        self.input_layouts = (Replicate(),)
        self.output_layouts = (Replicate(),)
        self.desired_input_layouts = (Replicate(),)
        self.use_local_output = use_local_output
        self.use_dtensor = use_dtensor

    @staticmethod
    def _prepare_input_fn(input_layouts, desired_input_layouts, mod, inputs, device_mesh):
        input_tensor = inputs[0]
        if not isinstance(input_tensor, DTensor):
            input_tensor = DTensor.from_local(input_tensor, device_mesh, input_layouts, run_check=False)
        return input_tensor

    @staticmethod
    def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
        return outputs.to_local() if use_local_output and isinstance(outputs, DTensor) else outputs

    def partition_tensor(self, param, empty_param, param_type, param_casting_dtype, to_contiguous, rank, device_mesh):
        param = param[...].to(param_casting_dtype)
        if to_contiguous:
            param = param.contiguous()
        param = DTensor.from_local(param, device_mesh, [Replicate()], run_check=False)
        return param


class ColwiseParallel(TensorParallelLayer):
    """
    Partition a compatible nn.Module in a column-wise fashion: the weight is sharded on its output features, so the
    module output is sharded on its last dimension.
    """

    def __init__(
        self,
        *,
        input_layouts: Placement | None = None,
        output_layouts: Placement | None = None,
        use_local_output: bool = True,
        use_dtensor=True,
    ):
        super().__init__()
        self.input_layouts = (input_layouts or Replicate(),)
        self.output_layouts = (output_layouts or Shard(-1),)
        self.desired_input_layouts = (Replicate(),)
        self.use_local_output = use_local_output
        self.use_dtensor = use_dtensor

    @staticmethod
    def _prepare_input_fn(input_layouts, desired_input_layouts, mod, inputs, device_mesh):
        # annotate the module input placements/sharding with input_layouts
        input_tensor = inputs[0]
        if not isinstance(input_tensor, DTensor):
            input_tensor = DTensor.from_local(input_tensor, device_mesh, input_layouts, run_check=False)
        # transform the input layouts to the desired layouts of ColwiseParallel
        if input_layouts != desired_input_layouts:
            input_tensor = input_tensor.redistribute(placements=desired_input_layouts, async_op=False)
        return input_tensor

    def partition_tensor(self, param, empty_param, param_type, param_casting_dtype, to_contiguous, rank, device_mesh):
        # Colwise sharding of nn.Linear (y = x W^T + b) means the weight is sharded on dim -2 and the bias on dim -1.
        if param_type == "bias":
            parameter = get_tensor_shard(param, empty_param, device_mesh, rank, -1)
            shard = [Shard(-1)]
        else:
            shard = [Shard(-2)]
            parameter = get_tensor_shard(param, empty_param, device_mesh, rank, -2)

        parameter = parameter.to(param_casting_dtype)
        if to_contiguous:
            parameter = parameter.contiguous()
        if self.use_dtensor:
            parameter = DTensor.from_local(
                parameter, device_mesh, shard, run_check=False, shape=empty_param.size(), stride=empty_param.stride()
            )
        return nn.Parameter(parameter, requires_grad=parameter.is_floating_point())

    @staticmethod
    def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
        # outputs is a DTensor sharded on the last dimension, i.e. Shard(-1)
        if outputs.placements != output_layouts:
            outputs = outputs.redistribute(placements=output_layouts, async_op=False)
        # back to local tensor if requested
        return outputs.to_local() if use_local_output and isinstance(outputs, DTensor) else outputs


class PackedColwiseParallel(ColwiseParallel):
    def partition_tensor(self, param, empty_param, param_type, param_casting_dtype, to_contiguous, rank, device_mesh):
        parameter = get_packed_weights(param, empty_param, device_mesh, rank, -2)
        parameter = parameter.to(param_casting_dtype)
        if to_contiguous:
            parameter = parameter.contiguous()
        if self.use_dtensor:
            parameter = DTensor.from_local(parameter, device_mesh, [Shard(-2)], run_check=False)
        return nn.Parameter(parameter, requires_grad=parameter.is_floating_point())


class RowwiseParallel(TensorParallelLayer):
    """
    Partition a compatible nn.Module in a row-wise fashion. Currently supports nn.Linear and nn.Embedding.
    Users can compose it with ColwiseParallel to achieve the sharding of more complicated modules.
    (i.e. MLP, Attention)

    Keyword Args:
        input_layouts (Placement, optional):
            The DTensor layout of input tensor for the nn.Module, this is used to annotate the input tensor to
            become a DTensor. If not specified, we assume the input tensor to be sharded on the last dimension.
        output_layouts (Placement, optional):
            The DTensor layout of the output for the nn.Module, this is used to ensure the output of the nn.Module
            with the user desired layout. If not specified, the output tensor is replicated.
        use_local_output (bool, optional):
            Whether to use local :class:`torch.Tensor` instead of :class:`DTensor` for the module output, default: True.
    Returns:
        A :class:`ParallelStyle` object that represents Rowwise sharding of the nn.Module.
    """

    def __init__(
        self,
        *,
        input_layouts: Placement | None = None,
        output_layouts: Placement | None = None,
        use_local_output: bool = True,
        use_dtensor=True,
    ):
        super().__init__()
        self.input_layouts = (input_layouts or Shard(-1),)
        self.output_layouts = (output_layouts or Replicate(),)
        self.use_local_output = use_local_output
        self.use_dtensor = use_dtensor

    def partition_tensor(self, param, empty_param, param_type, param_casting_dtype, to_contiguous, rank, device_mesh):
        # Rowwise sharding of nn.Linear (y = x W^T + b) means the weight is sharded on dim -1 and the bias replicated.
        if param_type != "bias":
            parameter = get_tensor_shard(param, empty_param, device_mesh, rank, -1)
            shard = [Shard(-1)]
        else:
            shard = [Replicate()]
            parameter = param[:]

        parameter = parameter.to(param_casting_dtype)
        if to_contiguous:
            parameter = parameter.contiguous()
        if self.use_dtensor:
            parameter = DTensor.from_local(
                parameter, device_mesh, shard, run_check=False, shape=empty_param.size(), stride=empty_param.stride()
            )
        return nn.Parameter(parameter, requires_grad=parameter.is_floating_point())

    @staticmethod
    def _prepare_input_fn(input_layouts, desired_input_layouts, mod, inputs, device_mesh):
        # The bias is added back after the partial outputs are reduced, so it is detached from the module here.
        if hasattr(mod, "bias") and mod.bias is not None:
            mod._bias = mod.bias
            mod.bias = None

        input_tensor = inputs[0]
        if not isinstance(input_tensor, DTensor):
            input_tensor = DTensor.from_local(input_tensor, device_mesh, input_layouts, run_check=False)
        if input_layouts != desired_input_layouts:
            input_tensor = input_tensor.redistribute(placements=desired_input_layouts, async_op=True)
        return input_tensor

    @staticmethod
    def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
        # Rowwise sharding produces a partial output; redistributing to Replicate() triggers the all-reduce.
        if outputs.placements != output_layouts:
            outputs = outputs.redistribute(placements=output_layouts, async_op=True)
        outputs = outputs.to_local()
        if hasattr(mod, "_bias"):
            outputs += mod._bias
        return outputs

    def prepare_module_tp(self, module: nn.Module, device_mesh) -> nn.Module:
        module._distribute_module_applied = True
        if self.use_dtensor:
            if isinstance(module, nn.Linear):
                # rowwise linear runtime sharding requires the input tensor to be sharded on the last dim
                self.desired_input_layouts: tuple[Placement, ...] = (Shard(-1),)
            elif isinstance(module, nn.Embedding):
                # rowwise embedding runtime sharding requires the input tensor to be replicated
                self.desired_input_layouts = (Replicate(),)
            elif isinstance(module, nn.Parameter):
                # rowwise parameter runtime sharding requires the input tensor to be sharded on the last dim
                self.desired_input_layouts = (Shard(-1),)
            else:
                raise NotImplementedError("RowwiseParallel currently only support nn.Linear and nn.Embedding!")

            distribute_module(
                module,
                device_mesh,
                partial(self._prepare_input_fn, self.input_layouts, self.desired_input_layouts),
                partial(self._prepare_output_fn, self.output_layouts, self.use_local_output),
            )


class PackedRowwiseParallel(RowwiseParallel):
    def partition_tensor(self, param, empty_param, param_type, param_casting_dtype, to_contiguous, rank, device_mesh):
        parameter = get_packed_weights(param, empty_param, device_mesh, rank, -1)
        parameter = parameter.to(param_casting_dtype)
        if to_contiguous:
            parameter = parameter.contiguous()
        if self.use_dtensor:
            parameter = DTensor.from_local(parameter, device_mesh, [Shard(-1)], run_check=False)
        return nn.Parameter(parameter, requires_grad=parameter.is_floating_point())


class SequenceParallel(TensorParallelLayer):
    """
    SequenceParallel replicates a compatible ``nn.Module`` parameters and runs the sharded computation with
    input sharded on the sequence dimension. This currently supports ``nn.LayerNorm``, ``nn.Dropout``, and the
    `RMSNorm python implementation <https://github.com/facebookresearch/llama/blob/main/llama/model.py#L34>`__

    This style implements the operation that is described in the paper
    `Reducing Activation Recomputation in Large Transformer Models <https://huggingface.co/papers/2205.05198>`__

    If the input passed in to this ``nn.Module`` is a :class:`torch.Tensor`, it assumes that the input is already
    sharded on the sequence dimension and converts the input to a :class:`DTensor` sharded on the sequence dimension.
    If the input passed in to this ``nn.Module`` is already a :class:`DTensor` but is not sharded on the sequence
    dimension, it would redistribute the input to be sharded on the sequence dimension.

    The output of the ``nn.Module`` will be sharded on the sequence dimension.

    Keyword Args:
        sequence_dim (int, optional):
            The sequence dimension of the input tensor for the ``nn.Module``, this is used to annotate the input
            tensor to become a DTensor that is sharded on the sequence dimension, default: 1.
        use_local_output (bool, optional):
            Whether to use local :class:`torch.Tensor` instead of :class:`DTensor` for the module output, default: False.
    Returns:
        A :class:`ParallelStyle` object that represents Sequence Parallel of the ``nn.Module``.

    Example::
        >>> # xdoctest: +SKIP(failing)
        >>> from torch.distributed.tensor.parallel import parallelize_module, SequenceParallel
        >>> from torch.distributed.device_mesh import init_device_mesh
        >>> ...
        >>> m = Model(...)  # m is a nn.Module that contains a "norm" nn.LayerNorm submodule
        >>> tp_mesh = init_device_mesh("cuda", (8,))
        >>>
        >>> # By default, the input of the "norm" will be converted to DTensor that shards on the sequence dim
        >>> # and the output of "norm" will return a sharded on sequence dimension :class:`DTensor`.
        >>>
        >>> sharded_mod = parallelize_module(m, tp_mesh, {"norm": SequenceParallel()}),
        >>> ...

    .. note:: SequenceParallel style assumes ones initialization if there are weights in the nn.Module (i.e.
        ``nn.LayerNorm`` or ``RMSNorm``, and they by default have ones initialization). If you have custom
        inits for the weights on those modules, you need to broadcast the weights before/after parallelizing
        to ensure that they are replicated.
    """

    def __init__(self, *, sequence_dim: int = 1, use_local_output: bool = False):
        super().__init__()
        self.input_layouts = (Replicate(),)
        self.desired_input_layouts = (Shard(1),)
        self.output_layouts = (Replicate(),)
        self.use_local_output = use_local_output
        self.use_dtensor = True
        self.sequence_sharding = (Shard(sequence_dim),)

    @staticmethod
    def _prepare_input_fn(input_layouts, desired_input_layouts, mod, inputs, device_mesh):
        input_tensor = inputs[0]
        if not isinstance(input_tensor, DTensor):
            input_tensor = DTensor.from_local(input_tensor, device_mesh, input_layouts, run_check=False)
        if input_layouts != desired_input_layouts:
            input_tensor = input_tensor.redistribute(placements=desired_input_layouts, async_op=True)
        return input_tensor

    @staticmethod
    def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
        outputs = outputs.redistribute(placements=(Replicate(),), async_op=True)
        return outputs.to_local()

    def partition_tensor(self, param, empty_param, param_type, param_casting_dtype, to_contiguous, rank, device_mesh):
        parameter = param[...]
        parameter = parameter.to(param_casting_dtype)
        if to_contiguous:
            parameter = parameter.contiguous()
        if self.use_dtensor:
            parameter = DTensor.from_local(parameter, device_mesh, [Replicate()], run_check=False)
        return nn.Parameter(parameter, requires_grad=parameter.is_floating_point())


class GroupedGemmParallel(TensorParallelLayer):
    """
    Applies Expert Parallelism to MoE experts by loading the correct experts on each device.
    """

    def __init__(self):
        super().__init__()
        self.use_dtensor = False

    def partition_tensor(self, param, empty_param, param_type, param_casting_dtype, to_contiguous, rank, device_mesh):
        ep_rank = rank
        global_num_experts = empty_param.shape[0]
        if global_num_experts % device_mesh.size() != 0:
            raise ValueError(
                f"Global number of experts must be divisible by number of devices: "
                f"{global_num_experts} % {device_mesh.size()} != 0"
            )
        local_num_experts = global_num_experts // device_mesh.size()
        param = param[ep_rank * local_num_experts : (ep_rank + 1) * local_num_experts].to(param_casting_dtype)
        if to_contiguous:
            param = param.contiguous()
        return param


class RouterParallel(TensorParallelLayer):
    """
    Allows to reshape the router scores to support running expert parallel.
    """

    def __init__(self, *args, **kwargs):
        self.args = args
        self.kwargs = kwargs
        self.use_dtensor = False

    @staticmethod
    def _prepare_input_fn(input_layouts, desired_input_layouts, mod, inputs, device_mesh):
        input_tensor = inputs[0]
        if isinstance(input_tensor, DTensor):
            raise NotImplementedError("RouterParallel does not support DTensor input for now")
        return input_tensor

    @staticmethod
    def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
        """
        Imagine if you had 4 tokens, top_k = 4, and 128 experts.
        With EP = 8.
        Imagine router_indices being:
        [ 52,  42, 119,  67],
        [102,  89,  61,  40],
        [ 82, 103,   4,  34],
        [ 93,  23, 109,  11],

        then you can map which rank should be getting which values

        [3, 2, 7, 4],
        [6, 5, 3, 2],
        [5, 6, 0, 2],
        [5, 1, 6, 0],

        Thus for say rank 0, you fill with 0 the index tensor

        [ 0, 0, 0, 0],
        [ 0, 0, 0, 0],
        [ 0, 0, 4, 0],
        [ 0, 0, 0, 11],

        This works well. For another rank you need to make sure you round to num_local_experts
        because the next operation will one hot encode the router index vector.

        This allows us to know directly which local expert is hit.
        Similarly the scores are indexed with something created from
        router_indices.

        The kinda naive training loop that we use for device_map "auto" uses a similar logic.
        Here we are just making each rank believe that he is alone, and he computes his part of the hidden states.
        """
        ep_rank, ep_size = device_mesh.get_local_rank(), device_mesh.size()
        num_local_experts = mod.num_experts // ep_size
        router_scores, router_indices = outputs
        router_scores = router_scores[:, ep_rank * num_local_experts : (ep_rank + 1) * num_local_experts]
        router_indices = router_indices.masked_fill(router_indices // num_local_experts != ep_rank, 0)
        router_indices = router_indices % num_local_experts
        return router_scores, router_indices

    def partition_tensor(self, param, empty_param, param_type, param_casting_dtype, to_contiguous, rank, device_mesh):
        param = param[...].to(param_casting_dtype)
        if to_contiguous:
            param = param.contiguous()
        return param

    def prepare_module_tp(self, module: nn.Module, device_mesh) -> nn.Module:
        distribute_module(
            module,
            device_mesh,
            partial(self._prepare_input_fn, None, None),
            partial(self._prepare_output_fn, None, None),
        )


class ParallelInterface(GeneralInterface):
    # The mapping is only populated when DTensor support is available; otherwise it stays empty so that
    # importing this module never fails on builds without torch.distributed.
    _global_mapping = (
        {
            "colwise": ColwiseParallel(),
            "rowwise": RowwiseParallel(),
            "colwise_rep": ColwiseParallel(output_layouts=Replicate()),
            "rowwise_rep": RowwiseParallel(input_layouts=Replicate()),
            "local_colwise": ColwiseParallel(use_dtensor=False),
            "local_rowwise": RowwiseParallel(use_dtensor=False),
            "local": IsolatedParallel(),
            "gather": GatherParallel(),
            "local_packed_rowwise": PackedRowwiseParallel(use_dtensor=False),
            "sequence_parallel": SequenceParallel(),
            "replicate": ReplicateParallel(),
            "grouped_gemm": GroupedGemmParallel(),
            "ep_router": RouterParallel(),
        }
        if is_torch_greater_or_equal("2.5") and _torch_distributed_available
        else {}
    )


ALL_PARALLEL_STYLES: ParallelInterface = ParallelInterface()


def convert_local_tensor_to_dtensor(
    parameter: torch.Tensor, parameter_name: str, device_mesh, tp_plan: dict[str, str]
) -> DTensor:
    """
    Converts a local variant of weights to a DTensor with corresponding placements. Shouldn't be done ever except
    before saving the model.
    """
    param_type = parameter_name.rsplit(".", 1)[1] if "." in parameter_name else parameter_name
    tp_style = _get_parameter_tp_plan(parameter_name, tp_plan)
    if not tp_style:
        return parameter
    if tp_style not in ["local_packed_rowwise", "local_rowwise", "local_colwise"]:
        return parameter

    # Mirror the placements used by the corresponding `local_*` layer classes above.
    if tp_style == "local_packed_rowwise":
        placements = [Shard(-1)]
    elif tp_style == "local_rowwise":
        if param_type == "bias":
            placements = [Replicate()]
        else:
            placements = [Shard(-1)]
    elif tp_style == "local_colwise":
        if param_type == "bias":
            placements = [Shard(-1)]
        else:
            placements = [Shard(-2)]
    return DTensor.from_local(parameter, device_mesh, placements, run_check=False)


def replace_state_dict_local_with_dtensor(
    state_dict: dict[str, torch.Tensor],
    tp_plan: dict[str, str],
    device_mesh,
) -> dict[str, torch.Tensor]:
    """
    Replaces all tensors that were sharded with `local_*` strategy with DTensor to make determining their proper
    size possible.
    """
    for key, value in state_dict.items():
        if isinstance(value, torch.Tensor) and not isinstance(value, DTensor):
            state_dict[key] = convert_local_tensor_to_dtensor(value, key, device_mesh, tp_plan)
    return state_dict


def add_tensor_parallel_hooks_to_module(
    model, module, tp_plan, layer_name, current_module_plan, device_mesh, parameter_name=None
):
    """
    This function is called in `PretrainedModel.post_init()`. It is responsible for adding hooks
    to the modules of the `model`, based on the `PretrainedModel._tp_plan`.

    This is the place where we add the `pre_forward` and `post_forward` hooks. These are defined
    for each `TensorParallelLayer` as `_prepare_input_fn` and `_prepare_output_fn`.
    """
    if current_module_plan is not None:
        tp_layer = ALL_PARALLEL_STYLES[current_module_plan]
        try:
            tp_layer.prepare_module_tp(module, device_mesh)
        except NotImplementedError as e:
            print(
                f"Trying to prepare {layer_name}, but it's not supported. Corresponding module: {module} "
                f"Fix its TP plan: {e}"
            )
        module._hf_tp_plan = current_module_plan
        original_repr = module.__repr__()
        module.__repr__ = lambda: f"{original_repr}\nTP Plan: {current_module_plan}"


def shard_and_distribute_module(
    model, param, empty_param, parameter_name, param_casting_dtype, is_contiguous, rank, device_mesh, set_param=True
):
    r"""
    This function is called in `from_pretrained` when loading a model's checkpoints.
    It receives the pointer to the parameter (or the parameter itself) and takes care of "sharding".
    All processes run this function, so they just load the partition of the tensor that they require.

    Main use cases:
    - column / rowwise parallelism, you just shard all the weights of the layer (weight and bias)
    - packed layers: you slice the weights, then shard like above
    - custom operation:
        - you want to add an all-gather at the end of a local layer.
        - you want to have a layer that is isolated from the rest of the world (because torch.DTensor does not work
          well with `.view` for instance)
    """
    param_name, param_type = parameter_name.rsplit(".", 1) if "." in parameter_name else (parameter_name, None)
    tp_plan = model._tp_plan or {}
    module_to_tp = model.get_submodule(param_name)
    rank = int(rank)

    current_shard_plan = _get_parameter_tp_plan(parameter_name, tp_plan)
    if rank == 0:
        if current_shard_plan is None:
            logger.info(f"Tensor sharding plan for {param_name} not found, using default 'replicate' plan.")
        else:
            logger.info(f"Tensor sharding plan for {param_name}: {current_shard_plan}")

    if current_shard_plan is not None:
        try:
            tp_layer = ALL_PARALLEL_STYLES[current_shard_plan]
            param = tp_layer.partition_tensor(
                param, empty_param, param_type, param_casting_dtype, is_contiguous, rank, device_mesh
            )
        except NotImplementedError as e:
            print(
                f"Trying to prepare {parameter_name}, but it's not supported. Corresponding module: {module_to_tp} "
                f"Fix its TP plan, current layer: {tp_layer} : {e}"
            )
    else:
        param = param[...].to(param_casting_dtype)

    # setattr is used on purpose: registering through nn.Module machinery here makes loading much slower.
    if not isinstance(param, torch.nn.Parameter):
        param = torch.nn.Parameter(param, requires_grad=param.is_floating_point())
    if set_param:
        setattr(module_to_tp, param_type, param)
    return param


def verify_tp_plan(expected_keys: list[str], tp_plan: dict[str, str] | None):
    """
    Verify the TP plan of the model, log a warning if there are layers that were not sharded or rules that were not
    applied.
    """
    if tp_plan is None:
        return

    generic_keys = {re.sub(r"\d+", "*", key) for key in expected_keys}
    unsharded_layers = set(generic_keys)
    unused_rules = tp_plan

    for key in generic_keys:
        generic_param_name = key.rsplit(".", 1)[0] if "." in key else key
        generic_param_name = re.sub(r"\d+", "*", generic_param_name)

        if generic_param_name in tp_plan:
            unused_rules.pop(generic_param_name)
            unsharded_layers.discard(key)
        elif "." in generic_param_name and (parent_param_name := generic_param_name.rsplit(".", 1)[0]) in tp_plan:
            unused_rules.pop(parent_param_name)
            unsharded_layers.discard(key)
        else:
            pass  # we could not find a rule for this parameter, so it is not sharded

    if len(unused_rules) > 0:
        logger.warning(f"The following TP rules were not applied on any of the layers: {unused_rules}")
    if len(unsharded_layers) > 0:
        logger.warning(f"The following layers were not sharded: {', '.join(unsharded_layers)}")


def distribute_model(model, distributed_config, device_mesh, tp_size):
    _plan = "_tp_plan"
    tp_plan = (getattr(model, "_tp_plan", None) or {}).copy()
    model._tp_plan = getattr(model.config, "base_model_tp_plan").copy()
    model._tp_plan.update(tp_plan)
    model._tp_size = tp_size
    model._device_mesh = device_mesh

    if distributed_config is not None:
        if isinstance(distributed_config, dict):
            distributed_config = DistributedConfig.from_dict(distributed_config)
        if distributed_config.enable_expert_parallel:
            _plan = "_ep_plan"
            model._tp_plan = getattr(model.config, "base_model_ep_plan", model._tp_plan).copy()

    # Collect the plans declared by the submodules (and their configs) and namespace them under the child's name.
    for name, module in model.named_children():
        if plan := getattr(module, _plan, None):
            model._tp_plan.update({f"{name}.{k}": v for k, v in plan.copy().items()})
        if hasattr(module, "config"):
            plan = getattr(module.config, f"base_model{_plan}", {}) or {}
            if plan == {}:
                plan = getattr(module.config, "base_model_tp_plan", {}) or {}
            model._tp_plan.update({f"{name}.{k}": v for k, v in plan.copy().items()})

    if model._tp_plan is not None and is_torch_greater_or_equal("2.5") and _torch_distributed_available:
        for v in model._tp_plan.values():
            if v not in ALL_PARALLEL_STYLES:
                raise ValueError(
                    f"Unsupported tensor parallel style {v}. Supported styles are {ALL_PARALLEL_STYLES}"
                )

    for name, module in model.named_modules():
        if not getattr(module, "_is_hooked", False):
            plan = _get_parameter_tp_plan(parameter_name=name, tp_plan=model._tp_plan, is_weight=False)
            add_tensor_parallel_hooks_to_module(
                model=model,
                module=module,
                tp_plan=model._tp_plan,
                layer_name=name,
                current_module_plan=plan,
                device_mesh=device_mesh,
            )
            module._is_hooked = True

    return model
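# Illustrative sketch (kept as a comment, not executed): how a loading path is expected to call
# `initialize_tensor_parallelism`. It assumes the process was launched by a tool such as `torchrun`,
# which sets the RANK / LOCAL_RANK / WORLD_SIZE environment variables that the helper reads.
#
#   tp_device, device_map, device_mesh, tp_size = initialize_tensor_parallelism("auto")
#   if device_mesh is not None:
#       print(f"rank {torch.distributed.get_rank()} uses {tp_device} on a TP mesh of size {tp_size}")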
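# Worked sketch for `get_packed_weights` (illustrative numbers only): with a packed last dimension of
# 16 (8 gate + 8 up values) and a mesh of size 4, rank 1 selects columns [2, 3] from the gate half and
# columns [10, 11] from the up half, i.e. `tensors_slices == [2, 3, 10, 11]`, so every rank ends up
# with an equal share of both projections.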
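# Worked sketch for `repack_weights` (illustrative): with world_size=2 and a packed dimension of size 8,
# a tensor gathered from shards arrives interleaved as [G0, G1, U0, U1, G2, G3, U2, U3] and is reordered
# back to the canonical packed layout [G0, G1, G2, G3, U0, U1, U2, U3].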
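# Sketch of the torch.chunk-style sharding used by `get_tensor_shard` (illustrative): for a parameter of
# shape (10, 4) sharded on dim 0 over a mesh of size 4, the shard size is math.ceil(10 / 4) == 3, so
# ranks 0-2 receive (3, 4) slices, rank 3 receives the remaining (1, 4) slice, and any rank whose offset
# falls past the end of the data would receive an empty (0, 4) tensor, matching `Shard` semantics.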
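# Sketch of the usual colwise/rowwise pairing (hypothetical plan entries): sharding an MLP with
#   {"mlp.up_proj": "colwise", "mlp.down_proj": "rowwise"}
# splits `up_proj` over its output features and `down_proj` over its input features, so the intermediate
# activation stays sharded on its last dimension and the rowwise output hook reduces the partial results
# back to a full tensor.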
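# Sketch of the index arithmetic described in `RouterParallel._prepare_output_fn` (illustrative): with
# 128 experts, EP=8 and therefore 16 local experts per rank, rank 3 keeps router index 52 because
# 52 // 16 == 3 and remaps it to local index 52 % 16 == 4; every index owned by another rank is masked
# to local expert 0.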
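# Sketch (hypothetical names): a plan rule such as {"model.layers.*.self_attn.q_proj": "colwise"} is
# considered "applied" when at least one checkpoint key generalizes to that pattern, e.g.
# "model.layers.0.self_attn.q_proj.weight"; rules that match nothing and layers that match no rule are
# reported through `logger.warning` by `verify_tp_plan`.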