
    rha                      d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlmZmZ d dlmZ d dlmZmZmZmZmZmZ d dlZd dlmZ d dlZd dlZd dlmc mZ d dl m!Z! d dl"m#Z#m$Z$ d d	l%m&Z& d d
l'm(Z( d dl)m*Z*m+Z+m,Z, d dl-m.Z. ddl/m0Z0m1Z1m2Z2m3Z3 ddl4m5Z5 ddl6m7Z7m8Z8m9Z9 ddl:m;Z; ddl<m=Z=m>Z>m?Z?m@Z@ ddlAmBZB ddlCmDZD ddlEmFZF ddlGmHZHmIZImJZJmKZK ddlLmMZMmNZN ddlOmPZPmQZQmRZRmSZS ddlmTZTmUZUmVZVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZa ddlbmcZdmeZemfZfmgZg ddlhmiZi ddljmkZk ddllmmZmmnZnmoZompZpmqZqmrZrmsZsmtZtmuZumvZvmwZwmxZxmyZymzZzm{Z{ dd l|m}Z}m~Z~mZmZmZmZ dd!lmZmZmZmZmZ dd"lmZ er&d d#lmZ d d$lmZ d d%lmZ dd&l8mZ dd'lmZ  ed(      Z ej(                  e      Zej.                  j1                  ed)      Zej.                  j1                  ed*      Zej.                  j1                  ed+      Z e;       Z: G d, d-      Z ed      dWd.       Z ed      dWd/       Z G d0 d1      Zej@                   G d2 d3             Zej@                   G d4 d5             Z	 	 	 	 	 	 	 	 dXd6Z G d7 d8ev      Z e       jJ                  ZdYd9ZdYd:ZdZd;ZdYd<Zd[d=Zd\d>Z G d? d@eq      Zd]dAZd^d_dBZ G dC dDeu      Zejc                  dE        G dF dGe      Z G dH dI      Zej@                   G dJ dK             Z G dL dM      Zej@                   G dN dO             Z G dP dQepeeeeeef   f   f         Z G dR dSee         Z G dT dUe      Zd`dVZy)a    )annotationsN)IterableSequence)	lru_cache)AnyCallablecastOptionalTYPE_CHECKINGUnion)
PRECEDENCE)get_interface_for_device)identitypreserve_rng_state)is_integer_dtype)
OrderedSet)CeilDivFloorDivModularIndexing)has_triton_package   )free_symbol_is_type
prefix_strsymbol_is_typeSymT)ValueRanges   )configirmetrics)AsyncCompile)	code_hashget_pathPyCodeCachewrite_atomic)DefaultHandler)triton_heuristics)benchmarker)AutotuneHintDevicePropertiesTRITON_MAX_BLOCKTRITON_MAX_RSPLIT)get_max_y_gridnext_power_of_2)BaseSchedulerNodeFusedSchedulerNode	SchedulerSchedulerNode)cache_on_selfDelayReplaceLineget_bounds_index_exprget_fused_kernel_nameget_kernel_metadatais_welford_reductionPlaceholderprefix_is_reduction	sympy_dotsympy_product
sympy_substriton_typetriton_version_uses_attrs_dictupcast_compute_type)_opsReductionType	StoreModeV)"get_kernel_category_by_source_code   )BlockPatternMatcher)ArgNameBackendFeatureConstexprArgCSECSEVariableDeferredLineIndentedBufferInplacedBufferOpOverridesPythonPrinter
RemovedArgSizeArg	TensorArgWorkspaceArgWorkspaceZeroMode)constant_reprIterationRangesIterationRangesEntryIterationRangesRoot
SIMDKernelSIMDScheduling)	config_ofequal_1_arg_indicesnon_constexpr_signatureshould_unwrap_unspec_argsignature_to_meta)SymbolicCallArg)
ModuleType)TypeVarDtypePropagationOpsHandler)IRNode)SIMDKernelFeatures_T
perf_hintsschedulefusionc                  @    e Zd ZU dZi Zded<   i Zded<   edd       Zy)	OpDtypeSupportz
    Some Triton ops such as libdevice and tl.math only support float32 and float64.
    This class records which dtypes are supported by specific IR ops.
    z"dict[str, OrderedSet[torch.dtype]]supported_dtypeszdict[str, bool]convert_outputsc                    |j                   }t        t        j                  t        j                  g      | j
                  |<   || j                  |<   y N)__name__r   torchfloat32float64ro   rp   )clsfuncconvert_outputop_names       q/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/torch/_inductor/codegen/triton.pyregister_upcastzOpDtypeSupport.register_upcast   s=    --(2EMM5==3Q(RW%'5G$    N)rx   zCallable[..., str]ry   boolreturnNone)	rs   
__module____qualname____doc__ro   __annotations__rp   classmethodr|    r}   r{   rn   rn   w   s1    
 <>8=')O_)6 6r}   rn   c                 d    t               syddl} t        | j                  j                  d      ryy)zd
    import AttrsDescriptor if the triton version is new enough to have this
    class defined.
     r   NAttrsDescriptorz4from triton.compiler.compiler import AttrsDescriptor)r   triton.compiler.compilerhasattrcompiler)tritons    r{   gen_attr_descriptor_importr      s-     # v''):;Er}   c                     t               } | j                  d       t               x}r| j                  |       | j                  d       | j	                         S )NzD
        import triton
        import triton.language as tl
        a  
        from torch._inductor.runtime import triton_helpers, triton_heuristics
        from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
        from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
        )rN   splicer   	writelinegetvalue)imports	attr_descs     r{   gen_common_triton_importsr      s[    GNN	 /00y0)$NN	 r}   c                     e Zd ZdZ eej                  ej                  g      Z eej                  ej                  ej                  ge      ZeD  ci c]%  }|t        j                  t        |    ddd      ' c}}}} ZeD  ci c]3  }|t        j                  t        |   j#                          ddd      5 c}}}} Zed
d       Zed
d       Zy	c c}}}} w c c}}}} w )TritonSymbolszU
    Stores sympy.Symbol instances and constants associated with triton codegen.
    offsetTintegernonnegativeBLOCKr   positivec                4    | j                   |j                     S rr   )block_sizessymtrw   trees     r{   get_block_sizezTritonSymbols.get_block_size   s    tyy))r}   c                4    | j                   |j                     S rr   )block_offsetsr   r   s     r{   get_block_offsetzTritonSymbols.get_block_offset   s      ++r}   N)r   rX   r   zsympy.Symbol)rs   r   r   r   r   r   R0_INDEXR1_INDEXreduction_typesXBLOCKYBLOCKZBLOCKblock_typessympySymbolr   r   upperr   r   r   r   ).0r   r   r   s   0000r{   r   r      s    !$--!?@Odkk4;;VoVWK    	ellj./v6RVWWM  	   	ell$%%'(.t
 	
K * * , ,#
s   *C
8C'
r   c                  z    e Zd ZU ded<   ded<   ded<   ded<   d	ed
<   ddZddZddZddZddZe	dd       Z
y)IndexingOptionsstr	index_strOrderedSet[str]	mask_varsOptional[str]
expand_strr~   _has_rindex
sympy.Exprindexc                ,    t        | j                        S rr   )r~   r   selfs    r{   has_maskzIndexingOptions.has_mask   s    DNN##r}   c                J    t        | j                  t        j                        S rr   )r   r   r   TMPr   s    r{   has_indirectzIndexingOptions.has_indirect   s    "4::txx88r}   c                    | j                   S rr   )r   r   s    r{   
has_rindexzIndexingOptions.has_rindex   s    r}   c                :    t        d | j                  D              S )Nc              3  P   K   | ]  }t        |      j                  d          yw)tmpNr   
startswithr   masks     r{   	<genexpr>z.IndexingOptions.has_tmpmask.<locals>.<genexpr>   s     J43t9''.J   $&anyr   r   s    r{   has_tmpmaskzIndexingOptions.has_tmpmask   s    J4>>JJJr}   c                :    t        d | j                  D              S )Nc              3  P   K   | ]  }t        |      j                  d          yw)rNr   r   s     r{   r   z,IndexingOptions.has_rmask.<locals>.<genexpr>   s     H3t9'',Hr   r   r   s    r{   	has_rmaskzIndexingOptions.has_rmask   s    HHHHr}   c                    | j                   r2dj                  t        t        t        | j                                     S dS )N & r   )r   joinsortedmapr   r   s    r{   mask_strzIndexingOptions.mask_str   s4     =ANNEJJvc#t~~678	
PV	
r}   Nr   r~   r   r   )rs   r   r   r   r   r   r   r   r   propertyr   r   r}   r{   r   r      sN    N$9 KI 
 
r}   r   c                  ^   e Zd ZU ded<   ded<   ded<   ded<   d	ed
<   ded<   d	ed<   dZded<   ed!d       Zed!d       Zed!d       Zed!d       Z		 	 	 	 	 	 	 	 	 	 d"dZ
e	 	 	 	 	 	 	 	 	 	 	 	 d#d       Z	 	 	 	 	 	 	 	 d$dZd%d&dZ	 	 	 	 	 	 d'dZd(dZd)dZd*dZd*dZd*dZd*dZd*d Zy)+BlockPtrOptionsBlockParametersparamsr   constant_offset	list[int]orderr   r   Sequence[sympy.Expr]broadcast_shapez
list[bool]broadcasting_dimsfinal_shapeNzOptional[list[int]]_boundary_checkc                .    | j                   j                  S rr   )r   shaper   s    r{   r   zBlockPtrOptions.shape   s    {{   r}   c                .    | j                   j                  S rr   )r   block_shaper   s    r{   r   zBlockPtrOptions.block_shape   s    {{&&&r}   c                .    | j                   j                  S rr   )r   stridesr   s    r{   r   zBlockPtrOptions.strides      {{"""r}   c                .    | j                   j                  S rr   )r   offsetsr   s    r{   r   zBlockPtrOptions.offsets  r   r}   c                0  	 t        | j                  | j                        D cg c]#  \  }}|rt        j                  j
                  n|% }}}t        |||      }t        j                  j                  	|xr7 t        |      t        |      k(  xr t        	fdt        ||      D              }t        | j                        r2|s0d| dt        j                  j                  | j                         d}t        || j                  |      }|S c c}}w )z
        Generate a broadcast and a reshape for the block pointer.
        This restores stride-0 dimensions which were removed from the block pointer.
        c              3  p   K   | ]-  \  }}j                  |d       xs j                  ||       / ywrF   N)statically_known_equals)r   pre_dimpost_dimsizevarss      r{   r   z@BlockPtrOptions.codegen_broadcast_and_reshape.<locals>.<genexpr>%  sH       &GX 00!< G33GXFGs   36tl.broadcast_to(, ))zipr   r   r   SOnetriton_reshaperD   graphr   lenallr   kernelindex_to_str)
r   valueinitial_shaper   allow_implicitdimis_broadcastingpre_broadcast_shapesupports_implicit_broadcastr   s
            @r{   codegen_broadcast_and_reshapez-BlockPtrOptions.codegen_broadcast_and_reshape	  s    ),$$d&<&<)
$_ +EGGKK3
 
 um5HI 77##&4 '
#$K(88   *--@+)N  	$ t%%&/J&ugR0E0EdFZFZ0[/\\]^E ud&:&:KH9
s   (Dc                    t         j                  j                  d	fd} || j                        | _         || j                        | _        | j                  D cg c]  }j                  |d       }}| j                  D cg c]  }j                  |d       }	}t        |	      rd|	d<   t        | j                  |	      D 
cg c]	  \  }}
|
s| }}}
t        |	|      D cg c]  }t        |       c}fd}t        d
i t        j                  |       j                         D ci c]  \  }}| ||       c}}} |D cg c]  }t        j                  |       }}t         j                   j"                  r%|d   j$                  dk(  sJ |j'                  d       t         j                   j(                  }t         j                   j*                  st-        | j                        t-        t         j                   j.                        |z
  k(  rIt         j                   j0                  j3                         r!|t4        j6                  j8                  g|z  z  }t;        | t         j                  j                  j=                  |      t?        tA        tC        t-        | j                                          ||||      }|jE                  ||       |S c c}w c c}w c c}
}w c c}w c c}}w c c}w )z,Helper to create a  BlockPtrOptions instancec                L    | D cg c]  }j                  |       c}S c c}w rr   )lookup_precomputed_size)exprsexprr   s     r{   lookup_sizez+BlockPtrOptions.create.<locals>.lookup_sizeA  s"    GLMtH44T:MMMs   !r   rF   Fc                R    t        |       D cg c]	  \  }}|s| c}}S c c}}w )z@Removes any broadcasting or singleton dims from a given sequence)r   )ititemis_removableremovable_dimss      r{   remove_dimsz+BlockPtrOptions.create.<locals>.remove_dimsc  s3     +.b.*A&D,#   s   #x)r   r   r   r   r   r   r   )r  zIterable[sympy.Expr]r   list[sympy.Expr]r   )#rD   r  r   r   r   r   r   r  r   r   r   dataclassesasdictitemsr   r   r  no_x_dimprefixpopnum_reduction_dimsinside_reductionr  numelsfeaturesis_reductionr   r   r  r   r  listreversedrangecompute_boundary_check)r   r   range_treesr   get_max_blockr  strider   r  singleton_dimsis_singletonr   dimsr  keyvalr   r   reduction_ndimresultr  r   s                       @@r{   createzBlockPtrOptions.create4  s    77##	N #6<<0$V^^4
 GMnn
<BH,,VQ7
 
 AG@R@R
9<H,,S!4
 
 ~!&N2 &)););^%L
!\ 
 
 14NDU0VW#d)W	 ! 
5@5G5G5O5U5U5WXcsK$$X

 GRRd}33D9RR88q>((C///OOA44))FNN#s188??';n'LL!!..0 EGGKK=>99K GG,,DD_Uxc&,,&7 89:#+/
 	%%m[A


 X Y Ss$   K K%K*+K02K5K;c                D    t         j                  |   }t        |||i      S )zN
        Replaces instances of {symt}_offset with the new expression.
        )r   r   r=   )r   r  replacementr   roffsets        r{   replace_offsetzBlockPtrOptions.replace_offset  s&      --d3$+ 677r}   c           	         d fd}t         j                  j                  }g  j                  }|s|D cg c]
  } ||       }} j                  dk7  r| d | j                         dn|d | j
                         d | j                         d | j                         d | j                         d	 ||       g}d
dj                  |       dS c c}w )a  
        Codegen a call to tl.make_block_ptr()

        Args:
            name: variable name for pointer
            roffset: should rn_offset be included in offsets=..., for use with tl.advance()

        Returns:
            "tl.make_block_ptr(...)"
        c                ~    t         j                  D ](  }j                  | t        j                  d      |      } * | S Nr   )r   r   r<  r   Integer)r  r   r   s     r{   remove_roffsetsz/BlockPtrOptions.format.<locals>.remove_roffsets  s<    %55 I**4q1A4HIKr}   r    + (r   zshape=zstrides=zblock_shape=zorder=zoffsets=ztl.make_block_ptr(r   )r  r   r   r   )
rD   r  r  r   r   r   r   r   r   r   )r   namer;  rA  fr   r   argss   `       r{   formatzBlockPtrOptions.format  s    	
 HH!!!DLL/=DE6v.EGE ''1, &Qt3345Q7Qtzz]O$q'(1T--./0Qtzz]O$qzl#
 $DIIdO#4A66 Fs   C c           
        t         j                  j                  }|D ci c]7  }t        j                  |j
                      |t        |j
                           9 }}t        t        t         j                  j                  |            }t        t        | j                              D cg c]%  }|j                  | j                  |   t         j"                  j$                        s|r:t        j                  t&        j(                     | j*                  |   j,                  v sb|j/                  | j                  |   | j*                  |         s|j/                  | j                  |   t1        | j*                  |   |            sMt         j                  j2                  r1| j*                  |   t        j                  t&        j4                     k(  s|( c}| _        yc c}w c c}w )z6List of indices to pass to tl.load(boundary_check=...)N)rD   r  r   r   r   r   r   r   r   r  needs_yz_grid_overflowr,  r  r   r   r   r   r   Zeror   r   r   free_symbolsstatically_known_multiple_ofr=   r"  r   r   )r   r/  r.  r   tblock_to_maxneeds_overflow_grididxs           r{   r-  z&BlockPtrOptions.compute_boundary_check  s    77## !/
 %%aff-}Z=O/PP/
 /
 "#ahh&E&E{"ST S_- 
44T\\#5FU ,)55dkkB++C0==> %AA JJsOT-=-=c-B !) E E JJsO&t'7'7'<lK! HH%%((-1J1J4;;1WW-  
/
 
s   <G*D+G#c                6    | j                   J | j                   S rr   )r   r   s    r{   boundary_checkzBlockPtrOptions.boundary_check  s     ##///###r}   c           	         t         j                  |   }| j                  D cg c]A  }| j                  |||      | j                  |t        j
                  j                  |      z
  C }}|S c c}w )av  
        Codegen string to pass to tl.advance(name, ...).

        Advance is the difference between offsets in each loop iteration.
        To compute it, we replace rN_offset with multiples of RN_BLOCK.
        Since we expect rN_offset to vary in range(0, rN_numel, RN_BLOCK), the first
        iteration has rN_offset=0, while the second has rN_offset=RN_BLOCK.
        )r   r   r   r<  r   r   rI  )r   r   rblockr   advances        r{   advance_roffsetzBlockPtrOptions.advance_roffset  st     **40 ,,

  ##FFD9%%feggllDAB
 
 
s   AA,c                     yNFr   r   s    r{   r   zBlockPtrOptions.has_indirect      r}   c                :    t        d | j                  D              S )Nc              3  P   K   | ]  }t        |t        j                           y wrr   )r   r   r   )r   r  s     r{   r   z-BlockPtrOptions.has_rindex.<locals>.<genexpr>  s%      
  m&C&CD
r   )r   r   r   s    r{   r   zBlockPtrOptions.has_rindex  s"     
((
 
 	
r}   c                "    | j                         S rr   )r   r   s    r{   r   zBlockPtrOptions.has_rmask  s      r}   c                     yrW  r   r   s    r{   r   zBlockPtrOptions.has_tmpmask  rX  r}   c                4    t        | j                               S rr   )r~   rQ  r   s    r{   r   zBlockPtrOptions.has_mask  s    D'')**r}   r   r  )
r  r   r	  r   r   r   r
  r~   r   r   )r   r   r   r   r.  list[IterationRangesRoot]r   r   r/  Callable[[str], int]r   r   )r  r   r:  r   r   r   r   r   T)rC  r   r   r   )r/  r`  r.  r_  r   r   )r   r   )r   r   r   r   r   )rs   r   r   r   r   r   r   r   r   r   r  staticmethodr8  r<  rF  r-  rQ  rU  r   r   r   r   r   r   r}   r{   r   r      s   ))!!%%+/O(/! ! ' ' # # # #)) ,) *	)
 ) 
)V TT $T /	T
 #T ,T 
T Tl88-78?C8	8!7F2
+2
 /2
 
	2
h$&
!+r}   r   c                r   t        |t              rt        |t              sJ |D cg c]!  }t        j                  j	                  |      # }}|D cg c]!  }t        j                  j	                  |      # }}||k(  r| S |D cg c]
  }|dk7  s	| c}|k7  rd|  ddj                  |       dS d}g }|D ]G  }	|t        |      k  r|	||   k(  r|j                  d       |dz  }0|	dk(  sJ |j                  d	       I |t        |      k(  sJ |  d
dj                  |       dS c c}w c c}w c c}w )z<Workaround https://github.com/triton-lang/triton/issues/28361ztl.reshape(z, [r   z])r   :rF   r   [])
isinstancer*  rD   r  r  r   r  append)
r  	old_shape	new_shaper   old_shape_strnew_shape_strsrO  expandsizes
             r{   r  r    sE    i&:i+FFF?HIeQXX**51IMI?HIeQXX**51IMI% -aAH->UG3tyy'?&@CC
CF "]##c0B(BMM#1HC3;;MM&!" #m$$$$WAdii'(**% JI .s   &D*&D/
D4D4c                      e Zd Zd dZd dZd dZd dZd dZd dZd dZ	d dZ
d d	Zd d
Zd dZd dZd dZd dZd!dZd dZd dZd dZd dZd dZd dZd dZd dZd dZd dZd dZd dZd dZd dZd dZ y)"TritonPrinterc                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS )NrF   libdevice.trunc(r   ).to(r   r  rE  _printrD   r  index_dtyper   r  s     r{   _print_TruncToIntzTritonPrinter._print_TruncToInt5  M    499~"""t{{499Q<89qxx?S?S>TTUV	
r}   c                x    t        j                         rt        j                  j                  r| }|S d| d}|S )Nztl.full([], z, tl.float64))r   	is_fbcodert   versionhip)r   r  rets      r{   _print_FloatzTritonPrinter._print_Float;  s=    %--"3"3FC 
 !m4C
r}   c                    t        |j                        dk(  sJ | j                  |j                  d   t        d   dz
        }| dS )NrF   r   Atom      ?z.to(tl.float64))r  rE  parenthesizer   )r   r  rn  s      r{   _print_ToFloatzTritonPrinter._print_ToFloatB  sI    499~"""diilJv,>,DEO$$r}   c                    |j                   \  }}|j                  r3|j                  r'| j                  |j                   dt        d   dz
        S | j	                  |      }| j	                  |      }d| d| dS )N % r  r  z!triton_helpers.remainder_integer(r   r   )rE  is_nonnegative	stringifyr   rw  r   r  quotdivquot_sdiv_ss         r{   _print_PythonModzTritonPrinter._print_PythonModG  sr    II	c3#5#5>>$))UJv4F4LMMT"C 26("UG1EEr}   c                   |j                   sJ |j                  \  }}|j                  r3|j                  r'| j                  |j                  dt        d   dz
        S | j                  |      }| j                  |      }d| d| dS )N // r  r  z!triton_helpers.div_floor_integer(z,  r   )
is_integerrE  r  r  r   rw  r  s         r{   _print_FloorDivzTritonPrinter._print_FloorDivO  s~    II	c3#5#5>>$))VZ5G#5MNNT"C 26(#eWAFFr}   c                P    | j                  |j                  dt        d   dz
        S )N / r  r  )r  rE  r   ry  s     r{   _print_IntTrueDivzTritonPrinter._print_IntTrueDivZ  s#    ~~dii
60BS0HIIr}   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS NrF   libdevice.floor(r   ru  r   rv  ry  s     r{   _print_floorzTritonPrinter._print_floor_  r{  r}   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS r  rv  ry  s     r{   _print_FloorToIntzTritonPrinter._print_FloorToInte  r{  r}   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS NrF   libdevice.ceil(r   ru  r   rv  ry  s     r{   _print_ceilingzTritonPrinter._print_ceilingk  K    499~""" TYYq\!: ;5AUAU@VVWXXr}   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS r  rv  ry  s     r{   _print_CeilToIntzTritonPrinter._print_CeilToInto  r  r}   c                ,    d| j                  |       dS )Nzlibdevice.sqrt(().to(tl.float32)))rw  ry  s     r{   _helper_sqrtzTritonPrinter._helper_sqrts  s    !$++d"3!44EFFr}   c                    d| j                  |j                  d          d| j                  |j                  d          dS )Nlibdevice.pow(r   r   rF   r   )rw  rE  ry  s     r{   _print_FloatPowzTritonPrinter._print_FloatPowv  s?    T[[167r$++diiPQl:S9TTUV	
r}   c                ,   |j                   d   j                  r;dt        |j                   d          d| j                  |j                   d          dS d| j                  |j                   d          d| j                  |j                   d          dS )Nr   r  r   rF   r   )rE  
is_Integerfloatrw  ry  s     r{   _print_PowByNaturalz!TritonPrinter._print_PowByNatural{  s    99Q<""#E$))A,$7#84;;tyyQR|;T:UUVWWT[[167r$++diiPQl:S9TTUV	
r}   c                    | j                  |j                  d         }| j                  |j                  d         }| j                  |j                  d         }d| d| d| dS )Nr   rF   r   	tl.where(r   r   )doprintrE  )r   r  cpqs        r{   _print_WherezTritonPrinter._print_Where  s_    LL1&LL1&LL1&1#Rs"QCq))r}   c                   t        |j                        dk(  r| j                  |j                  d         S t        |j                        dz  }t        |      }| j                   ||j                  d|        }| j                   ||j                  |d        }t	        d ||fD              \  }}|dv sJ d| d       d	| d
| d| d| d| d
| d| d| dS )zI
        Helper for max/min code generation.
        cmp: > or <
        rF   r   r   Nc              3  (   K   | ]
  }d | d  yw)(r   Nr   r   r  s     r{   r   z6TritonPrinter._print_min_max_helper.<locals>.<genexpr>  s     .!q1X.s   )><zUnexpected comparator: ''r  z * ( z= z) + )))r  rE  rw  typetuple)r   r  cmpmidrw   abs          r{   _print_min_max_helperz#TritonPrinter._print_min_max_helper  s    
 tyy>Q;;tyy|,,$))n!4jKKTYYt_-.KKTYYst_-. .1v..1j C$<SE"CC 1#T!AcU"QCtA3d1#Qse1QCrBBr}   c                &    | j                  |d      S )Nr  r  ry  s     r{   
_print_MinzTritonPrinter._print_Min      ))$44r}   c                &    | j                  |d      S )Nr  r  ry  s     r{   
_print_MaxzTritonPrinter._print_Max  r  r}   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrF   tl_math.abs(r   r   r  rE  rw  ry  s     r{   
_print_AbszTritonPrinter._print_Abs  s9    499~"""dkk$))A,78::r}   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrF   zlibdevice.cos((r   r  r  ry  s     r{   _print_OpaqueUnaryFn_cosz&TritonPrinter._print_OpaqueUnaryFn_cos  :    499~""" TYYq\!: ;;LMMr}   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrF   zlibdevice.cosh((r   r  r  ry  s     r{   _print_OpaqueUnaryFn_coshz'TritonPrinter._print_OpaqueUnaryFn_cosh  :    499~"""!$++diil";!<<MNNr}   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrF   zlibdevice.acos((r   r  r  ry  s     r{   _print_OpaqueUnaryFn_acosz'TritonPrinter._print_OpaqueUnaryFn_acos  r  r}   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrF   zlibdevice.sin((r   r  r  ry  s     r{   _print_OpaqueUnaryFn_sinz&TritonPrinter._print_OpaqueUnaryFn_sin  r  r}   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrF   zlibdevice.sinh((r   r  r  ry  s     r{   _print_OpaqueUnaryFn_sinhz'TritonPrinter._print_OpaqueUnaryFn_sinh  r  r}   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrF   zlibdevice.asin((r   r  r  ry  s     r{   _print_OpaqueUnaryFn_asinz'TritonPrinter._print_OpaqueUnaryFn_asin  r  r}   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrF   zlibdevice.tan((r   r  r  ry  s     r{   _print_OpaqueUnaryFn_tanz&TritonPrinter._print_OpaqueUnaryFn_tan  r  r}   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrF   zlibdevice.tanh((r   r  r  ry  s     r{   _print_OpaqueUnaryFn_tanhz'TritonPrinter._print_OpaqueUnaryFn_tanh  r  r}   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrF   zlibdevice.atan((r   r  r  ry  s     r{   _print_OpaqueUnaryFn_atanz'TritonPrinter._print_OpaqueUnaryFn_atan  r  r}   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrF   zlibdevice.log2((r   r  r  ry  s     r{   _print_OpaqueUnaryFn_log2z'TritonPrinter._print_OpaqueUnaryFn_log2  r  r}   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS )NrF   zlibdevice.llrint(r   ru  r   rv  ry  s     r{   _print_RoundToIntzTritonPrinter._print_RoundToInt  sM    499~"""DIIaL 9:%@T@T?UUVW	
r}   c                    t        |j                        dk(  sJ |j                  \  }}|j                  r|dk  sJ t        d| d      | j	                  |t
        d         }d| d| d|  S )	Nr   r   zOFor integer inputs, only non-negative ndigits are currently supported, but got .Mulzlibdevice.nearbyint(1e * z) * 1e)r  rE  r  
ValueErrorr  r   )r   r  numberndigits
number_strs        r{   _print_RoundDecimalz!TritonPrinter._print_RoundDecimal  s    499~"""))Q;;abiajjkl  &&vz%/@A
'yJ<vwhZPPr}   N)r  r   r   r   )r  r   r  r   r   r   )!rs   r   r   rz  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r}   r{   rr  rr  4  s    
%
FGJ


YYG


*C&55;NOONOONOOO
Qr}   rr  c                *    t        t        |             S )zCConvert torch.dtype to triton type and upcast [b]float16 to float32)r>   r@   dtypes    r{   triton_compute_typer    s    *5122r}   c                ^    | t         j                  k(  rt         j                  } t        |       S )z@Convert torch.dtype to triton type, with fix for storing tl.bool)rt   r~   int8r>   r  s    r{   triton_store_typer    s"    



ur}   c                    t        |       r+| j                  r| j                  dk  rt        j                  S t        |       S )z0Implicit upcasts used for Triton reduction types   )r   	is_signeditemsizert   int32r@   r  s    r{   upcast_acc_dtyper    s0    5??u~~7J{{u%%r}   c                *    t        t        |             S )z:Convert torch.dtype to triton type, with reduction upcasts)r  r  r  s    r{   triton_acc_typer    s    /677r}   c                <    | j                   dk  xr | j                  S )Nr   )r  is_floating_pointr  s    r{   low_precision_fpr    s    >>Q:5#:#::r}   c                    t        | t              sy| j                  }t        |t        j                        rt	        |      S dS rW  )rh  rL   r  rt   r  )varr  s     r{   low_precision_fp_varr     s6    c;'IIE&0&DE"O%Or}   c                  &     e Zd Zd fdZd Z xZS )TritonCSEVariablec                X    t         |   |||       t               | _        |J d       y )Nz!TritonCSEVariable must have dtype)super__init__r   r   )r   rC  boundsr  	__class__s       r{   r
  zTritonCSEVariable.__init__	  s/    vu-*4, E"EE r}   c                F   |D ]  }t        |t              r&| j                  j                  |j                         9t        |t        j
                        sTt        j                  D ]6  }t        ||      s| j                  j                  t        |    dg          y )Nr   )
rh  r  r   updater   r   r   r   r   r   )r   rC  rE  kwargsargr   s         r{   update_on_argsz TritonCSEVariable.update_on_args  s     
	C#01%%cmm4C. *55 D%c40--*T2B1C4/H.IJ
	r}   )r  zValueRanges[Any]r  torch.dtyper   r   )rs   r   r   r
  r  __classcell__r  s   @r{   r  r    s    Fr}   r  c                     ddl m}   |        S )Nr   re   )!torch._inductor.dtype_propagationrf   re   s    r{   get_dtype_handlerr    s    L%''r}   c                0     dddfdd fd}|S )z
    Codegen helper to upcast arguments to float32, depending on the config and dtype.
    This decorates tl.math/libdevice codegen functions.
    c                    t         j                  j                   xr> t        | t              xr, | j
                  t        j                  t        j                  fv S rr   )	r   r   codegen_upcast_to_fp32rh  rL   r  rt   float16bfloat16)r  s    r{   needs_upcastz*maybe_upcast_float32.<locals>.needs_upcast)  sD    444 =3,=		emmU^^<<	
r}   c                (     |       rdnd}|  | S )N.to(tl.float32)r   r   )r  upcast_stringr  s     r{   maybe_upcast_argz.maybe_upcast_float32.<locals>.maybe_upcast_arg0  s!    -9#->)B}o&&r}   c                H     t         j                          d fd}|S )Nc                    | D cg c]
  } |       }}|j                         D ci c]  \  }}| |       }}} |i |}xr6 t        fdt        j                  | |j	                               D              }|sd n# t        t               j                        | i |}	|	t        j                  d fv}
|
r|	dt        |	       dnd}| | S c c}w c c}}w )Nc              3  .   K   | ]  } |        y wrr   r   )r   r  r  s     r{   r   zKmaybe_upcast_float32.<locals>.decorator.<locals>.wrapped.<locals>.<genexpr>?  s      6&)S!6s   .to(r   r   )r!  r   	itertoolschainvaluesgetattrr  rs   rt   ru   r>   )rE  r  r  upcast_argsr4  r5  upcast_kwargsr7  any_needs_upcastresult_dtypeneeds_downcastdowncast_stringry   rx   r!  r  s               r{   wrappedz8maybe_upcast_float32.<locals>.decorator.<locals>.wrapped8  s   <@AS+C0AKAHNWHCS"23"77WMW ;8-8F-  # 6-6__T6==?-S6 3
 ( @W.0$--@$Q&Q 
 *%--1FFN "l&> {<013 
 Xo.//' BWs
   CCr   )rn   r|   )rx   r0  ry   r!  r  s   ` r{   	decoratorz'maybe_upcast_float32.<locals>.decorator4  s$    &&t^<	0 	0. r}   r   r   )rx   Callable[..., Any]r   r2  r   )ry   r1  r!  r  s   ` @@r{   maybe_upcast_float32r3  #  s    
': r}   c                     e Zd ZdZ ej
                  ej                        Ze	 	 dN	 	 	 dOd       Z	edPd       Z
ed        Zed        Ze e       d               Zed	        Zed
        Ze e       d               Ze e       d               Ze e       d               Ze e       d               Zed        Zed        Zed        Zed        Zedej6                  dddd       Ze e       d               Ze e       d               Zed        Zed        Z e e       d               Z!e e       d               Z"e e       d               Z#e e       d               Z$e e       d               Z%e e       d               Z&e e       d                Z'e e       d!               Z(e e       d"               Z)e e       d#               Z*e e       d$               Z+e e       d%               Z,e e       d&               Z-e e       d'               Z.e e       d(               Z/e e       d)               Z0e e       d*               Ze e       d+               Z1ed,        Z2ed-        Z3ed.        Z4ed/        Z5ed0        Z6ed1        Z7ed2        Z8ed3        Z9ed4        Z:ed5        Z;ed6        Z<ed7        Z=ed8        Z>ed9        Z?e e       d:               Z@e e       d;               ZAe e       d<               ZBe e       d=               ZCe e       d>               ZDed?        ZEe e       d@               ZFe e       dA               ZGe e       dB               ZHe edCD      dE               ZIe edCD      dF               ZJe e       dG               ZKe e       dH               ZLedI        ZMedJ        ZNe e       dK               ZOedL        ZPe e       dM               ZQy)QTritonOverrideszMap element-wise ops to TritonNTc                :   	 	 	 	 	 	 dd}|>t         |||      t        j                  j                        t        j                  _        |t        j
                  k(  rd|  dS |t        j                  k(  r|  dS |rt        |      }nt        |      }|  d| dS )Nc                   | |k(  ryt         j                  t         j                  f}| |v r||v r| |k7  rJ d       | t         j                  k(  s|t         j                  k(  ry| t         j                  k(  s|t         j                  k(  ryy)Nr   zCConversions between float8_e5m2 and float8_e4m3fn is not supported!r  r   )rt   float8_e4m3fnfloat8_e5m2)	src_dtype	dst_dtype
fp8_dtypess      r{   _get_min_elements_per_threadz>TritonOverrides.to_dtype.<locals>._get_min_elements_per_thread`  s     I% ##!!J Z'+*U U	U 
 E---e>O>O1OE///9@S@S3Sr}   r  z != 0)z.to(tl.int8).to(tl.uint8)r%  r   )r:  r  r;  r  r   int)	maxrD   r  min_elem_per_threadrt   r~   uint8r  r  )r  r  r:  use_compute_typesr=  	out_dtypes         r{   to_dtypezTritonOverrides.to_dtypeY  s    	"	/:		6   ,/,Y>,,,AHH(
 EJJqc= ekk! S122+E2I)%0ID1%%r}   c                    |j                   |j                   k(  sJ | j                  |k7  r|  dt        |       d} |  dt        |       d}t        |      |k7  r| dt        t        |             d}|S )Nr%  r   z, bitcast=True))r  r  r>   r@   )r  r  r:  outs       r{   to_dtype_bitcastz TritonOverrides.to_dtype_bitcast  s    !!U^^333 77i#T+i013A4E*+?;u%.Ek*=e*DEFaHC
r}   c           	         t         j                  j                  |      }t         ||             }t	        |      }|dk(  r|S | dk  r#|j
                  sd|dd   }d| d| d| d| d	S d| d| d| dS )	Nz
tl.float32r   ztl.r  tl.full(r   ru  r   )rt   _prims_commondtype_to_typerW   r  r  )r  r  r   type_
triton_valr>   triton_signed_types          r{   _shaped_constantz TritonOverrides._shaped_constant  s    ##11%8"5<0
)%0,& 19U__#&{12&7!8eWBzl"5G4Hk]Z[\\eWBzl"[MCCr}   c                *    | j                  ||g       S )Nr   )rO  )rw   r  r  s      r{   constantzTritonOverrides.constant  s    ##E5#;;r}   c                    d|  dS )Nr  r   r   r  s    r{   abszTritonOverrides.abs       aS""r}   c                    d|  d| d}t        |       st        |      rMt               j                  | |      }|t        j                  t        j
                  fv r| dt        |       d}|S )Nr  r  r   r%  )r  r  truedivrt   r  ru   r>   r  yrF  rC  s       r{   rX  zTritonOverrides.truediv  sl    !Cs!n"&:1&=)+33Aq9IU]]EMM::T+i"8!9;
r}   c                    d|  d| d}t        |       st        |      rMt               j                  | |      }|t        j                  t        j
                  fv r| dt        |       d}|S )Nr  r  r   r%  )r  r  modrt   r  ru   r>   rY  s       r{   r\  zTritonOverrides.mod  sl    !Cs!n"&:1&=)+//15IU]]EMM::T+i"8!9;
r}   c                \    t         j                  rd|  dt        j                   dS d|  dS )z
        When use_fast_math, use the ftz (flushing to zero) variant
        of exponent computation.

        Check https://github.com/triton-lang/triton/issues/5735 for
        more details.
        libdevice.exp2(r  r   ztl_math.exp()r   use_fast_mathr5  _LOG_2_ErT  s    r{   expzTritonOverrides.exp  s8     $QCs?+C+C*DAFF!!A&&r}   c                    d|  dS )Nr^  r   r   rT  s    r{   exp2zTritonOverrides.exp2       !1%%r}   c                    d|  dS )Nzlibdevice.expm1(r   r   rT  s    r{   expm1zTritonOverrides.expm1       "!A&&r}   c                    d|  dS )Nzlibdevice.sqrt(r   r   rT  s    r{   sqrtzTritonOverrides.sqrt  rd  r}   c                   t         j                  j                  }|dk(  ry|dk(  r	d|  d|  dS |dk(  r|  dS |8t        j                  t        j
                  d	t        j                        |       S t        d
|      )Ncompile_errorzcompile error!runtime_errorz"triton_helpers.device_assert_then(z == 0, "injected assert fail", r   accuracyz + 1r   z:unrecognized config triton.inject_relu_bug_TESTING_ONLY = )	r   r   inject_relu_bug_TESTING_ONLYopsmaximumrR  rt   r  AssertionError)r  bugs     r{   reluzTritonOverrides.relu  s    mm88/!#O# 8s:YZ[Y\\]^^JS:[;;s||Au{{;Q?? LSGT r}   c                    d|  d| dS )Nztriton_helpers.minimum(r   r   r   r  r  s     r{   minimumzTritonOverrides.minimum      (2aS22r}   c                    d|  d| dS )Nztriton_helpers.maximum(r   r   r   ru  s     r{   rp  zTritonOverrides.maximum  rw  r}   c                    d|  d| d| dS )Nr  r   r   r   )r  r  r  s      r{   wherezTritonOverrides.where  s    1#Rs"QCq))r}   rF   )constraintsr  is_purepackc                    t        |      }dj                  |D cg c]  }t        |       c}      }|#dj                  dg|D 	cg c]  }	d c}	z         }d|  d| d| d| d| d	| d
S c c}w c c}	w )Nr   z=rr   ztl.inline_asm_elementwise('z', 'z', [z	], dtype=z
, is_pure=z, pack=r   )r  r   r   )
asmr{  r  r|  r}  inputsr>   i
input_refs_s
             r{   inline_asm_elementwisez&TritonOverrides.inline_asm_elementwise  s     *%0YY71A78
))TF6-Bac-B$BCK,SEk]$zlR[\g[hhrszr{  |C  DH  CI  IJ  K  	K  8-Bs   A.	A3
c                    d|  dS )Nztl_math.cos(r   r   rT  s    r{   coszTritonOverrides.cos  rV  r}   c                    d|  dS )Nztl_math.sin(r   r   rT  s    r{   sinzTritonOverrides.sin  rV  r}   c                    t        d      )Nz/ops.index_expr not implemented outside a kernelNotImplementedError)rw   r  r  s      r{   
index_exprzTritonOverrides.index_expr$  s    !"STTr}   c                    t        d      )Nz+ops.masked not implemented outside a kernelr  )r   bodyothers      r{   maskedzTritonOverrides.masked(  s    !"OPPr}   c                    d|  dS )Nzlibdevice.lgamma(r   r   rT  s    r{   lgammazTritonOverrides.lgamma,       #1#Q''r}   c                    d|  dS )Nzlibdevice.erf(r   r   rT  s    r{   erfzTritonOverrides.erf1        s!$$r}   c                    d|  dS )Nzlibdevice.cosh(r   r   rT  s    r{   coshzTritonOverrides.cosh6  rd  r}   c                    d|  dS )Nzlibdevice.sinh(r   r   rT  s    r{   sinhzTritonOverrides.sinh;  rd  r}   c                    d|  dS )Nzlibdevice.acos(r   r   rT  s    r{   acoszTritonOverrides.acos@  rd  r}   c                    d|  dS )Nzlibdevice.acosh(r   r   rT  s    r{   acoshzTritonOverrides.acoshE  rg  r}   c                    d|  dS )Nzlibdevice.asin(r   r   rT  s    r{   asinzTritonOverrides.asinJ  rd  r}   c                    d|  dS )Nzlibdevice.asinh(r   r   rT  s    r{   asinhzTritonOverrides.asinhO  rg  r}   c                    d|  d| dS )Nzlibdevice.atan2(r   r   r   r  rZ  s     r{   atan2zTritonOverrides.atan2T       "!Bqc++r}   c                    d|  dS )Nzlibdevice.atan(r   r   rT  s    r{   atanzTritonOverrides.atanY  rd  r}   c                    d|  dS )Nzlibdevice.atanh(r   r   rT  s    r{   atanhzTritonOverrides.atanh^  rg  r}   c                    d|  d| dS )Nzlibdevice.copysign(r   r   r   r  s     r{   copysignzTritonOverrides.copysignc  s     %QCr!A..r}   c                    d|  dS )Nzlibdevice.erfc(r   r   rT  s    r{   erfczTritonOverrides.erfch  rd  r}   c                    d|  dS )Nzlibdevice.erfinv(r   r   rT  s    r{   erfinvzTritonOverrides.erfinvm  r  r}   c                    d|  d| dS )Nzlibdevice.hypot(r   r   r   r  s     r{   hypotzTritonOverrides.hypotr  r  r}   c                    d|  dS )Nzlibdevice.log10(r   r   rT  s    r{   log10zTritonOverrides.log10w  rg  r}   c                    d|  dS )Nzlibdevice.log2(r   r   rT  s    r{   log2zTritonOverrides.log2|  rd  r}   c                    d|  d| dS )Nzlibdevice.nextafter(r   r   r   r  s     r{   	nextafterzTritonOverrides.nextafter  s     &aS1#Q//r}   c                    |  d| S Nr   r   ru  s     r{   logical_andzTritonOverrides.logical_and      Cs|r}   c                    |  dS )Nz == 0r   r  s    r{   logical_notzTritonOverrides.logical_not  s    E{r}   c                    |  d| S Nz | r   ru  s     r{   
logical_orzTritonOverrides.logical_or  r  r}   c                    d|  d| dS )Nr   ^ r   r   ru  s     r{   logical_xorzTritonOverrides.logical_xor  s    1#S1~r}   c                    |  d| S r  r   ru  s     r{   bitwise_andzTritonOverrides.bitwise_and  r  r}   c                    d|  S )N~r   r  s    r{   bitwise_notzTritonOverrides.bitwise_not  s    1#wr}   c                    |  d| S r  r   ru  s     r{   
bitwise_orzTritonOverrides.bitwise_or  r  r}   c                    |  d| S )Nr  r   ru  s     r{   bitwise_xorzTritonOverrides.bitwise_xor  r  r}   c                    |  d| S )Nz << r   ru  s     r{   bitwise_left_shiftz"TritonOverrides.bitwise_left_shift      D}r}   c                    |  d| S )Nz >> r   ru  s     r{   bitwise_right_shiftz#TritonOverrides.bitwise_right_shift  r  r}   c                     d| d}d|  d| dS )Nr  ).to(tl.uint32)ztl.rand(r   r   r   seedr   s     r{   randzTritonOverrides.rand  s%    VHO,$r&++r}   c                     d| d}d|  d| dS )Nr  r  z	tl.randn(r   r   r   r  s     r{   randnzTritonOverrides.randn  s%    VHO,4&6(!,,r}   c           	     ,    d| d}d|  d| d| d| d	S )Nr  r  ztriton_helpers.randint64(r   r   r   )r  r   lowhighs       r{   	randint64zTritonOverrides.randint64  s1    VHO,*4&6("SED6KKr}   c                    t        d      )Nz.ops.load_seed not implemented outside a kernelr  )rC  r   s     r{   	load_seedzTritonOverrides.load_seed  s    !"RSSr}   c                    d|  dS )Nzlibdevice.rsqrt(r   r   rT  s    r{   rsqrtzTritonOverrides.rsqrt  rg  r}   c                    d|  dS )Nzlibdevice.log1p(r   r   rT  s    r{   log1pzTritonOverrides.log1p  rg  r}   c                    d|  dS )Nzlibdevice.tan(r   r   rT  s    r{   tanzTritonOverrides.tan  r  r}   c                    d|  dS )Nzlibdevice.tanh(r   r   rT  s    r{   tanhzTritonOverrides.tanh  rd  r}   c                    d|  dS )Nztl.sigmoid(r   r   rT  s    r{   sigmoidzTritonOverrides.sigmoid  s     QCq!!r}   c                    d|  d|  d|  dS )Nz(libdevice.signbit(z) != 0) if (z).dtype is tl.float32 else z < 0r   rT  s    r{   signbitzTritonOverrides.signbit  s#     "!L3NqcQUV	
r}   c                    d|  d| dS )Nzlibdevice.fmod(r   r   r   ru  s     r{   fmodzTritonOverrides.fmod  s     !2aS**r}   c                    d|  d| dS )Nr  r   r   r   ru  s     r{   powzTritonOverrides.pow  s      s"QCq))r}   c                    d|  dS )Nztl_math.log(r   r   rT  s    r{   logzTritonOverrides.log  rV  r}   F)ry   c                    d|  dS )Nzlibdevice.isinf().to(tl.int1)r   rT  s    r{   isinfzTritonOverrides.isinf       "!M22r}   c                    d|  dS )Nzlibdevice.isnan(r  r   rT  s    r{   isnanzTritonOverrides.isnan  r  r}   c                    d|  dS )Nzlibdevice.nearbyint(r   r   rT  s    r{   roundzTritonOverrides.round  s     &aS**r}   c                    d|  dS )Nr  r   r   rT  s    r{   floorzTritonOverrides.floor  rg  r}   c                H    |  d| }|  d| }d|  d| d| d| d| d| d	S )
Nr  r  z
tl.where((z
 < 0) != (z < 0), tl.where(z != 0, z - 1, ), r   r   )r  r  r  rems       r{   floordivzTritonOverrides.floordiv  sV    
 D}3qclA3j+;C5vVTXSYY\]a\bbcddr}   c                f   t        j                  dt        j                        }t        j                  t        j
                  ||       t        j                        }t        j                  t        j
                  | |      t        j                        }t        j                  ||      }| d|  dS )Nr   r%  .dtype))ro  rR  rt   r  rD  ltr  sub)r  zleftrightr  s        r{   signzTritonOverrides.sign  su    LLEKK(||SVVAq\EJJ7cffQlUZZ8ggdE"d1#W%%r}   c                    d|  dS )Nrt  r   r   rT  s    r{   trunczTritonOverrides.trunc  rg  r}   c                    |  d| S )Nr  r   ru  s     r{   truncdivzTritonOverrides.truncdiv  s     D}r}   c                    d|  dS )Nr  r   r   rT  s    r{   ceilzTritonOverrides.ceil   rd  r}   )NT)r  r  r:  zOptional[torch.dtype])r  r  r:  r  )Rrs   r   r   r   mathr  er`  rb  rD  rG  rO  r   rR  r3  rU  rX  r\  ra  rc  rf  ri  rs  rv  rp  rz  rt   ru   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r
  r  r  r   r}   r{   r5  r5  T  sw   (tyy H ,0	6&6& )6& 6&p    D D" < < #  #     '  ' &  & '  ' &  &  " 3 3 3 3 * * "&emmTPQK K #  # #  # U U Q Q (  ( %  % &  & &  & &  & '  ' &  & '  ' ,  , &  & '  ' /  / &  & (  ( ,  , '  ' &  & 0  0                     , , - - L L T T '  ' '  ' %  % &  & "  " 
 
 +  + *  * #  # /3 0 3 /3 0 3 +  + '  ' e e & & '  '  
 &  &r}   r5  r   c                       e Zd ZdZ fdZeej                  d               Zed        Z	ed        Z
ed        Zed        Zed        Z xZS )	TritonKernelOverridesa   Map element-wise ops to Triton within a TritonKernel

    Unlike TritonOverrides, these assume the code is going to be inserted into
    the body of the main triton kernel and so it may use indexing and mask
    variables which are assumed to already be defined in the current scope.
    c                D    t        |   |i | | j                          y rr   )r	  r
  _setup_libdevice_routing)r   rE  r  r  s      r{   r
  zTritonKernelOverrides.__init__1  s#    $)&) 	%%'r}   c                   ddl m t        j                  j                  j
                  D ]  }t        | |      sJ t        | |      }fd}|dk(  rDt        d      sJ t        j                  |||      }||_
        t        | |t        |             kd }t        j                  |||      }||_
        t        | |t        |              y)z<Set up routing to libdevice implementations for fp64 inputs.r   )OpDecompositionsc                    | j                   t        j                  k7  r ||       S  t        |      |       j                  S rr   )r  rt   rv   r)  r  )r  _original_impl_fn_namer  s      r{   decomposition_routerzLTritonKernelOverrides._setup_libdevice_routing.<locals>.decomposition_routerC  s9    77emm+)!,,>7#3X>qAGGGr}   r  )r  r  c                ^    | j                   t        j                  k(  r	d| d|  dS  ||       S )Nz
libdevice.r  r   )r  rt   rv   )r  r  r  s      r{   dtype_routerzDTritonKernelOverrides._setup_libdevice_routing.<locals>.dtype_routerR  s2    77emm+'z1#Q77)!,,r}   N)torch._inductor.codegen.commonr  rt   	_inductorutilsop_requires_libdevice_fp64r   r)  	functoolspartialrs   setattrrb  )rw   fn_nameoriginal_implr  fnr  r  s         @r{   r  z.TritonKernelOverrides._setup_libdevice_routing8  s    
 	D,,GG 	4G3(((#C1MH )#/;;;&&(QX &Wl2&67- ""]WB "BKC,r"23;	4r}   c                r    t         j                  j                         }dg|z  }| j                  |||      S )NrF   rQ  )rD   r  triton_tensor_ndimrO  )rw   r  r  ndimr   s        r{   rR  zTritonKernelOverrides.constant^  s9    
 xx**,d
##E5#>>r}   c                0   t         j                  j                  |d      }t        |t              sJ t         j                  j                         }|t        j                  t        j                  fvr|n|}t        j                  j                  }	 dt        j                  _        t         j                  j                  j                  t         j                  j                  |j                  t!        |      |      }|t        j                  _        |t        j                  t        j                  fvr^t         j                  j                  j                  t         j                  j                  | j#                  ||      t%        |            }n|}|j&                  D ]l  }t)        |t*        j,                        st        j.                  |t         j                  j                  j0                  |j2                     j4                        }n ||k7  rTt         j                  j                  j                  t         j                  j                  | j#                  ||      |      }|j6                  |_        |S # |t        j                  _        w xY w)NF	block_ptrr  r  r  )rD   r  indexingrh  r   get_index_dtype_as_torch_dtypert   r  int64r   test_configsruntime_triton_dtype_assertcsegeneratecomputer   r5   rD  r@   rJ  r   r   r   promote_typesvarname_maprC  r  r   )rw   r  r  r.  rx  origr  	index_vars           r{   r  z TritonKernelOverrides.index_exprg  s   88$$TU$;(O444 hh==?u{{EKK&@@k "">>		C>CF;((,,''  "",T2	 ( C ?CF;ekk22((,,''  S%()%0 ( C  E!.. 	!)TXX6!//qxx||77	GMME #hhll++HH$$LLk2% ,  !**
9 ?CF;s   A-I> >Jc           
        | ot         j                  j                  Ut        j                  j
                  j                  t        j                  j                  |  dt         j                        } |j                  j                  d      }|sJ d       d}|D ]>  }|j                  D ]-  }|j                  dk7  st        |j                  d         s+d	} > @ |rd n|}t        j                  j                  | |
      5 } |       }	d d d        |r	j                  j                   rt        |      }t        j                  j
                  j                  t        j                  j                  d|	 dt#        |       d|	 dt%        j&                  |      |	j(                        }t+        j,                  |	|      }
n	}
|
j.                  j1                         |
S # 1 sw Y   xY w)N.to(tl.int1)r  output)opz)graph for body does not contain an outputFloadrF   Tr  rI  z.shape, r   r  r-  )rt   r~  r  rD   r  r3  r4  r5  r~   r  
find_nodesrE  targetr`   
mask_loadsr  is_boolrW   r   wrapr  ro  rz  r   discard)r   r  r  nodes
need_wherenoder  r  new_maskr7  r  s              r{   r  zTritonKernelOverrides.masked  s    1 1 =88<<((  &%jj ) D 

%%%2AAAu
  	Dyy ::'+CCHHQK+P!%J	 #XX  U 3 	xVF	 }}$$UHHLL))  6((=+?*@6('R"''.ll	 * E ))Hfe4CCh'
'	 	s   G""G+c                    t         j                  j                  j                  |       }d| dt         j                  j                  j	                  d|       dS )Ntl.load( + load_seed_offsetr   )rD   r  rE  inputseed_offset)rC  r   r  s      r{   r  zTritonKernelOverrides.load_seed  sI    hhmm!!$'se3qxx}}889KVTUUVW	
r}   c                   d|  d}t         j                  j                  j                  |      x}r|S t         j                  j                  j	                  | j
                        }t         j                  j                  j	                  t        j                        }t         j                  j                  j                  | d| d|  d       t         j                  j                  j                  |||f       ||fS )Nzfrexp(r   r  r   z = triton_helpers.frexp()rD   r  r3  try_getnewvarr  rt   r  r5  r   put)r  	cache_keycse_valmantissaexponents        r{   frexpzTritonKernelOverrides.frexp  s    QCqM	hhll**95575N88<<&&QWW&588<<&&U[[&9	""j8*$<QCqA	
 	
Xx$89(##r}   )rs   r   r   r   r
  r   r!  cacher  rR  r  rb  r  r  rX  r  r  s   @r{   r  r  )  s    ( __"4  "4H ? ? 0 0d * *X 
 
 $ $r}   r  c                  H    e Zd ZU dZded<   ded<   ddZdddd	Zd
 Zd Zy)HelperFunctionsz#An ordered set of helper functions.zdict[str, str]_templates_seen	list[str]finalized_helpersc                     i | _         g | _        y rr   )r\  r^  r   s    r{   r
  zHelperFunctions.__init__  s    !!#r}   _triton_helper_fn	base_namec                   | j                   j                  |      }||S | t        | j                         }|| j                   |<   | j                  j	                  |j                  |             |S )a9  This accepts a function definition with the function name
        left as a format specifier e.g.

            @triton.jit
            def {name}(arg0, arg1):
                return arg0 + arg1

        We add the templated code to the function set and return the name
        assigned to that function.

        )rC  )r\  getr  r^  ri  rF  )r   template_coderb  existing_namerC  s        r{   addzHelperFunctions.add  sw     ,,00?$  S!7!789:.2]+%%m&:&:&:&EFr}   c                ,    t        | j                        S rr   )iterr^  r   s    r{   __iter__zHelperFunctions.__iter__  s    D**++r}   c                     | j                   |   S rr   )r^  )r   rO  s     r{   __getitem__zHelperFunctions.__getitem__   s    %%c**r}   Nr   r   )re  r   r   r   )	rs   r   r   r   r   r
  rg  rj  rl  r   r}   r{   r[  r[    s+    -##  $ 4G ,,+r}   r[  c                      e Zd ZU dZ ej
                  e      Zded<    ej
                  e      Z	ded<    ej
                  e      Z
ded<    ej
                  e      Zded<   d
dZy	)r   zM
    Class representing ND block dimensions, for block pointer analysis.
    )default_factoryr  r   r   r   r   c                    t        |       }t        d | |fD              \  }} |di |D ci c]  }|||   ||   z    c}S c c}w )z0
        Concatenates block parameters.
        c              3  F   K   | ]  }t        j                  |        y wrr   )r  r   r  s     r{   r   z*BlockParameters.__add__.<locals>.<genexpr>  s     Bq[''*Bs   !r   )r  r  )r   r  rw   r  r  r4  s         r{   __add__zBlockParameters.__add__  sR     4jBT5MBB19a8sc1S6AcF?*8998s   AN)r  r   r   r   )rs   r   r   r   r  fieldr*  r   r   r   r   r   rr  r   r}   r{   r   r     sn     0k//EEE$5K$5$5d$KK!K 1 1 1$ GGG 1 1 1$ GGG:r}   r   c                  *    e Zd ZdZd ZddZd Zd Zy)"CooperativeReductionWorkspaceCachez
    The scratch space used for cooperative reductions can be reused
    after two reduction loops.  This keeps track of what can be reused.
    c                    || _         g | _        g | _        t        j                  t        j
                        | _        d| _        d| _        y r?  )	rE  current_loop
prior_loopcollectionsdefaultdictdequeready_for_reuse
loop_countstore_count)r   rE  s     r{   r
  z+CooperativeReductionWorkspaceCache.__init__  s@    	*66{7H7HIr}   c                    | j                   j                  |      }|r|j                         S | j                  j	                  |d      \  }}| j
                  j                  |||f       ||fS rW  )r|  rd  popleftrE  	workspacerw  ri  )r   nbytescachedws_name	ws_offsets        r{   allocatez+CooperativeReductionWorkspaceCache.allocate&  si    %%))&1>>##!YY00?  &'9!=>##r}   c                    | j                   D ]&  \  }}}| j                  |   j                  ||f       ( | j                  | _         g | _        | xj                  dz  c_        y NrF   )rx  r|  ri  rw  r}  )r   r  r  r  s       r{   on_loop_endz.CooperativeReductionWorkspaceCache.on_loop_end.  s_    *.// 	F&FGY  (//)0DE	F++1r}   c                H    | j                   }| xj                   dz  c_         |S r  )r~  )r   priors     r{   increment_store_countz8CooperativeReductionWorkspaceCache.increment_store_count6  s#      Ar}   N)r  r   )rs   r   r   r   r
  r  r  r  r   r}   r{   ru  ru    s    
$r}   ru  c                  $    e Zd ZU ded<   d Zd Zy)FixedTritonConfigzdict[str, int]r   c                     | j                   |   S rr   r   r   r  s     r{   rl  zFixedTritonConfig.__getitem__@  s    {{4  r}   c                    || j                   v S rr   r  r  s     r{   __contains__zFixedTritonConfig.__contains__C  s    t{{""r}   N)rs   r   r   r   rl  r  r   r}   r{   r  r  <  s    !#r}   r  c                      e Zd ZdZddZy)	TritonCSEz
    Subclasses CSE to apply the current load mask to the cache key to avoid CSEing
    variables across separate masked blocks.
    c                Z    t         j                  j                  x}r||j                  fS |S rr   )rD   r  
_load_maskrC  )r   rT  r   s      r{   augment_keyzTritonCSE.augment_keyM  s,    88&&&4&tyy))r}   N)rT  r   r   zUnion[str, tuple[str, str]])rs   r   r   r   r  r   r}   r{   r  r  G  s    
r}   r  c                  *    e Zd ZU eZded<   eZded<   dZ	 	 	 dI	 	 	 	 	 dJ fdZ	dKdZ
dLd	Zd
 Zd Zd Zd ZdLdZd ZedMd       Zddddd	 dNdZ	 dO	 	 	 	 	 	 	 dPdZdOdZ	 	 	 	 	 	 	 	 dQdZd ZdRdZ	 dS	 	 	 	 	 	 	 	 	 dTdZd Z	 	 dU	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dVdZdMdZdWdZ	 	 	 	 	 	 	 	 	 	 dXdZ	 	 dYdZ dYd Z!d! Z"d" Z#d# Z$d$ Z%d% Z&	 	 	 	 	 	 dZd&Z'd[d'Z(	 	 	 	 	 	 	 	 d\d(Z)	 	 	 	 	 	 	 	 	 	 d]d)Z*d* Z+d^d+Z,d, Z-d- Z.d. Z/e0d/        Z1dSd0Z2e0d1        Z3e0d2        Z4d3 Z5d_d4Z6d5 Z7dSd`d6Z8dad7Z9dbd8Z:dcd9Z;ddd:Z<	 	 	 	 	 	 ded;Z=ddd<Z>dfd=Z?dgd>Z@dhd?ZAdLd@ZBdidAZCeDd^dB       ZEdjdCZFdkdDZGeDdldE       ZHdmdFZIdjdGZJ	 	 	 	 	 	 dndHZK xZLS )oTritonKernelr[  helper_functionszCallable[[sympy.Expr], str]kexprTNc                   || _         || _        t        |   |fi | t	        | j
                  | j                        | _        t               | _	        t               | _
        t        t                  | _        || _        t        j                          | _        t%        t&        t&        f          | _        t+               | _        t/        j0                  t$              | _        t/        j4                         | _        t        t8                  | _        d | _        | j>                  r| jA                  | jB                         | jD                  r| jG                          | jI                          | jD                  r| jK                          y y rr   )&optimize_maskfixed_configr	  r
  r  newvar_prefixsuffixr3  rN   post_loop_combinepost_loop_storer   r   outside_loop_varsr@  r&  countblock_ptr_iddictr   block_ptr_to_bufferr[  r  ry  rz  pointer_advancementsCounter_load_countsr)   autotune_hintstriton_metar&  codegen_reduction_numelsr  cooperative_reductioninit_cooperative_reductioncodegen_range_treeinit_cooperative_reduction_mask)r   tilingr@  r  r  r  r  s         r{   r
  zTritonKernel.__init__Z  s3    $1(*6*T//=1?1A/=/?!+C!2#6 %OO-#'S>#3  / 1##D) 	! 7B6I6I6K )6859  ))$))4%%++-!%%002 &r}   c                    t        |      S rr   )r>   )r   r  s     r{   dtype_to_strzTritonKernel.dtype_to_str  s    5!!r}   c                p    | j                   xr) t        j                  j                  | j                        S rr   )r&  rD   choices should_use_cooperative_reductionr(  r   s    r{   r  z-TritonKernel.should_use_cooperative_reduction  s-    $$ 
)S)SMM*
 	
r}   c                     j                   sJ  j                  D ]$  }|j                  |xj                  dz  c_        &  j                  d   } j                  rt        | j                  d         } j                  j                  |       _        t         j                         _
         j                  j                  d       t         fd j                  D              r j                  j                  d       yy)z/One time setup code for cooperative reductions.NrF   r  r   a              RSPLIT_NEXT_POWER_OF_2: tl.constexpr = triton_helpers.constexpr_next_power_of_2(RSPLIT)
            RSPLIT_IS_POWER_OF_2: tl.constexpr = RSPLIT == RSPLIT_NEXT_POWER_OF_2
            HAS_RSPLIT: tl.constexpr = RSPLIT > 1
            rsplit_id = tl.program_id(0)
            num_rblocks = (rnumel + RBLOCK - 1) // RBLOCK
            rsplit_chunk = (num_rblocks + RSPLIT - 1) // RSPLIT * RBLOCK
            rsplit_start = rsplit_chunk * rsplit_id
            rsplit_end = rsplit_chunk * (rsplit_id + 1)
            c              3  Z   K   | ]"  }|j                   rj                  |        $ y wrr   )r)  _has_constant_mask)r   r   r   s     r{   r   z:TritonKernel.init_cooperative_reduction.<locals>.<genexpr>  s0      
   ''--
s   (+z>rsplit_end = tl.where(rsplit_end < rnumel, rsplit_end, rnumel))r  r.  grid_dimr'  r  r   rE  
semaphoressemaphores_nameru  %cooperative_reduction_workspace_cacher  r   r   r   )r   r   	sem_counts   `  r{   r  z'TritonKernel.init_cooperative_reduction  s    )))) $$ 	#D}}("	# KK$		4+<+<X+FGI#yy33I>5WII6
2 					
  
((
 

 IIP
r}   c                   d}| j                   s| d}| j                  j                  d|        | j                         r| j                  j	                  d       y | j                   rJ | j                  j                  d       y )Nz$tl.arange(0, RSPLIT_NEXT_POWER_OF_2)z	[None, :]zrsplit_arange = z                if RSPLIT_IS_POWER_OF_2:
                    rsplit_mask: tl.constexpr = None
                else:
                    rsplit_mask = rsplit_arange < RSPLIT
                zSrsplit_mask = xmask if RSPLIT_IS_POWER_OF_2 else ((rsplit_arange < RSPLIT) & xmask))r"  r  r   _has_constant_xmaskr   )r   rsplit_aranges     r{   r  z,TritonKernel.init_cooperative_reduction_mask  s{    >}},oY7M		.}o>?##%II }}$$IIer}   c                2   | j                   D ]q  }|j                  s| j                  || j                         ,| j                  s9| j                  j                  |j                   d| j                  |              s | j                  rt        d | j                   D              rS| j                  ddd      }| j                  |      }| j                  j                  d| j                  |              y | j                  | j                         y y )Nzbase = c              3  4   K   | ]  }|j                     y wrr   )is_loopr   r   s     r{   r   z2TritonKernel.codegen_range_tree.<locals>.<genexpr>  s     =D4<<=s   baseTr   zrbase = )r.  r  iteration_ranges_codegen_headerr  r&  r   r#  iteration_ranges_ranges_coder   _get_reduction_symbols_flatten_reduction_indicesr   r  codegen_reduction_indices)r   r   rn_basesrbases       r{   r  zTritonKernel.codegen_range_tree  s    $$ 		D<<44T499E&& 		##{{m74+L+LT+R*ST		   =D,<,<==66Dd 7  77A		  8D,=,=e,D+E!FG ..tyy9 !r}   c                     y)z
        Indicate whether we need provide numel as arguments for the generated
        kernel calls in the benchmark.

        Should be true for pointwise/reduction kernels but false for triton
        matmul kernels.
        Tr   r   s    r{   need_numel_argszTritonKernel.need_numel_args  s     r}   c                    | j                   xr4 t        j                  j                  | j                  | j
                        S rr   )r&  rD   r  should_use_persistent_reductionr(  r  r   s    r{   r  z,TritonKernel.should_use_persistent_reduction  s5    $$ 
)R)RMM455*
 	
r}   c                    | j                   rlt        | j                        | j                  dz   k(  rG| j                  r| j                  d   dk(  S t
        j                  j                  | j                        S y)NrF   r   F)	persistent_reductionr  r'  r%  r  rD   r  want_no_x_dimr(  r   s    r{   r  zTritonKernel.want_no_x_dim  sb    %%DKK D$;$;a$??  ((2a7799**4==99r}   c                     y)Nztl.device_assertr   r   s    r{   assert_functionzTritonKernel.assert_function  s    !r}   F)
copy_shapedense_indexingoverride_maskr,  c          
     t     j                        j                  }d}t               t        |t	        j
                  d            D ]j  }t        |t        j                        sJ |xs t        |t        j                        }|rAt        |t        j                        r? j                  j                  |j                      }	j#                  |	j$                         t        |t        j&                  t        j(                  t        j*                  t        j,                  t        j.                  t        j0                  f      rt        j2                  D 
cg c]  }
t        ||
      r	t4        |
    }}
t7        |      dk(  sJ d|j                           j9                  |d    d       m t:        j<                  j>                  xs |xs  j@                  duxr dk7  }d	}d}t               } jC                         D ]@  }|jE                  |jF                        rd	}nd}|j9                  |jH                   d       B |r jJ                  rt:        j<                  jL                  rx|sv j@                  sjt7        |z
        dk(  rY jO                        sH|rF jP                  d
k(  r7	 	 	 	 	 	 dd	 	 	 	 	 	 d fd	 	 	 	 	 	 dfdd fd} |       }||S d} jS                        }t        t        jT                        r|r| dn jW                         }d| d| d} jX                  r j[                         st        dg      n
t                j@                  rj9                   j@                         t]        |||      S |r%|s#|r| dn jW                         }d| d| d}|n|s|rd| d| d}||rt        |g       j@                  rj9                   j@                          j_                         t]        |||      S c c}
w )zO
        Compute the index and mask to pass to tl.load() or tl.store()
        FrC  r4  rF   zAmbiguous type: r   r   NTtl.int32c                    t        j                  | |j                               }|yt        |j                  gt
        j                  |      g|gt
        j                  |      g      S )z
                Matches expressions of the form:
                    idx = s * xindex

                This implies stride (s,), and shape (XBLOCK,).
                Nr   r   r   r   )rG   match_affine_block_exprsymbolr   numelr   r   r   )r   
range_treer0  s      r{   match_affine_blockz1TritonKernel.indexing.<locals>.match_affine_blockH  sl     -DD:,,. >&%++,!.!=!=j!I J#H*;;JGH	 r}   c                   |j                         }t        j                  dt        j                  t        j
                  |g            \  }}t        dt        j                        | j                  t        ||            | j                  t        |||            z         }t        j                  | ||j                  |      }|y|\  }}}	t        j                  |      }
t         j"                  j$                  j'                  |j(                        t+        fd|
D              ryt,        j/                  |      }t1        ||
d         gt3        |
dd |dd       D cg c]%  \  }}t        j4                  t1        ||      |      ' c}}z   }|	D cg c]#  }t7        ||t,        j9                  |      i      % }}t;        ||||	      S c c}}w c c}w )
a  
                Matches higher-dimensional blocks coming from FloorDiv and ModularIndexing.

                Example expression to match:
                   sN * ((rindex//(d1 * ... * d(N-1))))
                       + s1 * ModularIndexing(rindex, 1, d1)
                       + ...
                       + s(N-1) * ModularIndexing(rindex, d1 * ... * d(N-2), d(N-1))

                This iterates over a block of shape (dN, ..., d1) and stride
                (sN, ..., s1). (d1,...,d(N-1)) and (s1,...,sN) are
                wildcards that we match.

                Note that dN does not appear in the expression, but we solve for it
                using range tree numels and the other dims.
                zdenom modulo)exclude)rw   r   Nc              3  l   K   | ]+  }j                  |       xr j                  |        - y wrr   )rK  statically_known_power_of_2)r   r  	max_blockr   s     r{   r   zETritonKernel.indexing.<locals>.match_mod_div_block.<locals>.<genexpr>  sH        !==eYOO H$@@GGHs   14r   rF   r  )r  r   symbolsr!  r"  Wildr?  r  range_tree_nodesr  r   r   rG   match_mod_div_block_exprr  get_slice_numelsrD   r  r   r  r#  r   r   r   r   r   Minr=   r   r   )r   r  r9  denommodulonum_dimsmatch_resultr3  r   block_index_exprsslice_numelslinear_block_sizer  r  r   r  r   r  r   r   s                    @@r{   match_mod_div_blockz2TritonKernel.indexing.<locals>.match_mod_div_block^  s   ( '--/	 !&"!))%**ykJ!v --.HY$>?++oi&OPQ	  3KK9j&6&6   ' !	%2CCDI 77++ NN:+<+<=	  ". 
   %2$@$@$L!-|A?1 '*,qr*:DH&E"s IIg&7?E1 !2	3  y-*H*H*TU3 3 ' +#)	 3s   ,*G (Gc                6    fD ]  } || |      }||c S  y)ze
                Match a block indexing subexpression involving a single range tree.
                Nr   )r  r  
match_funcmatchr  r  s       r{   match_block_pointer_subexprz:TritonKernel.indexing.<locals>.match_block_pointer_subexpr  s:     ''# %J 'tZ8E($% r}   c            	     |   t        j                  j                         D  ci c]  \  } }| |j                   c}}       }j	                         }|D cg c]&  }t        j                  ||j                               ( }}t        d |D              }t               }t        ||      D ]@  \  }}t        |j                  |j                              dkD  r y  ||      }	|	 y ||	z  }B |t        |      z
  }
j                         t         j#                  ||
|j$                        S c c}} w c c}w )Nc              3  <   K   | ]  }|j                           y wrr   )r  r  s     r{   r   zETritonKernel.indexing.<locals>.match_block_pointer.<locals>.<genexpr>  s     *QT4;;=*Q   rF   )r   r   r.  r   r/  )r=   r  r!  r  active_range_treesrG   get_subexpr_involving_symbolr  r   r   r   r  intersectionrJ  sumfilter_masksr   r8  r  )vrL  index_relative_to_xyr_indexr.  r   index_subexprsrange_symbolsblock_paramssubexprr   r   r   r   r  r   s              r{   match_block_pointerz2TritonKernel.indexing.<locals>.match_block_pointer  sO   .8$2G2G2M2M2OP$!QAqvvIP/+ #557 !,	"  (DD3T[[]" " !+*Q[*Q Q.0%(n%E 
+MD' =55g6J6JKLqP# 9$GF~# F*L
+ 5s>7JJ !!),&--'$* +'"&.. .  E Q"s   D3+D9z.shaperI  r   z, tl.int32)xmaskr   r   .shape))r   r   r  rZ   r   Optional[BlockParameters])r  r   r  rZ   r   r  )r   zOptional[BlockPtrOptions])0prepare_indexingrJ  r   r   operator
attrgetterrh  r   r   r   r   r   r   r   r3  r7  rC  r  r   UNBACKED_INTSIZEPRECOMPUTED_SIZEINDEXFLOATUNBACKED_FLOATr   r   r  rg  r   r   r  r  r   r  var_listr#  allow_block_ptruse_block_ptris_indirect_indexingrx  r  r@  dense_size_strr  r  r   r  )r   r   r  r  r  r,  
index_varsr   r  cse_varr   prefix_matches
need_dense
have_densehave_loop_varsdense_mask_varsr   r  optionsr   r   r   r  r  r  s   ``                   @@@@r{   r.  zTritonKernel.indexing  s)    %%e,''

%/\	*(*=*=f*EF 	:Cc5<<000# ~]22(J TXX.((..sxx8  !2!23%%II))JJJJ''
 
 !. 9 9"%c40 t$" "
 >*a/N3CCHH:1NN/!2 3489?	:D MM(( ++d* qj	 	 
+5<++- 	6D&&t}}5!%"
4;;-t 45	6 $$++!OOI/0A5--e4  J.!/B*,`!`/B`*`D .A* * *Z *+G"
%%e,	eU]]+2<J<v.$BUBUBWJ":,b;GI  )A)A)C&y1	&L	doo."9iZQVWWj2<J<v.$BUBUBWJ*9+R
|1EI'IJ*9+R
|7KI'I"M?3I??MM$//*)$y)ZUSSy"s   $P5c                f   |j                         }|sd}n|r|dk(  sJ d|d}nd|}| j                  r| j                  d   j                  r|j	                         rdt        | j                         }| j                  j                  t        || d|j                  |d	                    || j                  |<   t        j                  D ]E  }|j                  |      }t        d
 |D              r'| j                   |   }	||	vsJ d       ||	|<   G ||fS |j                  |      }||fS )Nr   , other=0.0, boundary_check=z, padding_option='zero'r  r,   = F)r;  c              3     K   | ]A  }t         j                  j                  j                  |t	        j
                  d              C ywr   N)rD   r  r   r   r   r@  )r   r   s     r{   r   z1TritonKernel.codegen_block_ptr.<locals>.<genexpr>@  s9       GG$$<<VU]]STEUVs   AA	z@duplicate advancement for pointer '{block_ptr}' at type '{symt}')rQ  r&  r.  r  r   nextr  r  r   rM   rF  r  r   r   rU  r  r  )
r   rC  r  r.  r  checkr,  r   advance_offsetsadvancementss
             r{   codegen_block_ptrzTritonKernel.codegen_block_ptr   sp    '')EM)))'y0GHE'y1E!!  $,,##%#D):):$;#<=IIIYKs8??3?+N*OP 37D$$Y/ &55 :"*":":4"@  "1  #88> 4 V4 +:Y':" % !,I%r}   c                   d| d|j                    d}t        t        |j                   |j                              D ]B  \  }\  }}t        j
                  j                  j                  ||      s4d|j                  |<   D |j                  ||j                   |j                  d      }| dt        t        j
                  j                  |             d}d| d| | dS )Nr   r   r   Fr%  	tl.store()r   	enumerater   r   rD   r  r   r   r   r  r   r  	get_dtype)	r   rC  r.  r,  r  r  rO  r  broadcast_dims	            r{   codegen_block_ptr_store_linez)TritonKernel.codegen_block_ptr_store_lineO  s     #5'H,@,@+AC *3$$h&>&>?*
 	8%C%#} ww77]K27**3/		8 668'')=)=u

 '/0A0A$0GHIK9+RwugQ77r}   c                   |s|sy t        |t        j                        sJ | j                  |d      }t        |t              sJ |j
                  }|j                         r|j                  nd }|rt        | j                  |            nd }| j                  ||rdnd ||      }	| j                  |      }
| j                  j                  |
|	dt        j                         y )NFr+  0)
assignmentr  )rh  r   Exprr.  r   r   r   r   texprrename_indexingindirect_assertget_load_bufferr3  r4  rt   r  )r   r  rp  lowerr   r.  r   r   size_strlinebuffers              r{   check_boundszTritonKernel.check_boundsf  s     $

+++===7(O444&&	(0(9(9(;8$$8=5--d344 ##esx
 %%h/&$5Lr}   c                    |j                         s|j                         r| j                  S | j                  r5| j                  d   j
                  r|j                         s| j                  S | j                  S )Nr  )	r   r   r5  r&  r.  r  r   r  loads)r   r.  s     r{   r=  zTritonKernel.get_load_buffer  sb      "h&:&:&<<<!!  $,,'') 99::r}   c           
     	   | j                   j                        }| j                  xx   dz  cc<   t        }| j	                  |      |}| j                  |d      }|j                         |j                         }t        d | j                  |      j                         D              }| j                  |      rd}	nX|sd}	nS| j                  rE| j                  d   j                  r,fd}
   d}	t        j                   t"        d	|
      }nd
}	|sr8|j%                         r(| j&                  rdt)        | j&                         }nd}nd
}	 d}t*        j,                  j.                  r"| j0                  j3                         }|   dkD  }	 | j                  |       xr | j                   xr | xr |}d
}|rd}d }t4        j6                  j9                        }t;              r7|}|t<        j>                  t<        j@                  fv rWt<        jB                  }nEtE        |tF              rL| jI                  |||      \  }}d| | |	 | d}|jK                  ||jL                  |jN                  d      }nVtE        |tP        jR                        rd| d| d}|jT                  }n&d| d|jV                   d|jX                   |	 | | d
}|t<        j>                  t<        j@                  fv r/t*        j,                  jZ                  r|dz  }t<        jB                  }|t<        j\                  k(  r/t<        j^                  j`                  |dz  }t<        j\                  }| jc                  |      }| jd                  jg                  | ||      |      }|jh                  dkD  rxx   dz  cc<   tE        |tj              sJ |jl                  |_6        |rd| d| d}| jd                  jg                  |||      }|jl                  r~|jn                  rd}n|t<        j\                  k(  rd}nd}| j&                  rt)        | j&                        n|}d|jX                   d| d| d}| jd                  jg                  |||      }| j                  r|jq                         ss| jr                  ju                  |       |S )NrF   Tr+  c              3  &   K   | ]	  }|d k(    ywr   r   )r   r  s     r{   r   z$TritonKernel.load.<locals>.<genexpr>  s      
AF
   z, eviction_policy='evict_last'r  c                          kD  rsryy)N
evict_lastevict_firstr   )expected_countr   indirect_indexingload_countsrC  s   r{   decide_laterz'TritonKernel.load.<locals>.decide_later  s    t$~5"3'$r}   z, eviction_policy='<EP>'z<EP>r   z, other=r&  z, cache_modifier='.cg'rK  r   rB  r  r  r  r;  r  r   r   z0.0Truer7  r  );rE  rN  r  r   r  r.  r   r   r   get_strides_of_loadr(  is_broadcastedr&  r.  r  r!  r"  r4   r   _load_otherrW   r   r   skip_l1_cacher(  buffer_read_countsrD   r  r3  r`   rt   r  r  ru   rh  r   r/  r  r   r   r   r@  r   r   r   r  r~   r~  r  r=  r3  r4  	use_countr  r   r  r   r  rg  )r   rC  r   r  	make_lineoriginal_indexr.  r   is_coalescedeprN  r  has_read_depsrT  rS  cachemodappend_broadcastr  r@  r,  load_buffer
result_varzero	other_valrK  r   rL  rM  s    `                      @@@@r{   r>  zTritonKernel.load  s   iiood#''DQCK	 55e<==$=7((*
**,  
 44^DKKM
 
 ~.1B1B""t'7'7';'C'C% % ).N+B!))*:FLQIB:8+<+<+>"=1A1A#B"CD%E	 ==&&!%!A!A!C.t4q8M	 ##N33 )))!! 	 	 /H!!$'#D)D 77 (O4#'#9#9$Xu#U 	5!)UGB4zC==(..0D0Dd NEMM:!#d>*:"=#+#6#6 !#d8+=+=*>c(BSBSATUWTXY^X_`h_iijk %--88MM88))

"u}}'8'8'@ &

**84XX&&{IdO5&Q
!#"*&7888'11
%j\4D3EQGD**;E*JJ!!** Dejj(!DD7;7G7GM$"2"23T  #8#4#4"5R
|2i[PQR!XX..{D.N
$$X-?-?-A*""&&z2r}   c           	        | j                   j                  |      }|}| j                  |d|d u       }|| j                   j                  v }| j	                  |      }	|r'|	r%| j
                  j                  t        |d             t        |t              r,| j                  |||      \  }
}| j                  |||
||      }n]|$d| d|j                   d| d|j                   d	}n7|d	k(  r$d
| d|j                   d| d|j                   d	}nt        d|       t        j                          }| j"                  s7| j$                  r+|j'                  | j)                  || j
                               | j
                  j                  t        ||             | j"                  s| j*                  j-                  |       |j/                          y )NT)r  r,  ztl.debug_barrier()r1  rB  r  r   r   
atomic_addztl.atomic_add(z, sem='relaxed')zstore mode=)rE  r<  r.  inplace_buffersrQ  storesr   rM   rh  r   r/  r5  r   r   r  
contextlib	ExitStackr&  r  enter_contextguard_cooperative_storer  rg  close)r   rC  r   r  moder  rW  r.  
is_inplacerQ  r,  r  r@  
exit_stacks                 r{   storezTritonKernel.store	  s    iit$==ttt|=T TYY666
,,^<.KK!!,t5I"JKh0#55dCJIu44h	5%D \se4(:(:';3ugRHYHYGZZ[\D\!#C5X-?-?,@E7"XM^M^L__opD%D6&:;;))+
$$)C)C$$T%A%A$%TUl467$$""&&u-r}   c                    | j                   j                         }|j                  t        |d| d             |j	                         S )z
        For cooperative reductions only one thread block should write out the result.
        We rotate which thread block does each write for better parallelism
        zif rsplit_id == (z % RSPLIT):)r  r  r   rM   indent)r   rC  rA  rO  s       r{   rh  z$TritonKernel.guard_cooperative_store@	  sC    
 88NNPd.?uK,PQR}}r}   c                   | j                   j                  t        j                         | j                  j                  |d         }| j                  |d         }	| j                  |d         }
| j                  |d         }|r| j                  j                  |d         nd}|r| j                  |d         nd}|t        j                  k(  rd}n!|t        j                  k(  rd}nt        d      | j                  j                  | j                  d	| d
| d
|	 d
|
 d
| d
| d
| d
| d
| d
| d
| d|      }|S )z3
        See [Note: Inductor bucketize op]
        r   rF   r   r   r   r  ztl.int64z5Bucketize only supports indexing with int32 and int64z'triton_helpers.bucketize_binary_search(r   z, )r  )r  rg  r)   ONE_ELEMENT_PER_THREADrE  rN  r  rt   r  r0  r  r3  r4  r5  )r   r(  
boundariesboundary_indicesindexing_dtyper  sortersorter_indicesboundaries_ptrboundary_sizeboundaries_underlying_numelboundary_stride
sorter_ptrsorter_stridetriton_dtyper7  s                   r{   	bucketizezTritonKernel.bucketizeI	  s^   $ 	 C CDA7))*Q-8&*&7&7
1&F#++JqM:39TYY__VAY/v
8>))&)4FU[[(%Lu{{*%L%G  ""LL5fXRbr2M1NbQ`Paac nBgRl"]O2  ! # 
 r}   c                    | j                         }|dk(  rd| dS | j                  }dg||z
  z  dg|z  z   }| ddj                  |       dS )	NrF   z!triton_helpers.promote_to_tensor(r   re  r   rf  r   rg  )r(  r%  r   )r   r  ndimsnreducesizess        r{   reduction_resizezTritonKernel.reduction_resize|	  sj    '')A:6ugQ??)))VHw,>>$))E*+1--r}   c           
         | j                   dk(  r|S | j                         | j                   z
  }| j                         }|d| dgz   }t        | j                  j                  |t        |||      |            S )zC
        Reshape to RBLOCK, collapsing all reduction dims.
        rF   NRBLOCKr  )r%  r(  dense_size_listr   r3  r4  r  )r   rA  r  r  target_ndimr	  target_shapes          r{   reduction_collapse_dimsz$TritonKernel.reduction_collapse_dims	  s    
 ""a'L--/$2I2II,,.$\k2hZ?HHum\JRW  
 	
r}   c                   345678 d7d}t        j                  |      D cg c]  }|j                   }}t        j                  ||      }t	        d |D              rHt        j                  |t
        j                        }t        j                  t
        j                         j                  sJ t        d  j                  D              } j                  |       t        |      } j                  r|j                   j                          j                  d   j                  d   }	 j!                         4 j#                  4 fd|      } j%                          j&                  z
  5	 	 	 	 	 	 d85 fd6	 	 	 	 	 	 	 	 d96fd}
57 fd	}||f}| j(                  j*                  v r j(                  j*                  |   S t-        |      }t/        |      } j(                  j1                  |
      }t        d |D              |_        dj5                  |      33fd8 j6                  rt8        j:                  j=                  |      } j#                  t>        |      }d: 8fd}dk(  rn?tA        |tB              r&tE        ||      D cg c]  \  }} |||       }}}n	 |||      }dv r|tF        jH                  jK                         }tM         j(                  jO                   jP                  d|	 d d|
            }ddd   7 | jP                  |||       ||_        ndk(  r8 jR                  r jU                  ||8|      }np jW                  |      }n\dk(  rMtA        tX              sJ |\  }}}tC         fd j[                   jP                  |||5      D              }n
dk(  r j]                  |      }ntA        t^              sJ  j(                  jO                   jP                   6 jP                  tM        |      d       |j                  
      }n j(                  ja                  d| |
      }t8        j:                  jc                  |      } j#                  t>        |      }tA        |tB              s5 jd                  jg                  | d j!                          d| d| d       dv rd| d} jh                  jk                         } jd                  jg                  | d j!                          dt        jl                  |      jn                   d jq                  |       d       ddd   7 jP                  js                  d| d| d 7 d!| d| d| d|	 d"| d# 8| d$|       d%| d# 8| d$|       d%        | jt                  |||       n tw              r jU                  ||8|      }n݉dk(  r(d| d&}d| d'} jd                  jg                  | d j!                          d(| d        jd                  jg                  | d) j!                          d| d        jP                  js                  d*| d| d+| d| d| dtx        jz                   d,        jP                  js                  d*| d# 8| d$|       d*| d# 8| d$|       d*	       |} j(                  j1                  
      } j}                   jt                  ||||5      }nt9        j~                  |      }  | ||      }! jP                  jg                  | d# 8|!|              |t
        j                  k(  r/| d-}"t              }# |
 jt                  tM        |      |"|#       n' |
 jt                  tM        |      tM        |      d         jR                  r{t8        j:                  jc                  |      }t        j                         }$ jt                   j                  fD ]2  }%|%jg                  d.       |$j                  |%j                                4 dv r jt                  jg                  | d/ j                  | d0               j                  | d1||      }& jh                  jk                         } j                  ||t        jl                  |      jn                        }' | j                  ||&|'       n=tw              rdk(  sJ |\  }(})}* j                  |(t/        |      |d         }+ j                  |)t/        |      |d2         }, j                  |*t/        |      |d3         }- j                   j                  |(|)|*|+|,|-5	       ndk(  re|\  }} j                  |t/        |      |d         }. j                  |t/        |      |d2         }/ j}                   j                  |||.|/5       n: j                  |t/        |      |      }0 |
 j                  tM        |      |0d        |$j                          | j(                  j*                  |<   tA        |tB              rt        d4 |D              sJ  j                  j                  |       d5v rt        |      d2k(  sJ t        |      |z  }t        |      t        |      k(  sJ tE        ||      D ]F  \  }1}2|2J |1j                  |2k7  s jt                  jg                  |1 d#|1 d6t        |2       d       H |S tA        |t              sJ  j                  j                  |       |j                  |d   k7  r7|d   J  jt                  jg                  | d#| d6t        |d          d       |S c c}w c c}}w );Nc                    | j                   t        j                  t        j                  fv r$t	        j
                  | t        j                        S | S rr   )r  rt   r  r  ro  rD  ru   r?  s    r{   maybe_upcastz,TritonKernel.reduction.<locals>.maybe_upcast	  sF     ;;MMNN UEMM2 r}   c              3  `   K   | ]&  }|t         j                  t         j                  fv  ( y wrr   )rt   r  r  r  s     r{   r   z)TritonKernel.reduction.<locals>.<genexpr>	  s"     MqU]]ENN33Ms   ,.c              3  :   K   | ]  }|j                    d   ywr   Nr#  r  s     r{   r   z)TritonKernel.reduction.<locals>.<genexpr>	       MDdkk]$/M   r  r   c                v    j                   j                  j                  d|  d d| j                        S )Nr   r   r   r  r3  r4  r5  r  )r  r  r   s    r{   <lambda>z(TritonKernel.reduction.<locals>.<lambda>	  s<    dhh''"1#R'7q9gg (  r}   c           
         dv }|rdnd}j                  | |      }dv rj                  | d d| d d      }nj                  | d d	| d d      }|| d
| d}|S )zK
            Helper to generate a reduction call, e.g. tl.sum.
            )r   r?  minprodtriton_helperstl)r?  r  r  z2(r   r   r  r%  )r  r  )	rA  r  result_type
use_helpermoduler  r  reduction_typer   s	        r{   final_reductionz/TritonKernel.reduction.<locals>.final_reduction	  s     (+HHJ)3%F00FE/--ha/r%3%qA --ha/qr#a@ & 'k]!4Lr}   c                F     | ||      }| j                  | d|        y)zU
            Generate a reduction and assign it to an existing variable.
            r(  N)r   )rA  r^  r  r  r  s       r{   final_reduction_definez6TritonKernel.reduction.<locals>.final_reduction_define	  s*     $FE;?EMMZLE734r}   c                    j                  | |      }j                  | |      }| j                  d| d| d d| d| d d| dj                  | d       d	       y )
N                z_val, z_idx = triton_helpers.z_with_index(r   )
                r(  _idx
                )r  r   r  )rA  r^  r  r   r  r  root_opr   s       r{   final_argreducez/TritonKernel.reduction.<locals>.final_argreduce	  s    00FE00FEMMF:,.DWI\Z_Y``bchbiiklokp qC 5 5D6I JK Lr}   r  c              3  >   K   | ]  }t        |d          r|  ywr*  )r:   )r   r  s     r{   r   z)TritonKernel.reduction.<locals>.<genexpr>
  s!      *
(;CF(CC*
s   r   c                :    s| S t         j                  | |      S rr   )r  rz  )tvalfvalconds     r{   
where_condz*TritonKernel.reduction.<locals>.where_cond	
  s     (..tT4@@r}   c                v    j                   j                  j                   | |      | j                        S )Nr  r  )r  defaultr   r  s     r{   _mask_valuez+TritonKernel.reduction.<locals>._mask_value
  s5    xx((LL*UG"<EKK )  r}   online_softmax_reduce)argmaxargminr   zindex, r  r?  r  welford_reducewelford_combinec              3  n   K   | ],  }j                   j                  j                  |        . yw)r  N)r3  r4  r5  )r   r  r  r   s     r{   r   z)TritonKernel.reduction.<locals>.<genexpr>=
  s3      # HH%%dllE%G#s   25r   = tl.full(r   r   _indexr  _next, z_next = triton_helpers.z%imum_with_index(
                    z(index
                )
                r(  _nextr  _max_sumz, float('-inf'),  = tl.zeros(z
                    zG_next = triton_helpers.online_softmax_combine(
                        z+
                    )
                    z.to(tl.int8)zif HAS_RSPLIT:z_bval = _val_bvalrF   r   c              3  <   K   | ]  }t        |t                y wrr   )rh  r  r  s     r{   r   z)TritonKernel.reduction.<locals>.<genexpr>  s     LAz!%67Lr  )r  r  r%  )r  rL   r   rL   )r  r   r  r   r   r   )r^  r   r  r   r  r   r   r   )r   rL   )Qpytreetree_leavesr  tree_mapr   rt   r6  ru   r&  r   r.  r  r   r  ri  r#  r  _map_tuple_or_scalarr(  r%  r3  reduction_cacher  r  rR  r   r   r  r   	Reductiondefault_valuerW   rh  r  r   rD   r  r/  r   r4  r5  r  r  welford_reduce_fallbackr   _welford prepare_softmax_twopass_fallbackrL   namedvardefault_accumulatorr  r   r(  select_index_dtypeiinfor?  r  r   r  r8   r   r_  %online_softmax_reduce_final_reductionget_reduction_combine_fnr~   r  re  rf  r  rg  ro  r  *codegen_cooperative_reduction_peer_combinewelford_reduce_final_reductionri  r  r  r  r  r  rg  )9r   r  r:  r  r  r  r5  original_dtypesmasksreduction_range_prefixr  r  rT  acc_typetorch_acc_typer^  r  r  r  dmasked_valueaccumulator_dtypeaccumulator_indexmeanm2weightaccumulatorrx  accumulator_maxaccumulator_sum
result_max
result_sum
combine_fnupdatedaccumulator_casted_strr  rl  bufpeer_valpeer_idxresult_mean	result_m2result_weight	peer_meanpeer_m2peer_weightpeer_maxpeer_sumpeersr  
orig_dtyper  r  r  r  r  r  s9   `` `                                               @@@@@@r{   	reductionzTritonKernel.reduction	  s   	 170B0B50IJ399JJe4M_MM++Iu}}EI''u}}=E$$$$MD<L<LMM% u??LL)!%!1!1"!5!<!<Q!? ,,.))
 
 %%'$*A*AA		 '	 		 	4
	5
	5 
	5 '	
	5
 
	5	 6	00088++I66"9-))4((///?
) *
 *
  

 zz% 	A
 $$ll00KG//wGG !88 E5)>A%>QRdaAq 1RR*5':!55$%HH$K$K$M!$'HH%%*+A*B',W^_/ & %! &+e<^LLL*l<M $5
 #33--!%!4!4"NE:xQV"J "&!=!=eU!KJ#44!,999%1"r6" #!%dBU"# 
  #:: "BB5%P
!,<<<!XX..LL#DLL#l2CTJ&,, / 
 ((++a
|,<N+SKll66~yQG//wGGgu-		##"m;t/B/B/D.ERyPRS[R\\]^ !55&'
|6$:!"mm>>@		##()T5H5H5J4K2{{;/334Bt7H7H7U6VVWY &+e<^L##W%6$77Nwi X M$5#6brBXAY ZS{m5,A;!O P Q"#3z5F4Gu2MO`'a&b c  **JEV &n5!00z8U
  #::$%j\"6$%j\"6 		##&'{43F3F3H2IIZ[cZddef 		##&'|D4G4G4I3J"XJVWX ##$%W_,= >()O+<BugRH\H\G] ^ ##$%S6Gu4M)_(` a$%S6Gu4M)_(` a (
!XX__5_9
!GG**##
  88S
$[%8&&"m3z';'G&HI 

* 1<}L-I*"5e"<K*..J.#	 +..J[AQSW %%ll66~yQG#--/J..0D0DE 7./((67
 !55&&00!l(4+@+@J<tAT+U*VW  JJ!l%()W #mm>>@JJU[[-E-I-I   4 4j(HU%n5%)99998B5Y KK$Y/AJ	
 II$Y/AJ
 #MM!$Y/AJ
 33((!
  #::)3&
JJJ 0 ;WQZ  JJ 0 ;WQZ ::(( GG 0 ;W '((#j/5$ .8  +j%(LLLLL""))*5 !LL?+q000"%j/O"Cz?c/&::::#&z?#C Z!---99
***44%s3%t,?
,K+LAN"  j*;<<<""&&z2 ?1#55&q)555&&00!l#j\6I/Z[J\6]5^^_` k Kd  Ss   ooc                   | j                  |||      }| j                  |||      }t        d      D cg c]'  }t        | j                  j	                  |            ) c}\  }}|j                  d| d| d| d| d| dt        j                   d| d| j                  |        d| d| j                  |        d       ||fS c c}w )Nr   r  
            r   z9 = triton_helpers.online_softmax_reduce(
                )
            r(  )	r  r,  r   r3  rR  r   r   r_  r  )	r   rA  r  r  r  r  r  r  r  s	            r{   _online_softmax_reducez#TritonKernel._online_softmax_reduce"  s     66vPUV66vPUVMRSTX!V#dhhooEo&B"C!V
JL:, ' !O#4Bse2f>R>R=S TLD11ZLBC DLD11ZLBC D		
 :%% "Ws   ,Cc           	     D     fd|||fD        \  }}}d| d| d| d| d	}t        d      D cg c]'  }t         j                  j                              ) }	}j	                  dj                  |	       d|        t         fd|	D              }
|
S c c}w )	z;
        Helper to codegen triton_helpers.welford.
        c              3  D   K   | ]  }j                  |        y wrr   )r  )r   r  rA  r  r   s     r{   r   z(TritonKernel._welford.<locals>.<genexpr>7  s(      
 ((>
s    ztriton_helpers.welford(r   r   r   r  r(  c              3  @   K   | ]  }j                  |        y wrr   )r  )r   r  r   s     r{   r   z(TritonKernel._welford.<locals>.<genexpr>?  s     Xud33E:Xs   )r,  r   r3  rR  r   r   r  )r   rA  r  r  r  r  r  welfordr  welford_resultsresult_valuess   ``    `    r{   r  zTritonKernel._welford3  s    
F+
b& ,D6B4r&C5JFKAhO3txxU;<OODIIo67s7)DEXXX	 Ps   ,Bc                   | j                         | j                  z
  }| d}| d}	| d}
| j                  j                  | d| j	                          d| d       | j                  j                  |	 d| j	                          d| d       | j                  j                  |
 d| j	                          d| d       |dk(  r>|\  }}}| j
                  j                  d| d	|	 d	|
 d
| d|	 d|
 d| d| d| d       n8|dk(  sJ | j
                  j                  d| d	|	 d	|
 d| d| d|	 d|
 d       | j
                  j                  d| d || d|       d|	 d ||	 d|	       d|
 d ||
 d|
       d       |}| j                  j                  |      }| j                  j                  |      }| j                  | j                  |||||	|
||	      S )z%Helper to codegen a welford reduction_mean_m2_weightr  r   r   r  r  r  z<_next = triton_helpers.welford_combine(
                    z,
                    z#
                )
                r  z;_next = triton_helpers.welford_reduce(
                    z1, roffset == 0
                )
                z            r(  r  r  r  )r(  r%  r  r   r  r5  r   r3  rR  r  r  )r   r^  r  r  r  r  r  r  r  accumulator_m2accumulator_weightr  r  r  r  r  r  s                    r{   r  zTritonKernel.welford_reduceB  s    %%'$*A*AA#E*&<s+ *|73		m<(;(;(='>b
!L	
 			l4+>+>+@*AH:QO	
 			!",t/B/B/D.ERzQRS	
 ..$D"fLLW^$4G<N;O P MN#326H5I JF"RD6( + "%5555LLW^$4G<N;O PG2k]"^,<B?Q>R S 	MZ;-u(={KL MC
n-=U+C^ TU V J2D1EU/KM_$`#a b	
 !HHOO%O0	e422""

 
	
r}   c
                    | j                  ||||||	      }
|||g}t        ||
      D ]  \  }}|j                  | d|         |||fS )z0Helper to codegen call to triton_helpers.welfordr(  )r  r   r   )r   rA  r  r  r  r  r  r  r  r  r(  result_exprsresult_exprr  s                 r{   r  z+TritonKernel.welford_reduce_final_reduction}  si     vtReD#Y>"%lF"; 	6KMM[MUG45	6 I}44r}   c                    | j                  |||||      }||g}	t        |	|      D ]  \  }
}|j                  |
 d|         ||fS Nr(  )r  r   r   )r   rA  r  r  r  r  r  r  r(  r  r  r  s               r{   r  z2TritonKernel.online_softmax_reduce_final_reduction  se     ,,VXxeT"J/"%lF"; 	6KMM[MUG45	6 :%%r}   c                D    | j                   r| j                   d   S t        S )NRSPLIT)r  r,   r   s    r{   
max_rsplitzTritonKernel.max_rsplit  s"    $$X..  r}   c                   | j                   d   }| j                         sdnd}||j                  z  | j                         z  }| j                  j                  |      \  }}| j                  j                  d| d| d| j                  |       dt        |       d| d	| d
| dd       | j                  j                  | d| dt        |       d       | dS )a	  
        Generate code to save a [XBLOCK, RSPLIT] temporary workspace, where each thread block writes a different
        column.  After the barrier, every thread block loads the completed value so that it can compute the final
        value independently.
        r  zxindex < xnumelNr  z_ws = (rL  z).to(tl.pointer_type(z))
                tl.store(z%_ws + (xindex * RSPLIT + rsplit_id), r   r  Tstripz_peers = tl.load(z_ws + (xindex * RSPLIT + rsplit_arange), rsplit_mask, eviction_policy='evict_first', other=triton_helpers.if_mask(rsplit_mask, r  _peers)r'  r  r  r  r  r  r  r   r  r>   r  r   rW   )	r   r^  r  default_valxnumelr   r  r  r  s	            r{   r  z7TritonKernel.codegen_cooperative_reduction_peer_combine  s$    S!(,(@(@(B %..(4??+<<!GGPPQWX%%GG9C0A0A)0L/MMbcnotcubv w$%J:,VXY]X^ _  	& 	
 	&&l+J< 8eers~e  eA  ACD	
 V$$r}   c                   | j                   sJ d| _         | j                  |d      }d| _         | j                  j                  |      }t	        j
                         }| j                  r+|j                  | j                  || j                               t        |t              rY| j                  j                  t        || j                  |||j                  |      |d|j!                                            nXt        |t"              sJ | j                  j                  t        |d| d|j$                   d| d|j&                   d		             |j)                          y )
NFTr+  r'  r1  rB  r  r   r   )r&  r.  rE  r<  re  rf  r  rg  rh  r  rh  r   r   rM   r5  rF  rQ  r   r   r   ri  )r   rC  r   r  r.  r  rl  s          r{   store_reductionzTritonKernel.store_reduction  sR    $$$$ %==$=7 $iit$))+
%%$$,,T43G3GH h0  **55  ,+H,C,C,E+HI	 h888  **uD););(<CwbIZIZH[[\] 	r}   c           	        
 t               j                  d       t               
t        d      D cg c]!  t	        
fdt        |      D              # }}dj                  d t        j                  j                  |      D              }j                  d| d       t               dd	d
l
m}  |        G 
fddt              }j                         5  t        j                   |             5   || }	dj                  d |	D              }	j                  d|	        d d d        d d d        | j                   j#                  j%                               S c c}w # 1 sw Y   AxY w# 1 sw Y   ExY w)Nz@triton.jitr   c              3  V   K   | ]   }j                  d  d| |          " yw)r  r  r  N)r  )r   nr3  dtypesr  s     r{   r   z,TritonKernel._lift_helper.<locals>.<genexpr>  s.     X1#,,QCq}F1I,>Xs   &)r   c              3  2   K   | ]  }t        |        y wrr   r   r  s     r{   r   z,TritonKernel._lift_helper.<locals>.<genexpr>  s     Rc!fR   zdef {name}():r`  r   re   c                  2    e Zd Z	 	 	 	 	 	 	 	 d fdZy)+TritonKernel._lift_helper.<locals>.CSEProxyc                    d| z   t        |      |i |}j                   t        	|      |i ||      S )Nr  r  )r)  r4  )
r   rC  rE  r  output_dtyper3  dtype_handlerhelperhelper_name	overridess
        r{   _defaultz4TritonKernel._lift_helper.<locals>.CSEProxy._default   sq     4&z) w!   # " #
 ||,GIt,d=f=& $  r}   N)rC  r   rE  ztuple[Any, ...]r  dict[str, Any]r   r   )rs   r   r   r&  )r3  r"  r#  r$  r%  s   r{   CSEProxyr    s-    '6@N r}   r(  c              3  2   K   | ]  }t        |        y wrr   r  )r   r<  s     r{   r   z,TritonKernel._lift_helper.<locals>.<genexpr>  s     BFBr  return ra  )rN   r   rK   r,  r  r   r&  r'  from_iterabler5  r  rf   r&   ro  rD   set_ops_handlerr  rg  r   )r   r&  num_argsr  r  rE  	signaturerf   r(  outputsr3  r"  r#  r$  r%  s      ``     @@@@@r{   _lift_helperzTritonKernel._lift_helper  s]     !'e 1X
 XhXX
 
 IIRioo.K.KD.QRR	=267#%	 *P24	 	~ 	$ ]]_ 	2a//
; 	2$iGiiB'BBGwwi01	2 	2
 $$(():k(RRU
J	2 	2 	2 	2s)   &E3+F2E88F8F	=FFc                     j                   sJ  j                  rJ d       t        d  j                  D              } j	                  |       t        |      } j                  rJ d       g }g }t        d |D              }t        j                   j                  j                   j                        } j                  |t        |      |      } j                          j                   z
  }	t#        ||      D ]1  \  }
} j                  j                   j                  |
 dt%        |       d|      } j                  j                   j                  d| d	 j'                          d|      }
|j)                  |
       t+        |      } j,                  r j                  j/                  |      } j1                         }d
|d<   dd	j3                  |       d}|j4                  rdnd} j6                  j9                  | d| d	| d	| d       |j)                  |       4 d  fd} |d |       d|	 d	| d|||      } j,                  s|D cg c]#  } |d| dt;        |j<                              % }} |t        |      t        |            } |t        |      |      }t#        ||      D cg c]   \  }} |d| d	| d|j<                        " }}}t#        |||      D ]*  \  }}} j                  j9                  | d| d	| d       , n|}|D ]$  }t?        |t@              sJ t        |      |_!        & t        |      S c c}w c c}}w )NTODOc              3  :   K   | ]  }|j                    d   ywr  r  r  s     r{   r   z$TritonKernel.scan.<locals>.<genexpr>"  r  r  z(ops.scan not supported inside ops.maskedc              3  2   K   | ]  }t        |        y wrr   r@   r   r  s     r{   r   z$TritonKernel.scan.<locals>.<genexpr>*       Fe*51Fr  r%  r   r  r   r   rd  r  rf  rg  zfloat('nan')z-1r  c                2    dj                  d | D              S )Nr  c              3  &   K   | ]	  }| d   yw,Nr   r   r  s     r{   r   z1TritonKernel.scan.<locals>.csv.<locals>.<genexpr>L       <EugQK<rG  r   r(  s    r{   csvzTritonKernel.scan.<locals>.csvK      88<V<<<r}   c                   t        |      }t        |      D cg c]  }|  d| d|  }}t        fd|D              r)|D cg c]  }j                  j	                  |       c}S |D cg c]  }j                  j                  |        }	}j                  j                   |	       d|         t        |	|      D ]*  \  }
}|r||
_	        j                  j                  ||
       , t        |	      S c c}w c c}w c c}w )Nr   c              3  T   K   | ]  }j                   j                  |       ! y wrr   r3  containsr   rT  r   s     r{   r   z:TritonKernel.scan.<locals>.cse_multiple.<locals>.<genexpr>Q        LI488$$Y/L   %(r  r(  )r  r,  r  r3  rd  rR  r5  r   r   r   rS  r  )r@  r(  r  r  r  r  
cache_keysrT  _dtyperesult_varsr^  r@  r   s              r{   cse_multiplez'TritonKernel.scan.<locals>.cse_multipleN  s    FA;@8DaTF"QCr%1DJDLLLAKLIY/LLGMNV488???8NKNLL""{#$Cv. *-[*)E 4%
I+0J(Y
34 %% ELNs   C="D.#Dztl.associative_scan((r  ztriton_helpers.select_one((z1), rbase == (RBLOCK - 1), dim=-1, keep_dims=True)ztl.where(roffset > 0, z = tl.where(roffset > 0, )"r&  r  r   r.  r  r   r  r  r!  r"  r3  r4  r5  r0  r  r(  r%  r   r  r  ri  r  r  rR  r  r   r  r  r   r@   r  rh  r  r   )r   r  r  r(  r  broadcasted_valuesaccumulatorscse_computecombine_helper_fnr  r  r  value_dtyper  r  reduced_sizer  rL  partial_scan_varspartial_scan_varpartial_reduce_vars	accs_nextfull_scan_vars	full_scanpartial_scanrK  acc_nextpartial_reducer^  r@  s   `                            @r{   scanzTritonKernel.scan  s    $$$$--5v5-MD<L<LMM% u??N$NN"FvFF''(9(94<<H --j#f+vN%%'$*A*AA/ 	1LE5((++'1%89; , K
 HH%%";-r$2E2E2G1HJ & E
 %%e,&u-H,,"hhooEo:#335#&R !"499\#:";1=,1,C,C.		##"m;|nBwir(STU ##K05	18	=	& )#C(:$;#<CuBGXFYYZ[	
 (( ):#
 %	 12B1CCtu-.>.D.DE# # #5#6>Q8RSI'l(;=NON 03>CT/U
 ,I|	 ,YKr,qI&,,K  :=<)<: 5+~ &&"m#<XJbHXXYZ ,K% 	5Jj*;<<<#-e#4J 	5 [!!;#s   '(M&%M+c                     j                   sJ  j                  rJ d       t        d  j                  D              } j	                  |       t        |      } j                  rJ d        j                  sJ d       t        j                   j                  j                   j                        } j                          j                  z
  }t        d |D              }t!        |      t!        |      k(  sJ t#        |      D 	cg c]'  \  }}	 |d|	 d j%                          d||   	      ) }
}}	d
  fd} j                  d   j&                  sJ  j)                   j                  d         rdnd}t!        |      dk(  r0d|
d    d|
d    d| d| d| d| d} ||t!        |      ||      }nt+        d      t-        ||      D ]  \  }}||_        |j0                  |_         t        |      S c c}	}w )Nr2  c              3  :   K   | ]  }|j                    d   ywr  r  r  s     r{   r   z$TritonKernel.sort.<locals>.<genexpr>  r  r  z(ops.sort not supported inside ops.maskedz3ops.sort is only supported in persistent reductionsc              3  2   K   | ]  }t        |        y wrr   r5  r6  s     r{   r   z$TritonKernel.sort.<locals>.<genexpr>  r7  r  r   r   r   r  c                2    dj                  d | D              S )Nr  c              3  &   K   | ]	  }| d   ywr:  r   r<  s     r{   r   z1TritonKernel.sort.<locals>.csv.<locals>.<genexpr>  r=  rG  r>  r?  s    r{   r@  zTritonKernel.sort.<locals>.csv  rA  r}   c                   t        |      D cg c]  }|  d| d|  }}t        
fd|D              r)|D cg c]  }
j                  j                  |       c}S t        |      D cg c]!  }
j                  j	                  ||         # }}
j
                  j                   	|       d|         t        ||      D ]*  \  }}|r||_        
j                  j                  ||       , t        |      S c c}w c c}w c c}w )Nr   c              3  T   K   | ]  }j                   j                  |       ! y wrr   rD  rF  s     r{   r   z:TritonKernel.sort.<locals>.cse_multiple.<locals>.<genexpr>  rG  rH  r  r(  )r,  r  r3  rd  rR  r5  r   r   r   rS  r  )r@  r  r  r  r  rI  rT  rK  r^  r@  r   s            r{   rL  z'TritonKernel.sort.<locals>.cse_multiple  s    ;@8DaTF"QCr%1DJDLLLAKLIY/LLEJ1XN488???;NKNLL""{#$Cv. *-[*)E 4%
I+0J(Y
34 %% ELNs   C>"D,&Dr  r   rnumelr   ztriton_helpers.sort_with_index(r   rF   z	, stable=z, descending=zUnhandled sort)r&  r  r   r.  r  r   r  r  r!  r"  r3  r4  r5  r(  r%  r  r  r2  r  r)  r  rq  r   r   r  )r   r  r(  stable
descendingr  rO  r  r  r  rM  rL  rd  r@  rK  r^  	input_varr@  s   `                @r{   sortzTritonKernel.sort  s2    $$$$--5v5-MD<L<LMM% u??N$NN"(( 	
A	
(  ''(9(94<<H%%'$*A*AAFvFF6{c&k)))
 &f-	
 5 "5'D,?,?,A+B!DFSTI
 
	=	& #00002243C3CB3GHhv;!12DQ2G1HK]^_K`Ja b82cU)F8=AO  'tS[%HK !122%(f%= 	1!J	#(J  ) 0 0J	1 [!!Q
s   ,Hc                   | j                   s=| j                  s1| j                  s%| j                  s| j                  s| j
                  sy| j                  D cg c]  }|j                  s| }}| j                  rWt        |      dkD  rHt        |      D ]  \  }}| j                  j                  |      5  |j                  }| j                  rdnd}| j                  rdn| d}| j                  j                  d| d	| d
| d
|j!                          d	       ddd       | j                  j                  |dz         5  | j#                  || j                         ddd        | j                  j                  t        |            5  | j%                  | j                         | j                  j'                  | j                          | j                  j'                  | j                         | j                  j'                  | j                         | j                  j'                  | j                         ddd       t)        g t        |            D ]o  \  }}| j                  j                  |dz         5  | j*                  |j,                     j/                         D ]  \  }}|t        |      dz
  k  rs||dz      }	| j*                  |	j,                     |   }
t0        j3                  |	      }t5        |	j6                  |      }t9        ||
      D cg c]  \  }}|||z  z
   }}}| j                  j                  t;        | j<                  |   | d| d
t>        j@                  jC                  |       d              	 ddd       | jD                  jG                  | jH                         |jK                          r n| j                  j'                  | j                          | j                  j'                  | j                         | j                  j'                  | j                         | j                  j'                  | j                         | j                  j'                  | j                         | j                  rb| j                  s| j
                  rJ| jL                   d}| j                  j'                  d| dd       | jN                  jQ                          | j                  j'                  | j
                         | j                   jS                          | j                  jS                          | j                  jS                          | j                  jS                          | j                  jS                          | j
                  jS                          yc c}w # 1 sw Y   xY w# 1 sw Y   BxY w# 1 sw Y   xY wc c}}w # 1 sw Y   ^xY w)a  
        Concat output code from index_code, loads, compute, stores,
        suffix into self.body.

        For pointwise kernels, this is called just once at the end.

        For reduction kernels, this generates a loop over the reduction
        axis.
        Nr   )r   rsplit_startr7  
rsplit_endr  zfor zoffset in range(r   zBLOCK):rF   z = tl.advance(r   z + tl.program_id(1)zR
                if HAS_RSPLIT:
                    triton_helpers.x_grid_barrier(r  Tr  )*indexing_coderD  rd  r5  r  r  r.  r  r&  r  r2  r  ro  r#  r  r   r   r  r  r   r+  r  r   r!  r   r   r   r  r   rM   r  rD   r  r  r3  
invalidater  cache_clearr  r  r  clear)r   r   
loop_treeslevelr#  
loop_startloop_endr,  advancement	prev_treeprev_advancement
prev_blockprev_num_itercurprevsem_ptrs                   r{   codegen_bodyzTritonKernel.codegen_body  s	    zz{{||%%##'+'7'7Ht4<<dH
H  S_q%8(4 JtYY%%U%3 ![[F373M3MSVJ(,(B(B6(RWHX  II''vh&6zl"XJbQWQ]Q]Q_P``gh YY%%UQY%7 J88tyyIJ JJ !!Z!9 ...tyy9		  !3!34		  ,		  .		  -.  ((@)J*?(@A #tYY%%UQY%7 262K2K		3eg.	; !3z?Q#66(2519(=I/3/H/H )0'0), *7)E)Ei)PJ,3IOOZ,PM 25[BR1S+$-C !$d]&: :+K +
 		++( $ 8 8 C#,+^I;bI^I^_jIkHllm n!4 ##D$:$:;  "9#< IIT//0IITZZ(IIT\\*IIT[[)		//0%%""d&:&:--..ABGII33:) <    66BBD		--.  "

$$&""$] I J J. .,+ sR   V=*V=9A&WWB0WBW/#W)5AW/W	W	W&)W//W9	c                V   g }| j                         rg }| j                  d|g        |D ]  }t        |t              r|j	                  t        |             .t        |t              rL|j	                  t        t        j                  j                  j                  |j                                     t        |t        j                        rB|j	                  t        t        j                  j                  j                  |                   t        dt        |              |S )Nr   z!Unsupported numel argument type: )r  add_numel_to_call_argsrh  r>  ri  r   rb   rD   r  r   	size_hint
inner_exprr   r9  r  r  )r   rE  
numel_argsr  s       r{   kernel_benchmark_extra_argsz(TritonKernel.kernel_benchmark_extra_args+  s    !+-J''J;! Vc3'KKC)_5KKAGG$4$4$>$>s~~$N OPUZZ0KKAGG$4$4$>$>s$C DE$'Hc%TUUV r}   c                   t               }| j                  j                         \  }}}}|j                  g d       |j	                         5  t        j                         }g }t        ||      D ]  \  }	}
dt        |       }t        j                  j                  |	      }|r|j                  | dt        j                  j                  j                  |j                                dt        j                  j                  j                  |j!                                d|j#                          d|j%                          d
       n|	t        j                  j&                  v rt        j                  j&                  |	   }|j                  | dt        j                  j                  j                  |j)                                dt        j                  j                  j                  |j+                                d|j,                   d|j.                   d
       nt1        |
t2              rZt        j                  j                  j5                  |
j6                        }d|
j8                  v rd	}|j                  | d
|        nt1        |
t:              ryt        j                  j=                         }t        j                  j                  j5                  |
j                        }|j                  | d| d| d|
j.                   d       nt?        d|	       |jA                  |        |jC                  | jE                                |j                  ddjG                  |       d       d d d        |j                  g d       t        j                  j=                         }|jH                  }|j	                         5  |j                  dt        j                  jJ                  jM                  |       d       |j	                         5  |j                  t        j                  jJ                  jO                  |             d| }|j                  | d| d       |j                  tQ        tR        jT                         d| d       d d d        d d d        |j                  g d       |j	                         5  |j                  dt        j                  jJ                  jM                  |       d       |j	                         5  |j                  t        j                  jJ                  jO                  |             |j                  dtQ        tR        jT                         d       d d d        d d d        |j                  g d       |j	                         5  |j                  d       |j                  d       |j                  d       |j                  d       |j                  d|        |j                  d       |j                  d       d d d        |S # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   |S xY w)N)r   r   zdef get_args():arg_z = rand_strided(r   z
, device='z	', dtype=r   rO  r   r(  z = torch.zeros(z*Don't find the buffer or const tensor for r*  r;  )
r  zdef call(args):zwith re  streamz = get_raw_stream(z.run(*args, stream=)r  r  z def benchmark_all_configs(args):z.benchmark_all_configs(*args))r  r  zif __name__ == '__main__':z<from torch._inductor.runtime.benchmarking import benchmarkerr   zargs = get_args()z:ms = benchmarker.benchmark_gpu(lambda: call(args), rep=40)z	num_gb = zgb_per_s = num_gb / (ms / 1e3)z<print(f"{ms:.3f}ms    {num_gb:.3f}GB    {gb_per_s:.2f}GB/s"))+rN   rE  python_argdefs
writelinesro  r&  r  r   r+  rD   r  try_get_bufferr   r   
size_hintsget_size
get_stride
get_devicer3  	constantsrp  r0  devicer  rh  rS   r  r  rC  rU   get_current_device_or_throwKeyErrorri  extendr  r   r   
device_opsdevice_guard
set_devicer   r9   KERNEL_NAME)r   num_gbr7  _argdefs	call_argsr.  r  name_cnt	var_namesarg_namearg_sigvar_namer  const_tensorsymval_hintr  r  current_devicer   stream_names                       r{   codegen_kernel_benchmarkz%TritonKernel.codegen_kernel_benchmark;  s>   !,0II,D,D,F))Y56]]_ %	@ (HI%(I%>  +!'!$x.!12gg,,X6$$#*$4QWW5E5E5P5PQTQ]Q]Q_5`4aacdedkdkdtdtdd  AD  AO  AO  AQ  eR  dS  S]  ^a  ^l  ^l  ^n  ]o  ox  y|  yF  yF  yH  xI  IJ  K !2!22#$77#4#4X#>L$$#*$4QWW5E5E5P5PQ]QbQbQd5e4ffhijipipiyiy  jE  jE  FR  FY  FY  F[  j\  i]  ]g  ht  h{  h{  g|  |E  FR  FX  FX  EY  YZ  [  1"#''"2"2"<"<W\\"JK
 %4&'$$z[M%BC6WW@@BFGG,,66w}}EE$$#*OE7*VHIV]VcVcUddef #DXJO    *A +B T==?@wtyy';&<A>?K%	@N 	9:<<>$$]]_ 
	uQWW%7%7%D%DU%K$LANO   GG&&11%8 !'ug.  K=0B5'!KL  ;22344G}TUV
	 	JK]]_ 	uQWW%7%7%D%DU%K$LANO   GG&&11%8   c+"9"9:;;XY		 	DE]]_ 	N R 01L y12=>N	  g%	@ %	@X 
	 
	  	 		  sf   L2YAY!BY$YAY4$A%Y(	Y4<A;Z YY	YY%(Y1	-Y44Y= Z
c                    t        j                  dj                  t        j                  j
                  j                  d                  S )Nzl
            from torch._dynamo.testing import rand_strided
            {}
            import torch
        get_raw_stream)textwrapdedentrF  rD   r  r  import_get_raw_stream_asr   s    r{   imports_for_benchmark_kernelz)TritonKernel.imports_for_benchmark_kernel  s:     F177%%>>?OPQ
 	
r}   c                    | j                   ry| j                  ry| j                  r| j                  sJ y| j                  ryy)Nr  r  r  r  	pointwise)r  r  r  r&  r   s    r{   _get_heuristiczTritonKernel._get_heuristic  sD    !''*&&(((()""r}   c                    t         j                  j                  j                         t        j                         t
        j                  t
        j                  t
        j                  j                  t
        j                  t
        j                  t
        j                  t
        j                  t
        j                  t
        j                  j                  t
        j                  j                   t
        j                  j"                  d} t         j$                  j&                  d| d<   t        j(                         rd| d<   t
        j*                  rLt
        j*                  | d<   t
        j,                  | d<   t
        j.                  | d<   t
        j0                  | d<   t
        j2                  r9t
        j2                  | d	<   t
        j4                  | d
<   t
        j6                  | d<   | S )N)backend_hash$are_deterministic_algorithms_enabledassert_indirect_indexingautotune_local_cacheautotune_pointwiseautotune_remote_cacheforce_disable_cachesdynamic_scale_rblockmax_autotunemax_autotune_pointwisemin_split_scan_rblockspill_thresholdstore_cubinTis_hipr}  profile_bandwidthprofile_bandwidth_regexprofile_bandwidth_output/profile_bandwidth_with_do_bench_using_profilingcoordinate_descent_tuning coordinate_descent_search_radius'coordinate_descent_check_all_directions)rt   r  _tritontriton_hash_with_backendr  r   r  r  r   r  r  r  r  r  r  r  r  r  r~  r  r}  r  r  r  r  r  r  r  )inductor_metas    r{   inductor_meta_commonz!TritonKernel.inductor_meta_common  su    "KK//HHJ494^4^4`(.(G(G$*$?$?"(--"B"B%+%A%A$*$?$?$*$?$?"//&,&C&C%+]]%H%H%}}<<!==44
 ==(&*M(#)-M+&##171I1IM-.7=7U7UM348>8W8WM45FF KL ++00 56 77 <= >> CD r}   c                   ! t               }i }| j                  j                         D ]  \  }}t        |      r| j                  st
        j                  j                  j                  |      }t        |t        t        j                  f      sd}nt        t        |            }|||<    ||j                  t                      t
        j                  j!                         j"                  }|dk(  r|j                  d       n|j                  d       t$        j&                  r|j                  | j)                                | j*                  j-                         \   }	!}	t/        !      D ]  \  }
}t        |t0              st3        t        j4                  |j6                        }|t
        j                  j                  j8                  v sbt1        |j:                  t
        j                  j                  j8                  |         !|
<    t=               }| j>                  D ]  }|| j*                  j@                  v r(|jC                  | j*                  j@                  |          || j*                  jD                  v rj|t
        j                  jF                  vrN|| jF                  vr@|jC                  t3        tH        | j*                  jD                  |         jJ                         || j*                  jL                  v s| j*                  jL                  |   }t        |tN              rJ |jC                  |        tQ         !      D ]O  \  }}t        |tR              s|jT                  tV        jX                  k(  s5|jC                  |j:                         Q t[        |      }| j]                         D ]Z  }t1        |j^                   d|j`                        }!jc                  |        jc                  te        |j:                               \  !fd}| jf                  D ]K  }|jh                  r| jj                  r|jl                  ) ||j^                  jo                          d       M | jp                  r |d       ts        !| jt                   	      }|tw        jx                  t
        j                  j!                               i d
}t
        j                  jz                  xs t
        j                  j|                  }| j                         j                  t        | j                        t        t        j                        ||| j                  | j                  | j                  d| j                         }| j                  r| j                  |d<   | jp                  r| jj                  |d<   d }t$        j&                  st$        j                  r| j                         dz  }||d<   t        !      g|d<   t        !      D ]  }d|d   !|   j:                  <    || _O        | j                          | j                  D ]$  }|j                  d       |j                  |       & | j                  r2d| j                          d| j                  j$                  d|d|d	}n| j                  r;| j                  j                         }d| j                          d|d| d|d|d}nYd}t        |      dk(  rt        t        !            dk(  rd}nd}d| j                          d|d| d |d|d!| j                   d}|j                  |       |j                  d"|xs t        t        j                         d#dj                  d$  D               d%       |j                         5  | j                  |       | j*                  j                         D ]  \  }}|j                  | d&|         |j                  | j                         d d d        t$        j&                  r |j                  | j                  |             |j                         S # 1 sw Y   IxY w)'Ni    cpuz"triton_helpers.set_driver_to_cpu()z"triton_helpers.set_driver_to_gpu()r  c                    t               rj                  t        |              j                  t        | d             y )NT)is_constexpr)r?   ri  rJ   rH   )r  argdefsr.  s    r{   add_constexpr_argz6TritonKernel.codegen_kernel.<locals>.add_constexpr_arg3  s/    -/  h!78NN78$?@r}   r   r  )
size_dtyper  )r.  r  r  )	grid_typer  kernel_namemutated_arg_namesoptimize_memr"  num_loadnum_reductiontiling_scoresr  g    eAkernel_num_gbconfigsrF   r  r   z$
                @triton_heuristics.z(
                    config=zI,
                    filename=__file__,
                    triton_meta=z$,
                    inductor_meta=z;
                )
                @triton.jit
            z!(
                    size_hints=z%,
                    reduction_hint=r   r  ztile_hint=TileHint.SQUARE,ztile_hint=TileHint.DEFAULT,r   zH
                    filename=__file__,
                    triton_meta=z*,
                    min_elem_per_thread=zdef r  c              3  <   K   | ]  }|j                           y wrr   )	full_namer  s     r{   r   z.TritonKernel.codegen_kernel.<locals>.<genexpr>  s     CcVWAKKMCcr  r  r(  )brN   r'  r!  r:   r&  rD   r  r   symbolic_hintrh  r>  r   r@  r.   r   r   r  r  r   benchmark_kernelr  rE  r  r2  rS   r	   r   r  inv_precomputed_replacementsrC  r   	mutationsinput_buffersrg  rc  removed_buffersrO   
inner_nameoutput_buffersrR   r   rU   	zero_moderV   ZERO_ON_CALLr   r   r#  r  ri  rH   r.  r)  r  
tensor_dimr   r  ra   rx  r*   r8  is_inferenceis_backward_get_grid_typers   setr  r   r9   DESCRIPTIVE_NAMEr"  r  r  r  r  r  estimate_kernel_num_bytesr]   r^   r  r|  r  r   r  r  r(  get_reduction_hintr  r_   r@  r  r   ro  codegen_static_numelsaliasesr  r  r   )"r   rC  coder  r#  r  
numel_hintr  device_typer  r  r  r  mutated_argsmutationmutation_argargnamer   sizeargr  triton_meta_signaturer  r  r  r  arg_numr#  heuristics_linereduction_hint	tile_hintoldnewr  r.  s"                                   @@r{   codegen_kernelzTritonKernel.codegen_kernel  s   
![[..0 	+MFE"6*43H3H))77>Jj3*>? !	+C
O<	!*Jv)	+, <KK134''==?DDKe#@A@A&&D==?@#'99#;#;#= Iq	* 	FAs#w' ellCHH5QWW--JJJ#*!''"2"2"O"OPV"W$IaL	 )3 	/H499222  !8!8!BCDII555AGG$;$;;D$8$88  )B)B8)LMXX 499333#yy77A%lJ???  .	/6  3 	/LGS3-MM%6%C%CC  .	/ l+++- 	2DU3TZZ@GW%NN77<<01	2	A $$ 	=D  T%>%>&!2!2!4 5U;<	= %%h' 1$"2"2G!
 /&--agg.Q.Q.ST'
 ww++Bqww/B/B ,,.77!$"5"56{;;<!-(!//
 '')
 -1-?-?M/*%%484M4MM01""f&>&>335;F-3M/*"+I"6!7I +95 	BG@AK$Yw%7%<%<=	B '++ 	 FNN2KK	  #$$($7$7$9#: ; --447 8!!, 0##0"3 4O ""!]]==?N#$$($7$7$9#: ;  *~ .$$2#3 4!!, 0##0"3 4	O I:!#/	:;q@ <I =I#$$($7$7$9#: ;  *~R	{ ;!!, 0##0"3 4))-)A)A(B C	O 	O$473{6678$))Cc[bCc:c9ddfg	
 [[] 	#&&t, II--/ 1S#c#/01KK		"		# ""KK55f=>}}	# 	#s   A%a33a<c                   t         j                  j                  j                  |       } t	        | t
        j                  t        f      rt        |       }t        |      }|S d}t         j                  j                  j                  | |      sC|dkD  rt        d|        |dz  }t         j                  j                  j                  | |      sC|S )N   i @  z!Failed to find static RBLOCK for r   )rD   r  r   simplifyrh  r   r@  r>  r.   statically_known_leqr  )rd  r5  s     r{   _get_persistent_RBLOCKz#TritonKernel._get_persistent_RBLOCK  s    !!**62fu}}c23f+C!#&C 
 Cgg&&;;FCH?$'H%QRRq gg&&;;FCH 
r}   c                N    	 t         j                  |        y# t        $ r Y yw xY w)NTF)r  r  r  )rd  s    r{   has_persistent_RBLOCKz"TritonKernel.has_persistent_RBLOCK  s*    	//7 		s    	$$c                   d	d}| j                   D ]M  }|j                  r| j                  rdt        j                  j
                  j                  |j                        } ||      r)|j                  |j                   dt        |              |j                  r| j                  r| j                  r1| j                  | j                  |j                              }d| d}n| j                  |j                        }|j                  |j                  j!                          d|        |j                  dk(  s/| j"                  s=|j                  d       P y)
a  
        We get a small speedup from hard coding numels if they are static.

        This code stomps on the passed-in values by writing an constant to the top of the kernel.

        In a kernel like:
        def KERNEL_NAME(in_ptr0, in_ptr1, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):

        We would add
        xnumel = 4096
        r0_numel = 768

        After the signature, before the kernel code, if we decided to make these static. As its hardcoded, it becomes
        a better signal to triton on how to unroll and do some static indexing. So, it's not so much that downstream
        knows that its a static numel, as that you just plop a constant into the kernel.
        c                B    t        | t        j                  t        f      S rr   )rh  r   r@  r>  )r  s    r{   is_static_integerz=TritonKernel.codegen_static_numels.<locals>.is_static_integer  s    dU]]C$899r}   znumel = z*triton_helpers.constexpr_next_power_of_2((z + RSPLIT - 1) // RSPLIT)zBLOCK: tl.constexpr = r  zXBLOCK: tl.constexpr = 1N)r  r   r   r~   )r.  r)  r&  rD   r  r   r  r  r   r#  r>  r  r  r  r;  r  r   r"  )r   r  r  r   simplified_tree_numelr  r5  s          r{   r  z"TritonKernel.codegen_static_numels  s   $	: $$ 	;D$$(=(=()(8(8(A(A$**(M%$%:;NNdkk](3?T;U:V#WX  T%>%>-- JJt';';DJJ'GHEFugMfgC55djjAC$++"3"3"5!66LSERS{{c!dmm9:	;r}   c                   t        | j                  D cg c]  }t        |j                          c}      }| j                  r|dk(  sJ t
        j                  S |dk(  rt
        j                  S |dk(  rIt        t        | j                  | j                              rt
        j                  S t
        j                  S |dk(  rt
        j                  S t        d|       c c}w )NrF   r   r   z"Unsupported number of dimensions: )r  r.  r>  r)  r  r'   CooperativeReductionGridGrid1Dr   r   rH  Grid2DWithYZOverflowGrid2DGrid3Dr  )r   r   r  s      r{   r  zTritonKernel._get_grid_type  s    8H8HI***+IJ%%6M6$===!V$+++!V3t22D4D4DEF(===$+++!V$+++=aSABB Js   C!c                   | j                   D ]  }t        |j                  t        j                  t        j
                  f      r|j                  }n*t        j                  j                  j                  ||      }|j                  r| j                  s|j                  |       |j                  t        |              y rr   )r.  rh  r  r   r@  r   rD   r  wrapper_codegenerate_numel_exprr)  r&  ri  r  )r   rC  r  	arg_typesr   r  s         r{   r~  z#TritonKernel.add_numel_to_call_args  s    $$ 	-D$**u}}ell&CDzzww++??dK$$(=(=  &  d,	-r}   c                   t         j                  j                  }|j                          | j                  j                         \  }}}}| j                  |||       | j                  j                  D ]  }|j                  |        |j                  ||d|| j                         t        | j                  j                        D ]  }|j                  |        y )NT)r   r  r  )rD   r  r  write_triton_header_oncerE  r  r~  workspace_argsgenerate_workspace_allocationgenerate_kernel_callr  r+  generate_workspace_deallocation)r   rC  rH  wrapperr  r  r  wss           r{   call_kernelzTritonKernel.call_kernel  s    ''&&((*%)YY%=%=%?"9a##D)Y?))** 	6B11"5	6 	$$(( 	% 	
 499334 	8B33B7	8r}   c                   t         j                  j                  }| j                  j	                         \  }}}}t        ||      D ]w  \  }}t        |t              st         j                  j                  r|j                  d| d| d       Jd| d}|j                  |       d| d}|j                  |       y y )Nz:AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_check_inf_and_nan("z", z));zassert not z.isnan().any().item()z.isinf().any().item())
rD   r  r  rE  r  r   rh  rT   cpp_wrapperr   )r   r  r  r  arg_signaturesr  arg_signaturer@  s           r{   codegen_nan_checkzTritonKernel.codegen_nan_check  s    ''&&*.))*B*B*D'9na"%i"@ 
	,C-377&&%%TUXTYY\]`\aade )-BCD%%d+(-BCD%%d+
	,r}   c                    t        |i |S rr   )r  )r   rE  r  s      r{   create_cse_varzTritonKernel.create_cse_var.  s     $1&11r}   c                   |j                    d| j                  | j                  |j                               }|j                  j
                  r| j                  j                  |       y | j                  j                  |       y r
  )	rC  r  r;  r  rootr  rl  r   r  )r   entryr@  s      r{   codegen_iteration_ranges_entryz+TritonKernel.codegen_iteration_ranges_entry1  sd    **SD,@,@,L!M NO::((. II%r}   c                   |j                   J | j                  |j                         }| j                  }|dk7  rd| dnd}| j                  r| j                  r|j
                  r| d}d|j                  j                          d| | S )Nr  r%  r   r   z + rsplit_startztl.arange(0, zBLOCK))r  indexing_size_strrx  r  r  r)  r#  r   )r   r$  rp  rx  r  s        r{   r  z)TritonKernel.iteration_ranges_ranges_code9  s    +++%%e&6&67&&*5*C4}A&&&))""x/Fu||1134F4&IIr}   c                ^    | j                   }| j                         }dg|z  }d| d| d| dS )NrF   rI  r   r   )rx  r(  )r   r$  r  rx  r)  rp  s         r{   iteration_ranges_scalar_codez)TritonKernel.iteration_ranges_scalar_codeF  sC     &&&&(sTz$r%;-q99r}   c                $   |j                   J d|j                    d}| j                  |      r#d| d|j                   dz    d|j                    d}|j                  j                  ||      }| j                  dk7  r| d	| j                   dS |S )
Nztl.program_id(r   r  z + tl.program_id(rF   z) * tl.num_programs(r  r  r%  )r  rH  	pid_cacherd  rx  )r   r$  r4  pids       r{   iteration_ranges_get_pidz%TritonKernel.iteration_ranges_get_pidN  s    ~~)))u~~.a0 &&u- cU+ENNQ,>+??STYTbTbSccefCoo!!#s+z)U$t//022
r}   c                    |j                   dk(  xr[ |j                   xrL | j                   xr= t        j                  j
                  j                  |j                  t                      S r  )	r  has_zdimr  rD   r  r   r   r  r-   )r   r$  s     r{   rH  z#TritonKernel.needs_yz_grid_overflow]  sa    NNa YNN"Y...Y GG$$99%++~GWXX		
r}   c                    | j                   r | j                   |j                          d   S t        |j                            S )Nr   )r  r   r+   )r   r#  s     r{   r  zTritonKernel.max_blocke  s;    $$'7u%=>>//r}   c                   | j                   sy| j                  rW|j                  j                          d| j                  v r.| j                  |j                  j                          d   dk(  r6yt        j
                  j                  j                  |j                  d      ry|j                  r(| j                  r| j                  |j                        }n9|j                  dk(  r| j                  rd}n| j                  |j                        }|j                  r| j                  r|| j                         z  }t        j
                  j                  j!                  |j                  |      r[|j"                  dk7  xsJ |j$                  xs< t        j
                  j                  j'                  |j                  t)                     S y)NFr   rF   Tr  )r  r  r#  r   rD   r  r   r   r  r)  r  r  r"  r  r  r  rK  r  r/  r   r-   )r   r   r  s      r{   r  zTritonKernel._has_constant_maskj  sa   !!DKK$5$5$7#8!>$BSBS!S  DKK$5$5$7#8!>?1Dww77

AF !:!:33DJJ?I[[CDMMIt{{3I!;!;!DOO$55I 7788YO" W==W77##88^EUV r}   c                d    | j                   d   }|j                  dk(  sJ | j                  |      S )Nr   r  )r.  r#  r  )r   xtrees     r{   r  z TritonKernel._has_constant_xmask  s5      #||s"""&&u--r}   c                    | j                   D ]2  }| j                  |      s|j                  |j                   d       4 |j                  d       y )Nr   r   )r.  r  rE  r#  )r   r   r   s      r{   r  zTritonKernel.filter_masks  sN    $$ 	8D&&t,!!T[[M"67	8
 	&!r}   c                    t        t        j                        d | j                   D cg c]  }t        |    c}S c c}w rr   )r*  r   r   r%  r   )r   r   s     r{   get_reduction_prefixesz#TritonKernel.get_reduction_prefixes  sB     ]::;<Ud>U>UV
 t
 	
 
s   <c                   | j                   D cg c]  }|j                  s| }}dj                  t        d |D                    }|j	                  d| j                  |              | j                   D cg c]+  }|j                  rt        j                  |j                     - }}t        |      }|j	                  d| j                  |              yc c}w c c}w )z^
        Generates code that flattens ND reduction numels, block sizes, etc. into 1D.
        r  c              3  :   K   | ]  }|j                    d   yw)r  Nr  r  s     r{   r   z8TritonKernel.codegen_reduction_numels.<locals>.<genexpr>  s     "UTdkk]%#8"Ur  z	rnumel = zRBLOCK: tl.constexpr = N)
r.  r)  r   r   r   r  r   r   r   r<   )r   rA  r   reduction_treesrd  	rn_blocksrS  s          r{   r  z%TritonKernel.codegen_reduction_numels  s    
 -1,<,<RD@Q@Q4RRF"U_"UUV	$**V"4!567
 ((
   %%dii0
	 

 y)/

60B/CDE S

s   CC90Cc                |    | j                         }|D cg c]  }t        j                  | | fi | c}S c c}w )zK
        Helper to initialize symbols like rn_numel, rn_base, etc.
        )r6  r   r   )r   r  r  rn_prefixesr#  s        r{   r  z#TritonKernel._get_reduction_symbols  s=     113JUVxx0;F;VVVs   !9c                    | j                         }| j                  ddd      }t        t        |      dz
        D cg c]  }t	        ||dz   d        c}t        j                  d      gz   S c c}w )z
        Compute coefficients to convert ND reduction indices to linear indices.
        For example:
          rindex = r0_index * r1_numel * ... * rn_numel + ... + rn_index.
        r  Tr   rF   N)r6  r  r,  r  r<   r   r@  )r   r<  	rn_numelsrO  s       r{   _get_reduction_index_coeffsz(TritonKernel._get_reduction_index_coeffs  s{     113//PT/U	;@[AQTUAU;V
47M)C!GI./
]]1 	 
s   A0c                :    | j                         }t        ||      S )zK
        Compute linear reduction indices from N dimensional ones.
        )r?  r;   )r   
multi_indscoeffss      r{   r  z'TritonKernel._flatten_reduction_indices  s     113,,r}   c                $   | j                  ddd      }| j                  ddd      }| j                  |      }|j                  d| j                  |              | j                  |      }|j                  d| j                  |              y)zX
        Generates code that converts ND reduction indices into linear indices.
        r   Tr   r   z
roffset = z	rindex = N)r  r  r   r  )r   rA  
rn_offsetsrn_indsr;  rindexs         r{   r  z&TritonKernel.codegen_reduction_indices  s    
 00d 1 

 --gtQU-V 11*=
4#4#4W#=">?@009	$"3"3F";!<=>r}   c                   |j                   }|j                  r%|j                  |j                   d| d| d       n|j                  D|j                  |j                   d| j                  |              |j                  | d       n|j                  | d| j                  |       }n| j                  || d      }|j                  | d| j                  |       d|j                          d|j                   d| g       | j                  |      r(| j                         }|j                  | d	| d
       y |j                  | d|j                   d| d       y )Nr(  z	offset + r  z
offset = 0r   z	offset = r  r   zmask = tl.full(z, True, tl.int1)zmask = z < r  )r#  r  r   rC  r  r  r  r)  r  r-  r   r  r  )r   r$  r  r  r@  r  s         r{   r  z,TritonKernel.iteration_ranges_codegen_header  sb    LL==NNejj\QCy4@A^^#NNejj\T-N-Nu-U,VWXNNaS
+,+Id&G&G&N%OP881#VMOOc4#@#@#G"HAGGI;V[\zzl#dV, ""5)'')ENNaSw6FGHNNaS

|3qc?@r}   )r   TN)r  zdict[str, sympy.Expr]r  zOptional[FixedTritonConfig]r   r   r  r  r   r   r   r   )r   r   )r   )rC  r   r  r   r.  r   r   ztuple[str, str])r  r   rp  r   r>  r~   r   r~   )rC  r   r   r   rr   )
rC  r   r   r   r  rL   rj  rC   r   r   NN)r(  rL   rr  z.tuple[str, sympy.Expr, sympy.Expr, sympy.Expr]rs  rL   rt  r  r  r~   ru  z Optional[tuple[str, sympy.Expr]]rv  zOptional[CSEVariable]r   rL   )r  r   r  r  r   r   )
r  r  r:  r  r  rB   r  +Union[CSEVariable, tuple[CSEVariable, ...]]r   rJ  )r  r  )rC  r   r   r   r  rJ  )r  tuple[torch.dtype, ...]r   r   )r  rK  r  zUCallable[[tuple[CSEVariable, ...], tuple[CSEVariable, ...]], tuple[CSEVariable, ...]]r(  tuple[CSEVariable, ...]r   rL  )
r  rK  r(  rL  re  r~   rf  r~   r   rL  )r   r]  )r   z type[triton_heuristics.GridExpr])rC  r   rH  zOptional[IRNode]rm  )r   r  )r$  rY   )r$  rZ   r   r   )r$  rZ   r  r   r   r   )r$  rZ   r   r~   )r#  r   r   r>  )r   rZ   r   r~   )r   r   r   r   )rA  rN   r   r   )r  r   r   zlist[sympy.Symbol]r^  )rA  r  r   r   )r$  rZ   r  rN   r   r   )Mrs   r   r   r  r%  r   r:  r  r  r
  r  r  r  r  r  r  r  r  r   r  r.  r/  r5  rB  r=  r>  rm  rh  r~  r  r  r  r  r  r  r  r  r  r  r  r0  r\  rh  r|  r  r  r  r  rb  r  r  r  r  r  r  r~  r  r  r!  r%  r  r)  r-  rH  r  r  r  r  r3   r6  r  r  r?  r  r  r  r  r  s   @r{   r  r  T  s   %I%%).E&.O
 48%3%%3
 2%3 
%3N"

#J*:0

 " " fTfTR	 EG- - !- -<- 	- ^8.MM M 	M
 M4EP SW(( *(3>(FO(	(T  480411 C1 &	1
 $1 1 11 .1 
1f.
"JJ J &	J
 ;J 
5JX&DO&"9
v5(&!
%6(( ( ;	(T1Sfm"'m"
m" (m" 
!m"^>"'>" (>" 	>"
 >" 
!>"@b%H Xt

 % %N`D    $;LC
-8(,2&J:(:14:	:
0
)V.
" 
 
F$W 
 
-? A(A0>A	Ar}   r  c            
      `    e Zd ZU eZded<    eej                  ej                  ej                  ej                  ej                  ej                  ej                  ej                  g      Zd fdZedd       Zd Zd ZdddZ	 d	 	 	 ddZ	 	 	 	 	 	 	 	 dd	Z	 	 	 	 	 	 	 	 dd
Zd Z xZS )TritonSchedulingz	type[Any]kernel_typec                    t         |   |       |t        |d      sy |j                  D ]$  }t	        |t
        t        f      st        |_        & y )NrF  )	r	  r
  r   rF  rh  r2   r0   debug_triton_codedebug_device_str)r   	schedulerrH  r  s      r{   r
  zTritonScheduling.__init__  sM    #GIw$?OO 	:D$0B CD(9%	:r}   c                    t         j                  j                  st         j                  j                  r't	        g | j
                  t        j                        S | j
                  S rr   )r   r   cooperative_reductionsforce_cooperative_reductionsr   backend_featuresrI   REDUCE_TO_SINGLE_ELEMENT)rw   r  s     r{   get_backend_featuresz%TritonScheduling.get_backend_features  sR     MM00}}99P#&&P(O(OP  ###r}   c                   t         j                  j                  }t        ||      \  }}|r|j	                  |       t
        j                  rvddlm}m	 t        fd|D              sY|D cg c]  }t        ||      r|j                           }}|j	                  |j                   ddj                  |              y y y c c}w )Nr   )r/   ForeachKernelSchedulerNodec              3  6   K   | ]  }t        |        y wrr   )rh  )r   r  r[  s     r{   r   z3TritonScheduling.codegen_comment.<locals>.<genexpr>1  s      >?
189s   z Fused node name list: r   )rD   r  r  r7   make_commentr   debug_fusiontorch._inductor.schedulerr/   r[  r   rh  get_namecommentr   )	r   node_scheduler  origins_detailed_originsr/   r  
node_namesr[  s	           @r{   codegen_commentz TritonScheduling.codegen_comment%  s    ''&&%8%P""  )
  CP  +!!%67 JJL
 
 $$''>tyy?T>UV s   .#Cc                   t         j                  j                  }||j                  v r|j                  |   }|S t        j
                  j                  r$t        |t        j
                  j                        nd}t        |      d d }dj                  d|||j                         g      }||j                  |<   t        j
                  j                  r|nd}|j                  t        t        j                        |      }|j                  t        t        j                         |      }|j                  dd      }t#        t%        |j'                               d      \  }	}
}t)               }t*        j-                         rt*        j                  ||       |j/                  d	|d
       |j1                  |d       t         j                  j3                         }|j/                  d|j4                   d       d| }t7        ||      \  }}|d|z   dz   |z   z  }|j9                  ||j;                         |       t=        j>                  d      rt=        j@                  |||       |S )Nr   r   r  r   triton_z#pragma CMT#pyzasync_compile.triton(z, '''Tr  z''', device_str='z')z# kernel path: r  kernel_metadata)!rD   r  r  src_to_kernelr   r   descriptive_namesr6   rE   r   next_kernel_suffixunique_kernel_namesreplacer   r9   r  r  r#   r"   r  rN   async_compileuse_process_poolr   r   r  r  r7   define_kernelr   r    is_metric_table_enabledlog_kernel_metadata)r   src_coderb  r  r  r  
fused_namekernel_category	subs_name	_basenamer  kernel_pathcompile_wrapperr  metadata_commentrc  detailed_originss                    r{   rs  zTritonScheduling.define_kernel?  s4   ''&&w,,,!//9Kj c ==22 &mV]]5T5TU 
 AJ2ANO((?J8R8R8TUK /:G!!(+'-}}'H'HiI
  ''K,H,H(I;WH''K,C,C(DiPH  ''s;H(08>>;K1Ld(S%Iq+,.O--/ $$Y9%%(=i]%&PQ""84"8WW@@BN%%(9.:M:M9Nb&QR!0>(;M7(S%G%w 58H HH!!_5579I ../@A++KhOr}   c                    | j                  |d      }t        j                  |      }| j                  ||t	        d |D                    S )z
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        T)r  c              3  <   K   | ]  }|j                           y wrr   r`  r   r  s     r{   r   z9TritonScheduling.benchmark_fused_nodes.<locals>.<genexpr>  s     :WA1::<:Wr  )re  )generate_kernel_code_from_nodesr$   r>  benchmark_codegened_moduler   )r   rF  n_spills_thresholdrv  r\  s        r{   benchmark_fused_nodesz&TritonScheduling.benchmark_fused_nodesy  sV    
 77PT7Ux(..#
:WQV:W0W / 
 	
r}   c                  	
 t        t        j                  j                        }t	               5  |j                  t        j                  j                               5  dfd

fd}
fd}||nt        dg      }t        j                  d|j                          |        j                  fcddd       cddd       S j                         	j                  j                  	   j                  	 d          j(                  }t+        |      d
k(  sJ |d   j,                  |kD  rt'        d	      nNt/        j0                  	fd      t+        j2                        dkD  rt/        j0                  	fd      z
  t        j                  d|        |        j                  fcddd       cddd       S # t        $ rn}t         j"                  j$                  r t        j                  d||       t'        d	       |        j                  fcY d}~cddd       cddd       S d}~ww xY w# 1 sw Y   nxY wddd       y# 1 sw Y   yxY w)z$Benchmark an already compiled moduleNc                 ~     j                   J t        j                  j                   j                         d   dz   S Nr   z.kernel_perf__file__ospathsplitextr\  s   r{   cache_file_pathzDTritonScheduling.benchmark_codegened_module.<locals>.cache_file_path  s6    ||///ww''5a8>IIr}   c                 >            } t        | t                     y rr   r%   r   )r  r  mss    r{   store_cachez@TritonScheduling.benchmark_codegened_module.<locals>.store_cache  s    &(T3r7+r}   c                             } t         j                  j                  |       r.t        |       5 }t	        |j                               cd d d        S y # 1 sw Y   y xY wrr   )r  r  existsopenr  readr  fdr  s     r{   
load_cachez?TritonScheduling.benchmark_codegened_module.<locals>.load_cache  sM    &(77>>$'d 0r$RWWY/0 00s   AA unknown%kernel src code for %s written to: %sr   z*Exception (%s) in compiling fused nodes %sinfrF   c                 4      j                     d         S r?  
clone_argsrE  callwrapped_jit_functions   r{   r  z=TritonScheduling.benchmark_codegened_module.<locals>.<lambda>       D!@!5!@!@$!G!JK r}   c                 "     j                     S rr   r  rE  r  s   r{   r  z=TritonScheduling.benchmark_codegened_module.<locals>.<lambda>  s     ? 4 ? ? F r}   z+The fused kernel for %s took %.3f ms to run)r   rD   r  r  r   r  r  r   r  debugr  get_argsr  rh  r  	Exceptionr   r   .disallow_failing_autotune_kernels_TESTING_ONLYr  	launchersr  n_spillsr(   benchmark_gpur  )r   r\  r  re  device_interfacer  r  r  r  rE  r  r  r  r  s    `       @@@@@r{   r  z+TritonScheduling.benchmark_codegened_module  sJ    4AGG4G4GH O	$##AGG$G$G$IJO	$ BJ, )4
*i[:Q  II7
 B~3<<'?O	$ O	$ O	$B <<>D88D#&;; (4)44d;A>? -66Iy>Q&&& |$$'995\ !..K +==>Bk77F B II=
 Ms||#_O	$ O	$ O	$N  
(==OO		@
 5\3<<''cO	$ O	$ O	$N
(OO	$ O	$ O	$ O	$ O	$sh   .I%"AI<	I%(I8GB3I	I%	IAI0I1I5	I%III	I%%I.c                   |j                  d      }|xr  t        d |j                         D              }| j                  }|rddlm} |}|rd|d<   |j                  d      r
d|d	<   d|d<   t        j                  |j                        s|j                  d	      rJ d|d	<   t        j                  j                  ||||      } ||i |}| j                  |||      S )
Nr\  c              3  <   K   | ]  }|j                           y wrr   )is_split_scan)r   rH  s     r{   r   z9TritonScheduling.create_kernel_choices.<locals>.<genexpr>  s      (
%)D (
r  rF   )TritonSplitScanKernelFoverride_cooperative_reductionrh  Toverride_persistent_reduction)contains_opr   scheduler_nodesrO  triton_split_scanr  r  r  reduction_numelrd  rD   r  triton_kernel_kwargsadd_multi_kernel_choices)	r   kernel_featureskernel_argskernel_kwargsis_scanr  rO  r  r  s	            r{   create_kernel_choicesz&TritonScheduling.create_kernel_choices  s     "--f5 
C (
-<-L-L-N(
 %
 +/*:*:@/K>CM:; &&v.=AM9:>CM:;11/2Q2QR$(()HIII=BM9:		66+}
 k;];,,V[-PPr}   c           	        |g}t         j                  j                  s|S |j                  xr |j	                  d       }|j
                  xr |j	                  d       }|r%|j                   | j                  |i |ddi       |r|j                  j                  }t        j                  j                  j                  |d      r[|j                   | j                  |i |ddix}       |r2|j                  r&|j                   | j                  |i |ddd       t        |      dkD  r.|dd  D ]  }	|j                  |	_         |j!                  d        |S )	Nr  r  Fi   )r  r  rF   c                    | j                   S rr   )r  )ks    r{   r  z;TritonScheduling.add_multi_kernel_choices.<locals>.<lambda>1  s    q'='= r}   r  )r   r   multi_kernelr  rd  r  ri  rO  r(  r  rD   r  r   r   r  must_keep_buffersrh  )
r   r  r  r  kernelsoptional_persistentoptional_cooperativerd  r  kernel2s
             r{   r  z)TritonScheduling.add_multi_kernel_choices  s    (.h}}))N$99 
-BSBS+C
 ?
  &;;  
MDUDU,E
 A
 NN    # 38  __44Fww44VUC-T--$' 8= E '5+E+ENN((((+ <A:?	 w<!"12; E,2,D,D)E LL=L>r}   c                   fdfd}fd}dg }}d}t         j                  j                  }t        |      t         j                  _        t         j                  j                  }t        |      t         j                  _        t
        j                  dkD  }	t
        j                  dkD  }
| j                  |d|	|
d      }|D ]  \  }}}|D cg c]  }|j                          }}|D cg c]  }|D ]  }|j                           }}}|j                  t        t        j                        d      }t        j                   |      t"        j%                  d	|j&                          |       \  &|z  }|z  }|j)                  j&                         ܉j+                         j,                  j.                    j0                   d          j2                  }t5        |      d
k(  sJ |d   j6                  dkD  rt9        d      xn3t;        j<                  fd      t;        j<                  fd      t"        j%                  dt        d |D                      |        |z  }|z  }|j)                  j&                          |t         j                  _        |t         j                  _        |||fS c c}w c c}}w )Nc                 ~     j                   J t        j                  j                   j                         d   dz   S r  r  r  s   r{   r  z@TritonScheduling.benchmark_combo_kernel.<locals>.cache_file_path9  s6    <<+++77##CLL1!4~EEr}   c                             } t         j                  j                  |       rCt        |       5 }t	        d |j                         j                         D              cd d d        S y# 1 sw Y   yxY w)Nc              3  2   K   | ]  }t        |        y wrr   )r  )r   r  s     r{   r   zNTritonScheduling.benchmark_combo_kernel.<locals>.load_cache.<locals>.<genexpr>A  s      Eaq Er  rI  )r  r  r  r  r  r  splitr  s     r{   r  z;TritonScheduling.benchmark_combo_kernel.<locals>.load_cache=  s^    "$Dww~~d#$Z F2  E2779??3D EEF FFs   .A,,A5c                 \            } t        | t              dz   t              z          y )Nr  r  )r  r  r  ms_clones    r{   r  z<TritonScheduling.benchmark_combo_kernel.<locals>.store_cacheD  s&    "$Ds2w}s8}<=r}   r   g        T)subkernel_nodescustom_part_algorithmenable_autotunemixed_sizesonly_gen_src_coderh  r  rF   r  c                 4      j                     d         S r?  r  r  s   r{   r  z9TritonScheduling.benchmark_combo_kernel.<locals>.<lambda>{  r  r}   c                 (     j                     d   S r?  r  r  s   r{   r  z9TritonScheduling.benchmark_combo_kernel.<locals>.<lambda>~  s    ;0;;TB1E r}   zDThe fused kernel for %s took %.3f ms to run, %.3f ms to clone inputsc              3  <   K   | ]  }|j                           y wrr   r  r  s     r{   r   z:TritonScheduling.benchmark_combo_kernel.<locals>.<genexpr>  s     <A1::<<r  )rD   r  r  r   inplaced_to_remover   combo_kernels_autotunecombo_kernel_allow_mixed_sizesgenerate_combo_kernel_code	get_nodesr`  rp  r   r9   r  r$   r>  r  r  r  ri  r  r  rh  r  r  r  r  r  r(   r  )r   	node_listr  r  total_ms	file_listtotal_clone_msremoved_buffers_originplaced_to_remove_origr  r  kernel_code_listrv  r  
node_grouprH  fused_node_listsrF  r  namesr  rE  r  r  r\  r  r  r  s                        @@@@@@@r{   benchmark_combo_kernelz'TritonScheduling.benchmark_combo_kernel4  s   
	F	 	>  ) # ww66",-A"B"#''"<"<%/0G%H" 77!;;;a?::%"&+#" ; 
 (8 2	+#Ha=GHT 0HH/?OeOAQZZ\O\OEO''K,C,C(DiPH""8,CII7
 &<LB~B(*  .<<>D88D#&;;  0%00$7:;,66Iy>Q&&&|$$q( %e,X !..K '44E IIV<<<	 MNHh&NS\\*e2	+f #7%<"22i  IOs   K7K#)rS  zOptional[Scheduler]r   r   )r  ztorch.device)   )r   tuple[float, str])r  N)re  zOptional[OrderedSet[str]]r   r  )r  rh   r  	list[Any]r  r'  r   list[TritonKernel])r  r  r  r  r  r'  r   r  )rs   r   r   r  rO  r   r   rI   FOREACH	BUCKETIZEINPLACE_BUFFERSMASKED_SCATTER_WITH_INDEXSCANSORTTRITON_TEMPLATESTUPLE_REDUCTIONrW  r
  r   rY  rf  rs  r  r  r  r  r  r  r  s   @r{   rN  rN    s   )K)!""$$**44++**		
: $ $48t	
 RVT$5NT$	T$l#Q+#Q #Q &	#Q
 
#QJ33 3 &	3
 
3jY3r}   rN  c                   g }| j                         }|t        |t        j                        sJ |r0|j                  $|j                  | j                          d       |S ddlm} | j                         }|J | j                  j                  |      }t        |t        |f      sJ dt        |              t        j                  j!                  |      5  t"        j$                  }|j'                  | j)                               j+                         }|t"        _        d d d        |j                  | j                          d       |j                  t-        j.                  d             |S # 1 sw Y   RxY w)Nz" Unfinalized multi template bufferr   )CUDACombinedSchedulingz]Scheduling backend should be SIMD or CUDACombined when generating debug Triton strings, got: z Triton code:z    )get_template_noderh  r   MultiTemplateBuffermake_kernel_renderri  r`  0torch._inductor.codegen.cuda_combined_schedulingr  r  rS  get_backendr\   r  rD   r  set_current_devicer    generated_kernel_countr  r  r  r  ro  )rH  linesmulti_templater  r  backendold_generated_kernel_counttriton_codes           r{   rQ  rQ    sd   E++-N!Z@V@V%WWW.;;C((JKL2 L/	
 "!!!..,,V4'N4J#KL 	
klpqxlykz{	
L WW''/ 	H *1)G)G&!AA eg  .HG*	H 	(67X__[&9:L	H 	Hs   A	E66E?r   )r  r   rj  r   rk  r   r   r   rH  )r  r  r   r  )r  r  r   r~   )r  zUnion[CSEVariable, Any]r   r~   )r   rf   ra  )ry   r~   r   zCallable[[_T], _T])rH  r/   r   r]  )
__future__r   ry  re  r  r!  r&  loggingr  r  r  r  collections.abcr   r   r   typingr   r   r	   r
   r   r   r   sympy.printing.precedencer   rt   torch._loggingtorch.utils._pytreer  _pytreer  torch._dynamo.device_interfacer   torch._dynamo.utilsr   r   torch._prims_commonr   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._tritonr   utils._sympy.symbolr   r   r   r   utils._sympy.value_rangesr   r   r   r   r    rq  r!   	codecacher"   r#   r$   r%   ops_handlerr&   runtimer'   runtime.benchmarkingr(   runtime.hintsr)   r*   r+   r,   runtime.runtime_utilsr-   r.   rS  r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   virtualizedrA   ro  rB   rC   rD   wrapper_benchmarkrE   block_analysisrG   commonrH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   simdrW   rX   rY   rZ   r[   r\   triton_utilsr]   r^   r_   r`   ra   r  rb   typesrc   rd   r  rf   rg   simd_kernel_featuresrh   ri   	getLoggerrs   r  _logginggetArtifactLoggerperf_hint_logschedule_log
fusion_logrn   r   r   r   	dataclassr   r   r  rr  r  r:  r  r  r  r  r  r  r  r  r3  r5  _initialize_pointwise_overridesr  r[  r   ru  r  r   r  r  r  rN  rQ  r   r}   r{   <module>r)     s6   "         	  .  F F  0   $ $ C < 0 / K K 2 X X 4 " " ( F F ( ' .  D W W     C B B /    "   %  L8	Bg!00<H~~//*E^^--hA
6 6  4 $ 4 *, ,: 
 
 
@ c+ c+ c+L	++/+<P++>jQM jQZ 	3
&8
;P *(.bO&k O&d  / / 9q$O q$h$+ $+N : : :&! !H # # #
%uS%S/-A'BBC 
l&A:/0 l&A^MJ3~ J3Zr}   