
    rhu                   d   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlmZ d dlmZmZmZmZmZmZmZ d dlmZ d dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d d	lm Z  d d
l!m"Z"m#Z#m$Z$ d dl%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5m6Z6m7Z7 erddl.m8Z8 ddl9m:Z: ddl;m<Z<m=Z= ddl/m>Z>m?Z?m@Z@ ddlAmBZBmCZCmDZDmEZEmFZFmGZGmHZHmIZImJZJmKZKmLZL ddlMmNZNmOZOmPZP ddlQmRZR ddlSmTZTmUZUmVZVmWZW ddlXmYZY ddlZm[Z[m\Z\m]Z]m^Z^m_Z_ erd dl`maZambZbmcZc d dlmdZd  ej                  ef      Zgej                  j                  efd       Zjej                  j                  efd!      Zkej                  j                  efd"      Zl eW       j                  Zn e g d#      Zod8d9d$Zpej                   G d% d&             Zr G d' d(er      Zs G d) d*er      Ztd:d+Zu ed,eTeT-      Zv G d. d/eVev   eev         Zw G d0 d1e?      Zx ej                  d23       G d4 d5             Zy G d6 d7ez      Z{y);    )annotationsN)Counter)AnyCallableGenericno_type_checkOptionalTYPE_CHECKINGUnion)TypeVar)analyze_memory_coalescing)free_unbacked_symbols)immutable_dict)
OrderedSet)FloorDivIdentityModularIndexing)free_symbol_is_type
prefix_strsymbol_is_typeSymT   )counters   )configir	scheduler)prologue_preserves_zero_mask)	code_hash)	MemoryDepStarDepWeakDep)IRNode)!indexing_dtype_strength_reduction)
green_textyellow_text)BaseSchedulerNodeBaseScheduling	WhyNoFuse)cache_on_selfexpr_fits_within_32bitget_dtype_sizeIndentedBufferPlaceholderprefix_is_reduction'set_kernel_post_grad_provenance_tracingsympy_index_symbolsympy_product
sympy_subsunique)ops
OpsWrapperV   )BlockPatternMatcher)CSEVariableindex_prevent_reorderingKernelPythonPrinter)MultiKernel)DisableReductionEnableReductionNodeScheduleEntryNodeScheduleMarkerSIMDKernelFeatures)IterableIteratorSequence)CoalesceVarAnalysis
perf_hintsschedulefusion)zyxr0_r1_c                j    t         j                  j                  j                  j                  }||S | S N)torch	_inductorr   triton	max_tiles)defaultrU   s     o/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/torch/_inductor/codegen/simd.pyget_max_tilesrX   Y   s-    &&--77I!-9:7:    c                       e Zd ZdZej
                  j                  ej
                  j                  d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZee	e
dd                     Zd	dZee	e
d
d                     Z xZS )IterationRangesa  
    Each range tree represents multiple sets of iteration indexing
    in a single tiled dimension in the output kernel.

    If you have two loops ranges one (4, 3, 2) and another (4, 6),
    then the range tree will be:
            4 (i0)
        3 (i1)  6 (i3)
        2 (i2)
    Where i0 is shared between both loops, but then the split into
    different indexing vars.  All loop ranges must iterate over
    the same number of elements.
    )divisorlengthc                   t         
|           || _        || _        || _        || _        || _        || _        || _        || _	        |	| _
        y rQ   )super__init__namevar_list
var_rangesnumelprefixr\   r]   kernelroot)selfra   rb   rc   rd   re   rf   r\   r]   rg   	__class__s             rW   r`   zIterationRanges.__init__n   sO     		 $
	rY   c                ,    t        | j                        S rQ   )r/   re   rh   s    rW   is_reductionzIterationRanges.is_reduction   s     #4;;//rY   c                ,    t        | j                        S rQ   )r1   ra   rk   s    rW   symbolzIterationRanges.symbol   s    !$)),,rY   c                z    t        j                         D ci c]  \  }}||
 }}}|| j                     S c c}}w rQ   )r   itemsre   )rh   symtre   prefix_to_symts       rW   rq   zIterationRanges.symt   s>     <F;K;K;MN<4&$,NNdkk** Os   7)ra   strrb   list[sympy.Symbol]rc   dict[sympy.Symbol, sympy.Expr]rd   
sympy.Exprre   rs   rf   
SIMDKernelrg   IterationRangesRootreturnNonery   boolry   zsympy.Symbol)ry   r   )__name__
__module____qualname____doc__sympySOner`   propertyr*   r   rl   rn   rq   __classcell__ri   s   @rW   r[   r[   ^   s    . ww{{ % 3	
    " 
0 0   0- +   +rY   r[   c                       e Zd ZdZ	 d
	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZddZddZddZddZ	 	 	 	 ddZ	ddZ
	 	 	 	 dd	Z xZS )rx   z
    Root of a iteration range tree that represents a single
    tiled dimension in the output kernel. It contains multiple
    sets of iteration represented with IterationRangesEntry.
    c          	         |i }t         |   |g i ||||        || _        i | _        || _        |r| j
                  r|	J || _        || _        |	| _        |
| _	        y )N)ra   rb   rc   rd   re   rf   rg   )
r_   r`   indexnodes	pid_cacherl   is_loop
tensor_dimgrid_dimhas_zdim)rh   ra   rd   re   r   rf   r   r   r   r   r   ri   s              rW   r`   zIterationRangesRoot.__init__   s     I 	 	
 
=?
 *3 t00X5EFF$  rY   c                <    d| j                   d| j                   dS )NzIterationRangesRoot(, z, ...))ra   rd   rk   s    rW   __repr__zIterationRangesRoot.__repr__   s    %dii]"TZZLGGrY   c                b    | j                   j                         D ]  }|j                           y rQ   )r   valuescache_clear)rh   nodes     rW   r   zIterationRangesRoot.cache_clear   s*    JJ%%' 	D	rY   c                2    t        | j                   d      S )Nr   )r1   re   rk   s    rW   	index_symzIterationRangesRoot.index_sym   s    !T[[M"788rY   c                   t         j                  j                  j                  ||z  | j                        rt        | j                         |      }nt        | j                         ||      }|| j                  vrt        | j                   t        t         j                  j                         ||||       }|t         j                  j                  |j                         <   | j                   j#                  |j                                || j$                  |j                         <   || j                  |<   | j                  |   S )zF
        Lookup a given RangeTreeEntry, creating it if needed
        )r7   graphsizevarsstatically_known_equalsrd   r   r   r   r   IterationRangesEntryre   nextrf   iter_vars_countrange_tree_nodesrn   rb   appendrc   )rh   r\   r]   exprr   s        rW   lookupzIterationRangesRoot.lookup   s     7733Gf4DdjjQDNN,g6D"4>>#3WfEDtzz!';;-QXX%=%= >?@D 8<AHH%%dkkm4MM  /-3DOODKKM*#DJJtzz$rY   c                    t         j                  j                  }g }t        |      D ](  }|j	                  | j                  ||             ||z  }* g t        |      S rQ   )r   r   r   reversedr   r   )rh   lengthsr\   itervarsr]   s        rW   construct_entriesz%IterationRangesRoot.construct_entries   s]     ''++w' 	'FOODKK89&G	' %(#$$rY   c                f    | j                  |      D cg c]  }|j                          c}S c c}w rQ   )r   rn   )rh   r   es      rW   	constructzIterationRangesRoot.construct   s'    $($:$:7$CDq
DDDs   .c           
     \  	
 dd|j                   D cg c]+  }t        j                  j                  j	                  |      - }}|D cg c]!  }|s|j
                  | j
                  k(  s |# }}|j                  fd       t        j                  j                  g 	g 
	
fd}|D ]v  }t        j                  j                  j                  |j                        s8 || j                  t        |j                                     |j                   ||       x t        j                  j                  j                  | j                         s, || j                  t        | j                                      g t#        	      g t#        
      fS c c}w c c}w )z,Figure out vars from this tree used in indexc                   t         j                  j                  j                  | j                  t
        j                        }t         j                  j                  j                  | j                  t
        j                        dk(  }|| fS )a:  
            Gets the key for sorting nodes. When two nodes have the
            same divisor, the node with length as 1 should be handled
            first so the current divisor is not changed after multiplied
            node.length. Returns `not length_is_one_hint` for ascending
            sort.
            fallbackr8   )r7   r   r   	size_hintr\   r   unbacked_symint_fallbackr]   )rM   divisor_hintlength_is_one_hints      rW   get_sort_keyz8IterationRangesRoot.vars_and_sizes.<locals>.get_sort_key   s     77++55		F$C$C 6 L   **HHv'F'F +    !&8"899rY   c                     |       S rQ    )rM   r   s    rW   <lambda>z4IterationRangesRoot.vars_and_sizes.<locals>.<lambda>  s    a rY   keyc                    j                  | j                                j                  | j                         | j                  z  y rQ   )r   rn   r]   )r   r\   
index_varssizess    rW   addz/IterationRangesRoot.vars_and_sizes.<locals>.add  s5    dkkm,LL%+GrY   )rM   r   ry   ztuple[int, bool])free_symbolsr7   rf   r   getre   sortr   r   r   r   r   r   r\   r   r   rd   r   )rh   r   sr   nr   r   r\   r   r   r   s          @@@@rW   vars_and_sizesz"IterationRangesRoot.vars_and_sizes   sK   
	:& <A;M;MNa**..q1NN!CqQ188t{{+BCC

0
1''++
	,  	D77##;;DLL'RDKK$,,)HIJ,,I	 ww77

GLGXdjj'%BCD&*%&(:(5/(:::/ OCs   0F$F)F)/F)rQ   )ra   rs   rd   rv   re   rs   r   intrf   rw   r   Optional[dict[str, str]]r   r|   r   Optional[int]r   r   r   r|   ry   rz   ry   rs   ry   rz   r}   )r\   rv   r]   rv   ry   r   )r   list[sympy.Expr]ry   zlist[IterationRangesEntry])r   r   ry   rt   )r   rv   ry   z+tuple[list[sympy.Symbol], list[sympy.Expr]])r~   r   r   r   r`   r   r   r   r   r   r   r   r   r   s   @rW   rx   rx      s     /3(!(! (! 	(!
 (! (! ,(! (! "(!  (! (! 
(!TH9 .%'%	#%E/;/;	4/;rY   rx   c                  p     e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 d	 fdZd
dZddZddZd
dZddZddZ	ddZ
 xZS )r   c                $   t         |   ||j                  |z  |j                  |j                  |j
                  |||j                  |j                  	       || _         t        j                  d       | j                        | _        || _        y )N)	ra   rd   rb   rc   re   r\   r]   rf   rg   )r_   r`   rd   rb   rc   re   rf   rg   parent	functools	lru_cache_codegencodegenr   )rh   ra   r\   r]   r   r   ri   s         rW   r`   zIterationRangesEntry.__init__)  s~     	,,'__((==== 	 
	
 0y**40?	rY   c                    d| j                    d| j                   d| j                   d| j                   d| j                   dS )NzIterationRangesEntry(r   ))ra   r\   r]   r   rc   rk   s    rW   r   zIterationRangesEntry.__repr__@  sH    &tyykDLL>DKK=PRSWS\S\R]]_`d`o`o_ppqrrrY   c                L    fd| _         d | j                   _        | _        y )Nc                      S rQ   r   )ra   s   rW   r   z/IterationRangesEntry.set_name.<locals>.<lambda>D  s    t rY   c                      y rQ   r   r   rY   rW   r   z/IterationRangesEntry.set_name.<locals>.<lambda>E      rY   )r   r   ra   )rh   ra   s    `rW   set_namezIterationRangesEntry.set_nameC  s    ##/ 	rY   c                8    | j                   j                          y rQ   )r   r   rk   s    rW   r   z IterationRangesEntry.cache_clearH  s      "rY   c                X    t         j                  j                  |        | j                  S rQ   )r7   rf   codegen_iteration_ranges_entryra   rk   s    rW   r   zIterationRangesEntry._codegenK  s    	//5yyrY   c                   g }t        | j                  t        j                        r|S t        | j                  t        t
        f      sJ t        | j                               | j                  j                  dd  D ]l  }t        |t        j                  t        j                  f      r.|j                  }t        |      dkD  sIt        d |D              s\|j                  |       n |S )Nr8   r   c              3  P   K   | ]  }t        |t        j                           y wrQ   )r   r   SIZE.0r   s     rW   	<genexpr>z8IterationRangesEntry.precomputed_args.<locals>.<genexpr>X  s       ,56N1dii0,   $&)
isinstancer   r   Symbolr   r   typeargsIntegerr   lenallr   )rh   precomputed_argsargsymbolss       rW   r   z%IterationRangesEntry.precomputed_argsO  s    -/dii.##$))h%@AR4		?RA99>>!"% 	1CcEMM5<<#@A**w<!# ,:A, ) %++C0	1  rY   c                ,    t        | j                        S rQ   )hashra   rk   s    rW   __hash__zIterationRangesEntry.__hash__^  s    DIIrY   c                X    t        |t              sJ | j                  |j                  k(  S rQ   )r   r   ra   )rh   others     rW   __eq__zIterationRangesEntry.__eq__a  s&    %!5666yyEJJ&&rY   )ra   rs   r\   rv   r]   rv   r   rv   r   r[   ry   rz   r   )ra   rs   ry   rz   r   )ry   r   ry   r   )r   objectry   r|   )r~   r   r   r`   r   r   r   r   r   r   r   r   r   s   @rW   r   r   (  sf      	
    
.s
# 'rY   r   c                    | t        d      k(  ry| t        d      k(  ryt        j                  |       ryt        |       S )Ninfzfloat("inf")z-infzfloat("-inf")zfloat("nan"))floatmathisnanrepr)values    rW   constant_reprr   f  s9    e	%-		E	;rY   CSEVariableType)boundrV   c                      e Zd ZU dZeZded<   ded<   dZded<   ded	<   	 	 	 	 d7	 	 	 	 	 	 	 	 	 	 	 	 	 d8 fd
Ze	e
ed9d                     Zd:dZd;dZe	d<d       Zd=dZ	 	 	 	 	 	 	 	 	 	 	 	 d>dZd?dZd@dZdAdZd=dZd=dZdBdZd9dZdCdZdDdZd<dZdEdZ	 	 	 	 	 	 dFdZ	 	 	 	 	 	 dFdZdGdZdHdZ e!	 	 	 	 	 	 dId        Z"e#e$jJ                  jL                  f	 	 	 	 	 	 	 dJd!       Z'e#e$jJ                  jL                  f	 	 	 	 	 	 	 dKd"       Z(	 	 	 	 dLd#Z)e#	 	 	 	 	 	 dMd$       Z*dNd%Z+dNd&Z,dOd'Z-	 	 	 	 dEd(Z.dPd)Z/dQd*Z0dRd+Z1dSdTd,Z2e3jh                  	 	 	 	 	 	 dUd-       Z5dVd.Z6e!d/        Z7d0 Z8d1 Z9d2 Z:d3 Z;d4 Z<d5 Z=dWd6Z> xZ?S )Xrw   zo
    Common base class for Triton/Halide codegen which both use flattened indexing rather than loop nests.
    zCallable[[sympy.Expr], str]sexprkexprFr|   allow_block_ptrrs   kernel_namec                    |i }t         
           | _        |j                          _        t                _        t                _        |j                         D ci c]/  \  }}|t        j                  j                  j                  |      1 c}} _        g  _        i  _        t!        j"                          _        |j'                          _        ||n j+                          _        | _        ||n j1                          _         j5                          _        d  _        t:        j<                  d fd       }	|	 _         jA                  |       y c c}}w )Nc                    t         j                  j                  j                  | j	                               } j
                  D ]  }j                  | |      }  j                  |       S rQ   )r7   r   r   simplify_with_rangesrc   range_treescombine_contiguous_dimscombine_modular_indexing_pairs)r   treerh   s     rW   simplify_indexingz.SIMDKernel.__init__.<locals>.simplify_indexing  sb    GG$$99%ARSE(( B44UDAB 66u==rY   )r   rv   )!r_   r`   featuresget_mutations	mutationsr-   bodyindexing_coderp   r7   r   r   simplifynumelsr  r   	itertoolscountr   rl   inside_reduction should_use_cooperative_reductioncooperative_reductiontiling_scoresshould_use_persistent_reductionpersistent_reductionwant_no_x_dimno_x_dimr   r   cacher  initialize_range_tree)rh   tilingr  r   override_persistent_reductionoverride_cooperative_reductionr  re   valr  ri   s   `         rW   r`   zSIMDKernel.__init__}  sO    I !//1"$	+-FLlln
7BvsFAGG$$--c22
 79JL(0 ( 5 5 7 .9 +668 	"
 ?L -8 *557 	!
 **,(, 
	> 
	> "3""9-?
s   "4E"c                :    t        d | j                  D              S )Nc              3  2   K   | ]  }t        |        y wrQ   )r/   )r   re   s     rW   r   z0SIMDKernel.num_reduction_dims.<locals>.<genexpr>  s     I6&v.I   )sumr  rk   s    rW   num_reduction_dimszSIMDKernel.num_reduction_dims  s     IT[[IIIrY   c                    t         rQ   NotImplementedError)rh   dtypes     rW   dtype_to_strzSIMDKernel.dtype_to_str      !!rY   c                6    | j                   j                         S rQ   )r  select_index_dtyperk   s    rW   get_index_dtype_as_torch_dtypez)SIMDKernel.get_index_dtype_as_torch_dtype  s    }}//11rY   c                @    | j                  | j                               S rQ   )r,  r0  rk   s    rW   index_dtypezSIMDKernel.index_dtype  s      !D!D!FGGrY   c                     yNFr   rk   s    rW   r  zSIMDKernel.want_no_x_dim      rY   c                   t        fdt        D              }| xs | }d	d}g d}	t        t        |	            }
ddg}|r|}n
|r|
}n|
|z   } |||      } ||	t              }g }t	        |      D ]s  \  }}t        |      }|j                  |      }|j                  |      }||n|}|j                  t        | d|   ||| ||xr | j                   ||dv 
             u |S )
Nc              3  ,   K   | ]  }|v s|  y wrQ   r   )r   re   r  s     rW   r   z3SIMDKernel.construct_range_trees.<locals>.<genexpr>  s      %
61AF%
   	c                `    t        fd| D              D ci c]  \  }}||
 c}}S c c}}w )Nc              3  ,   K   | ]  }|v s|  y wrQ   r   )r   r"  masks     rW   r   zOSIMDKernel.construct_range_trees.<locals>.filtered_index_map.<locals>.<genexpr>  s     2U3PT32Ur8  )	enumerate)seqr;  idxr"  s    `  rW   filtered_index_mapz<SIMDKernel.construct_range_trees.<locals>.filtered_index_map  s3    )22U#2U)U%S#S  s   *)rM   rL   rK   rN   rO   r   rK   )r   r   r   r   r   )ry   zdict[Any, int])
r   all_prefixeslistr   r<  r/   r   r   rx   r  )rh   r   r  rl   r  r  active_prefixesno_r_dimr?  	grid_dimspointwise_tensor_dimsreduction_dimstensor_dimstensor_dim_mapgrid_dim_mapr  ire   r   r   r   s       `                rW   construct_range_treesz SIMDKernel.construct_range_trees  s3    % %
!-%
 
 (';|+;	
 $	 $Xi%8 9(K/K/.@K ,KI))\B"?3 	IAv.v6L'++F3J#''/H!)AxE#he$6N'(J1J1J-J)% F]	& rY   c                    | j                  || j                  | j                  j                         | j                  | j
                        }| j                  j                  |       y rQ   )rK  r  r  rl   r  r  r  extend)rh   r   r  s      rW   r  z SIMDKernel.initialize_range_tree  sR    00!!MM&&(KKMM
 	,rY   c                     y)zr
        Hook called right before codegen with every index that will be
        used in the fused kernel.
        Nr   )rh   indicess     rW   finalize_indexingzSIMDKernel.finalize_indexing  r   rY   c                v    | j                   }d| _         	 | j                  |||      || _         S # || _         w xY wr4  )r  store)rh   ra   r   r   priors        rW   store_reductionzSIMDKernel.store_reduction  s;    %% %	*::dE51$)D!ED!s   / 	8c                     yr4  r   rk   s    rW   r  z+SIMDKernel.should_use_cooperative_reduction  r5  rY   c                     yr4  r   rk   s    rW   r  z*SIMDKernel.should_use_persistent_reduction  r5  rY   c                t    t        t        j                  j                  d | j                  D                    S )Nc              3  P   K   | ]  }|j                   j                            y wrQ   )rc   rp   r   r
  s     rW   r   z(SIMDKernel.var_ranges.<locals>.<genexpr>  s"      *,0%%'*r   )dictr  chainfrom_iterabler  rk   s    rW   rc   zSIMDKernel.var_ranges  s4    OO)) *484D4D* 
 	
rY   c                :    t        d | j                  D              S )Nc              3  J   K   | ]  }t        |j                  d u        y wrQ   )r   r   rY  s     rW   r   z0SIMDKernel.triton_tensor_ndim.<locals>.<genexpr>  s     Q3td23Qs   !#)r&  r  rk   s    rW   triton_tensor_ndimzSIMDKernel.triton_tensor_ndim  s    Q@P@PQQQrY   c                ^    dg| j                         z  }d||<   ddj                  |       dS )Nrz   :[r   ])r_  join)rh   rJ  r   s      rW   indexing_size_strzSIMDKernel.indexing_size_str   s9    42244a499U#$A&&rY   c                    dg| j                         z  }| j                  D ]R  }|j                  |j                  r| j                  s)|j
                  j                          d||j                  <   T |S )N1BLOCK)r_  r  r   rl   r  re   upper)rh   r   r
  s      rW   dense_size_listzSIMDKernel.dense_size_list%  sv    //11$$ 	GD&$$(=(=,0KK,=,=,?+@)Fdoo&	G rY   c                L    | j                         }ddj                  |       dS )Nrb  r   rc  )rj  rd  rh   r   s     rW   dense_size_strzSIMDKernel.dense_size_str/  s)    $$&499U#$A&&rY   c                   t        |t              s|S |j                  d   }| j                  j	                  |      x}|S t        |||j                  i      }t        j                  j                  j                  |      }t        ||j                  j                         |j                  j                  t        j                  j                   |j                  j"                        j%                         i      S Nr   )r   r   r   r   r   r3   r   r7   r   r   r	  rg   r   r   r   r   r   rd   rn   )rh   r   rM   	tree_node	new_indexs        rW   r	  z)SIMDKernel.combine_modular_indexing_pairs3  s    %1LJJqM..22155I>Luq)..&9:	GG$$CCIN	((*INN,A,AGGKK!5!5-&(
 	
rY   c                    t         j                  j                  j                  |      x}r!|\  }}t	        | j                  ||      |      S | j                  ||      S rQ   )r7   r   r   expand_floor_divr   _combine_contiguous_dims)rh   r   r
  
expand_resrq  denominators         rW   r  z"SIMDKernel.combine_contiguous_dimsE  s[     ))::5AA:A%/"I{D99)TJKXX00==rY   c                   t        |t        j                  t        j                  f      r|S |j	                  |      \  }}t        |      dk  r|S t        j                  j                  j                  ||t        |g||            \  }}}||k(  r|S |j                  |      }t        |t        t        | ||                        }	|	S )zI
        More aggressive simplification to merge contiguous dims
        r8   )r   r   r   r   r   r   r7   r   r   _simplify_loopsr;   r   r3   rZ  zip)
rh   r   r
  r   r   	new_sizesreindex_prunenew_index_varsrq  s
             rW   rt  z#SIMDKernel._combine_contiguous_dimsN  s     eemmU\\:;L //6
Eu:?L%&WW%5%5%E%E7US&
"	7F L	2ud3z7>;R+S&TU	rY   c                      j                   d   j                  xs  j                  t        j                   fd       } |       S )Nc               3     K    j                   j                         s j                  rJ d  y r j                          d _        	 d  r j                          d _        y # d _        w xY ww)NFT)r  rl   r  codegen_body)rh   should_flushs   rW   ctxz)SIMDKernel.disable_reduction.<locals>.ctxe  sl     ==--/0000 !!#$)D!-%%'(,%%s   AA5A) !A5)	A22A5)r  r   r  
contextlibcontextmanager)rh   r  r  s   ` @rW   disable_reductionzSIMDKernel.disable_reductionb  sE    ''+33Qt7Q7Q		"	"	- 
#	-$ urY   c                    t        |      t        | j                        k(  sJ t        || j                        D cg c]  \  }}|j                  |       c}}S c c}}w rQ   )r   r  ry  r   )rh   r   r]   rangess       rW   
set_rangeszSIMDKernel.set_rangesz  s]    7|s4#3#34444 #&gt/?/?"@
 V$
 	
 
s   Ac                D   t        d |D              r| D cg c]  }g  c}g fS t        j                  j                  | D cg c]  }g  c}| D cg c]  }j	                  |       c}t        j                         d
fd}	 	 	 	 	 	 	 	 dd}g }d}|D ]K  }	g }
|	D ]/  }j                  |d      r|
j                  d        )|t              k  r>j                  |   d      r)|dz  }|t              k  rj                  |   d      r)|dz   t              k  roj                  ||         rZj                  ||         st        |   }t        ||         }|
j                   || |||       ||dz   |                   |t              k  s|
j                  t        j                   |||                   2 |j                  |
       N t        d D              sJ d d	|        |fS c c}w c c}w c c}w )Nc              3  8   K   | ]  }t        |      d k(    ywr   N)r   )r   r]   s     rW   r   z5SIMDKernel._split_iteration_ranges.<locals>.<genexpr>  s     6Fs6{a6s   c                    j                  |      }j                  |    |      st        t        |    |      | <   |    j	                  |       t              S rQ   )r  statically_known_multiple_of	CantSplitr   r   r   )rJ  r   
new_ranges	remainingsv	var_counts     rW   	add_rangez5SIMDKernel._split_iteration_ranges.<locals>.add_range  sZ    ;;t$D229Q<F#IaL$7IaLqM  &	?"rY   c                     d fd}|S )Nc                     |    z  |    z   S rQ   r   )	flat_varsidx1idx2sizes    rW   getterzISIMDKernel._split_iteration_ranges.<locals>.make_combined.<locals>.getter  s    io-	$??rY   )r  r   ry   rv   r   )r  r  r  r  s   ``` rW   make_combinedz9SIMDKernel._split_iteration_ranges.<locals>.make_combined  s    @ MrY   r   r8   c                6    t         j                  j                  S rQ   )r   r   Zero)_s    rW   r   z4SIMDKernel._split_iteration_ranges.<locals>.<lambda>  s    EGGLL rY   c              3  t   K   | ]0  }t         j                  j                  j                  |      d k(   2 yw)r8   Nr7   r   r   r   r   s     rW   r   z5SIMDKernel._split_iteration_ranges.<locals>.<genexpr>  s*     I!177##--a0A5Is   68zfailed to set ranges  )rJ  r   r   rv   ry   r   )r  rv   r  r   r  r   ry   z(Callable[[list[sympy.Expr]], sympy.Expr])r   r7   r   r   r  r  r  r   r   r   statically_known_gtr  r  r   operator
itemgetter)groupsr   groupr  gr  r  return_getters_groupscurrent_grouplength_groupreturn_gettersr  size1size2r  r  r  r  s                 @@@@rW   _split_iteration_rangesz"SIMDKernel._split_iteration_ranges  sj    6g66$*+5B+R//WW:@-AQb-A
-34R[[^4	OO%		# 	#		$'	/2	5	 !## %	9LN$ "--dA6"))*@A#c)n49S9Sm,:
 "Q&M $c)n49S9Sm,: !1$s9~5":P:P)M2; ::i6 (%m4E$T9]+CDE"))%!%mU;%ma&7? %s9~5&--$//	-0NOA"F "((8K%	9N IyII 	
#I;ay9	
I 000K , .B4s   	H	HHc                   t         j                  j                  }t        |d         dk(  r\|j	                  |t
        j                  j                        s2|j	                  t        |      t        |d         |z        r|d   |gfS |S )z1Fill in the reduction numel of lengths if missingr8   r   )	r7   r   r   r   r   r   r   r   r2   )clsr  r   reduction_numelr   s        rW   prepare_split_iteration_lengthsz*SIMDKernel.prepare_split_iteration_lengths  s{     77##wqz?a00%''++N00f%gaj)O;
 AJ 122rY   c                n    | j                  |||      }	 | j                  ||       y# t        $ r Y yw xY wNTF)r  r  r  )r  r  r   r  s       rW   is_compatiblezSIMDKernel.is_compatible  sB     55fgW	''8 		s   ( 	44c                >   | j                   D ci c]  }|j                  |j                   }}| j                  s0|D ]+  }t	        |      st
        j                  j                  ||<   - g |j                         }| j                  ||| j                        S c c}w rQ   )r  re   rd   r  r/   r   r   r   r   map_kernel_groups_to_node_sizesr  )rh   r   rtr  re   r  s         rW   split_and_set_rangeszSIMDKernel.split_and_set_ranges  s     150@0@A""))RXX%AA$$  1&v.%*WW[[F6N1 $6==?#33FGT__UU Bs   Bc           
     F   t        |      t        |      k(  r!t        d t        ||      D              r || S | j                  ||      \  }}g t        j
                  j                   ||       }|D cg c]  }|D cg c]
  } ||       c} c}}S c c}w c c}}w )a  
        We may want to fuse `for i0 in s0*s1` into a tiled kernel with groups (s0, s1).

        To do this we need to split up the iteration space of i0 into something like:
            for i1 in s0:
              for i2 in s1:
                i0 = i1*s1 + i2
                ....

        This function matches and resplits lengths to the groups of
        this kernel to enable tiled + non-tiled fusions.
        c              3     K   | ]?  \  }}t         j                  j                  j                  t	        |      |z
        d k(   A ywr  r7   r   r   r  r2   )r   rM   r  s      rW   r   z=SIMDKernel.map_kernel_groups_to_node_sizes.<locals>.<genexpr>  s@      /
1 GG%%mA&6&:;q@/
s   AA)r   r   ry  r  r  r[  r\  )	r  r  r   r  r  r  r   fnsfns	            rW   r  z*SIMDKernel.map_kernel_groups_to_node_sizes  s    & w<3v;&3 /
GV,/
 ,
 w'',/,G,GPW,X)
)LY__22:z3JKL8MN,"H,NN,Ns   7	B BBBc                6    t        |t        j                        S rQ   )r   r   TMPrh   r   s     rW   is_indirect_indexingzSIMDKernel.is_indirect_indexing  s    "5$((33rY   c                   | j                  |      rydgt        | j                        z  }|j                  D ]g  }|| j                  vr| j                  |   }t        |j                  t              sJ ||j                  j                  xx   |j                  z  cc<   i t        j                  j                  j                  t        fdt        || j                  j!                               D              S )NFr8   c              3  F   K   | ]  \  }} |       |      k7    y wrQ   r   )r   	idx_range
iter_ranger  s      rW   r   z,SIMDKernel.is_broadcasted.<locals>.<genexpr>1  s,      
%	: Y8J#77
s   !)r  r   r  r   r   r   r   rx   r   r]   r7   r   r   r  anyry  r   )rh   r   index_numelsrn   entryr  s        @rW   is_broadcastedzSIMDKernel.is_broadcasted   s    $$U+sS--(( 	=FT222))&1Eell,?@@@++,<,	= 77##,, 
),\4;;;M;M;O)P
 
 	
rY   c                    t        |t              r)ddj                  t        | j                  |             dS | j                  | j                  |            S )a  
        Convert an index expr to a string that can be used in output code.
        e.g. a sympy expression "s2" may actually appear as "ks1" in the generated kernel.

        Index expressions often need to be passed in as arguments to the triton kernel.
        Rename_indexing and codegen_indexing keep track of the needed indices and add
        new parameters to the function signature.
        rb  r   rc  )r   rA  rd  mapindex_to_strr  rename_indexingr  s     rW   r  zSIMDKernel.index_to_str6  sN     eT"tyyT%6%6!>?@BBzz$..u566rY   c                n   | j                  |      }t        |t        j                  j                  j
                        }t        |j                  t        j                              s(t        |j                  t        j                              r3|j                  t        j                  j                  j
                        }t        |j                  t        j                              r|j                  t        j                        D ]g  }|j                  }t        |      dkD  st        d |D              s1|t        j                  j                  j                  |      i}t        ||      }i | j                  |      }t        |t               s|n|j"                  d   }| j%                  |      S )Nr   c              3  p   K   | ].  }t        |t        j                  t        j                  f       0 y wrQ   )r   r   r   PRECOMPUTED_SIZEr   s     rW   r   z.SIMDKernel.prepare_indexing.<locals>.<genexpr>W  s.      , #1tyy$2G2G&HI,s   46)r  r3   r7   r   r   precomputed_replacementsr   atomsr   floorceilingsubsr   r   lookup_precomputed_sizer   r   r   codegen_indexing)rh   r   ar   replacements
simp_indexs         rW   prepare_indexingzSIMDKernel.prepare_indexingC  sG    &&u-5!''"2"2"K"KLu{{5;;'(CEMM0J,KJJqww//HHIE u{{5==)*[[/ 	< ..w<!# ,$, ) %&qww'7'7'O'OPQ'R#SL&ul;E	< ++E2
 )X>JJOOTUDV 	 $$Z00rY   c                r    | j                   D cg c]  }|j                  r| j                  s| c}S c c}w rQ   )r  rl   r  )rh   ts     rW   active_range_treeszSIMDKernel.active_range_treesi  s3    ''
q~~AVAVA
 	
 
s   44c                4   t         j                  j                  j                  || j	                               }t        |j                  t              D ]  }|| j                  v si }| j                  |   j                         D ].  }t         j                  j                  j                  |      ||<   0 t        |      dkD  r5t        | j                  |   j                  |      | j                  |   _        | j                  |   j                           |S )Nr   r   )r7   r   r   r  rc   sortedr   rs   r   r   r  r   r3   r   r   )rh   r   symr  pss        rW   r  zSIMDKernel.codegen_indexingn  s    ww44T4??;LM$++5 	5Cd+++  "//4EEG TB'(ww'7'7'O'OPR'SL$T|$q(6@--c277$7D))#.3 %%c*224	5 rY   c                    t        d      )NzNYI: codegen_nan_checkr)  rk   s    rW   codegen_nan_checkzSIMDKernel.codegen_nan_check  s    !":;;rY   c                    t        d      )NzNYI: call_kernelr)  )rh   ra   r   s      rW   call_kernelzSIMDKernel.call_kernel  s    !"455rY   c              #     K   | j                   }| j                  }|rt        j                  ||      }t	        j
                  |      }|| _         || _        	 | || _         || _        y# || _         || _        w xY ww)z:Context manager to add an additional mask to tl.load/storeN)
_load_mask_load_otherr5   logical_andr6   _unwrap)rh   r;  r   rS  	prior_vals        rW   
mask_loadszSIMDKernel.mask_loads  sy     
 $$	??4/D!!$' 	)J#DO(D $DO(Ds   AA=A* A=*A::A=c                (   | j                   j                         D ci c]  \  }}||j                   }}}t        ||      }i }| j                  D ]7  }t        |j                        }t        ||di      t        ||di      z
  ||<   9 |S c c}}w )a\  
        This gets the stride of the index for each of the tiling variables
        (technically, it does it at index 0)

        For example, if
        xindex = x0 + 512*x1 + 1024*r0
        x0 = (xindex//512)
        x1 = (xindex % 512)
        r0 = rindex // 1024

        this function would return
        {xindex: 512, rindex: 1024}
        r8   r   )r   rp   r   r3   r  r1   ra   )	rh   r   kvindex_to_tile_indexesindex_in_tile_varsstrides
range_treer   s	            rW   get_strides_of_loadzSIMDKernel.get_strides_of_load  s     8<7L7L7R7R7T Utq!AFF U U'/DE** 	J":??3A#$6A?*"QFC GAJ	
  !Vs   Bc                \    t        |t              rt        t        | |            S  | |      S rQ   )r   tupler  )r  r   s     rW   _map_tuple_or_scalarzSIMDKernel._map_tuple_or_scalar  s'    eU#R((%yrY   c           	        g }t        t        | j                  j                  j	                                     }| j                  j                         \  }}}}| j                  j                         }t        j                  j                  j                  t        | j                  j	                                     }t        |      D ]2  \  }}||vr|j                  d       t        j                  j!                  |      }	t        j                  j                  j                  |	      }
|
|kD  rwt#        t$                  }d}||   D ]M  }t'        |t(        t*        f      r|j-                  d|        |dz  }3|j-                  |j.                         O t        |      |z  }n|
}t        j                  j1                  |      }t3        |      }|j                  ||z  dt5        ||k        z   z         5 t7        |      S )a+  
        Try the best to estimate the total size (in bytes) of the
        kernel's inputs and outputs, which is used for estimating the memory
        throughput of this kernel. This information is used for checking how
        far we are from the peak memory bandwidth. It's important that
        we want to avoid overestimating the sizes of the inputs and outputs,
        because it can wrongfully give us a very large memory traffic value,
        which may be even larger than the theoretical bandwidth and thus
        become very misleading. This is particularly problematic for cases
        where we slice some inputs. In those cases, we should only count
        the size of the "slices" instead of the original inputs, because
        only the slices contribute to the real memory traffic.
        r   no_index_dep_r8   )r   r4   r   inplace_buffersr   python_argdefsr  buf_accessesr7   r   r   r   r2   r  r<  r   	get_numelr   r   r   r!   r"   r   r   	get_dtyper,   r   r&  )rh   nbytesninplace_argsr  	call_argsr  	out_numelrJ  r   	arg_numelbuf_sizerO  no_index_dep_countdeprd   r+  
dtype_sizes                    rW   estimate_kernel_num_bytesz$SIMDKernel.estimate_kernel_num_bytes  s    F499#<#<#C#C#EFG!YY5579a}}113 GG$$..}T[[=O=O=Q/RS		* 	MFAs ,&a ))#.Iww''11)<H)# %S/+%&"', /C!#'9:m4F3G$HI*a/*CII./ Gy0 GG%%c*E'.JMM%*,CM8I4J0JKL9	M: 6{rY   c           	     &   t        | j                  j                        dk(  rEt        | j                  j                        dk(  r#t        | j                  j                        dk(  ry| j                  j                         \  }}}}d}|D ]F  }t        j                  j                  |      }|s&|j                         }	t        |	j                        dk(  sOt        |	j                  D 
cg c]
  }
|
dk(  s	|
 c}
      dk(  r|t        j                  |	j                        }||}||k7  st        d| dd| d	| z         }t        j!                  |       |D cg c]m  }t        j                  j                  |      rJt        j                  t        j                  j#                  |      j                         j                        ndo }}|D cg c]Z  }t        j                  j                  |      r7t        j                  j#                  |      j                         j                  nd\ }}|D cg c]@  }|t        j                  j$                  v rd
n|t        j                  j&                  v rdndB }}|D 
cg c]  }
|
j(                   }}
t        d| d| d| d| d| dz         }t        j!                  |        y t+        d| d      }t        j!                  |       yc c}
w c c}w c c}w c c}w c c}
w )zr
        Print message if the kernel have mixed layout inputs.
        Only care about 4D tensor for now.
        r8   r   N   r   zExpected stride order z, but found stride orderr  z for kernel 
GraphInputIntermediateBufferz  param names z
  buf names z
  strides z	
  sizes z
  sources 
z%All the inputs for the triton kernel z have uniform layout)r   r   input_buffersoutput_buffersr  r  r7   r   try_get_buffer
get_layoutr  r   get_stride_orderstrider&   logwarning
get_buffergraph_inputsname_to_bufferra   r%   )rh   r  argdefsr  
_signaturer  uniform_stride_orderarg_namebuflayoutrM   stride_ordermsgra   stride_order_list	size_listsource_listargdef_namess                     rW   warn_mix_layoutzSIMDKernel.warn_mix_layout  s    		''(A-DII,,-2DII--.!3
 ,0II,D,D,F)J#! 0	H''((2C^^%F6;;1$6;;9a!q&9:a?!226==A'/+7()\9%01E0FF^_l^<}EFC KK$ %.) ! 7711$7 ++GG..t4??AHH "	")% ) %.	! ! 7711$7 **40;;=BB!"!I ! %.# !	  177#7#77 %  177#9#99 2!	"#K # 5<#<qAFF#<L#<%(nYK|\m[no&ykk]"MNC KK$a0	b 3K=@TU
 	C[ :)!# $=s'   -
K:
8K:
"A2K?AL?AL	
Lc                   t        j                  ||d|      }d| _        t        j                  | j                  j
                  |      }t        j                  ||      }d| _        t        j                  ||      }t        j                  ||      }t        j                  ||d|      }t        j                  |||f      S )Nr&  FT)r5   	reductionr  
index_exprr  r  truedivsubmulr6   r  )	rh   r+  r   sum_rnumelmeandxdx2m2s	            rW   welford_reduce_fallbackz"SIMDKernel.welford_reduce_fallback=  s    }}UE5%8 % = =uE{{4( $WWUD!ggb"o]]5%4!!4V"455rY   c                    t        j                  ||d|      }t        j                  ||      }t        j                  |      }t        j                  ||d|      }t	        j
                  ||f      S )Nmaxr&  )r5   r*  r-  expr6   r  )rh   r+  r   vmaxr-  r8  vsums          rW    prepare_softmax_twopass_fallbackz+SIMDKernel.prepare_softmax_twopass_fallbackI  s\    }}UE5%8ggeT"ggcl}}UE5#6!!4,//rY   c                    t         rQ   r)  rk   s    rW   codegen_kernelzSIMDKernel.codegen_kernelP  r-  rY   c                     y rQ   r   rk   s    rW   r  zSIMDKernel.codegen_bodyS      rY   c                     y rQ   r   )rh   r  s     rW   r   z)SIMDKernel.codegen_iteration_ranges_entryV  r?  rY   )NNNN)r  dict[str, sympy.Expr]r  rC   r   r   r   Optional[bool]r!  rB  r  Optional[dict[str, sympy.Expr]]ry   rz   r   )r+  torch.dtypery   rs   )ry   rD  r   r{   )r   r   r  r|   rl   r|   r  rA  r  r|   ry   list[IterationRangesRoot])r   zdict[str, str]ry   rz   )rO  Sequence[sympy.Expr]ry   rz   )ra   rs   r   rv   r   r:   ry   rz   )ry   ru   )rJ  r   ry   rs   )ry   z	list[str])r   rv   ry   rv   )r   rv   r
  rx   ry   rv   )ry   z'contextlib.AbstractContextManager[None])r   rv   ry   rt   )r  Iterable[sympy.Expr]r   Sequence[Sequence[sympy.Expr]]ry   zStuple[list[list[sympy.Expr]], list[list[Callable[[list[sympy.Expr]], sympy.Expr]]]])r  rG  r   rH  r  rv   ry   rH  )r  rG  r   rH  r  rv   ry   r|   )r   rH  ry   list[list[sympy.Expr]])r  rF  r   rH  ry   rI  )r   rv   ry   r|   )r   rv   ry   rs   )ry   rE  )r   rv   ry   rv   r   rQ   )ra   rs   r   zOptional[IRNode]ry   rz   )r;  zUnion[str, OpsWrapper]r   Union[int, float]ry   zIterator[str])r   rv   ry   ru   )r  r   )@r~   r   r   r   pexprr   __annotations__r  r`   r   r*   r   r'  r,  r0  r2  r  rK  r  rP  rT  r  r  rc   r_  re  rj  rm  r	  r  rt  r  r  staticmethodr  classmethodr   r   r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r(  r5  r;  r=  r  r   r   r   s   @rW   rw   rw   s  sE    */E&.&&!OT! /38<9=9=/.%/. %/. ,	/.
 (6/. )7/. 7/. 
/.b J   J"2 H H5+5 5 	5
 &5 5 
#5n-*
R'
'
$>>':>	>':	(0
 L1$L1/ML1
L1 L1\ 
 ',ggkk	$ 0 $	
 
( & 
 ',ggkk	$ 0 $	
 
 
V5
V	
V O$O 0O
 
 O O84
,7$1$1 
$1L

"<6 )*)3D)	) )&0  
=~EN
60"rY   rw   c                     e Zd ZU dZeZded<   d Zd ZeZ	eZ
d Z	 	 d#dZe	 	 	 	 	 	 d$d       Zd%d	Z	 	 	 	 d&d
Zd Zdd	 d'dZd Z	 d(	 	 	 	 	 	 	 	 	 	 	 d)dZd Ze ej0                  d      d*d              Ze	 	 	 	 	 	 d+d       Ze	 	 	 	 	 	 d,d       Ze	 	 	 	 	 	 	 	 d-d       Ze	 	 d.d       Ze	 	 	 	 	 	 	 	 	 	 d/d       Ze	 	 	 	 	 	 	 	 d0d       Ze	 	 	 	 	 	 	 	 d1d       Z ee!jD                  jF                  df	 	 	 d2d       Z$ee!jD                  jF                  df	 	 	 d3d       Z%d Z&d4dZ'd(d Z(d! Z)d" Z*y)5SIMDSchedulingzo
    Single Instruction Multiple Data parent class used for fusion across
    multiple different backends.
    z	type[Any]kernel_typec                &    t        d |D              S )Nc              3     K   | ]6  }t         j                  j                  j                  t	        |             8 y wrQ   r  r   s     rW   r   z*SIMDScheduling.group_fn.<locals>.<genexpr>c  s*     PQQWW%%..}Q/?@Ps   <>)r  rl  s     rW   group_fnzSIMDScheduling.group_fnb  s    P%PPPrY   c                   t        |t        j                        st        |t        j                        r t        j                  j                  ||      S |j                  \  }\  }}|j                  \  }\  t        ||      }|j                         r)|j                         s|j                         rA |d       n8|j                         r(|j                         s|j                         r |d       |j                         r,|j                         r|k(  xr |k(  }|s |d||       |S |j                         s|j                         s|k(  r|k(  s|j                         s |d||       y|j                         D ]`  }|j                         r nN|j                         |j                         z  s7|j                  \  }\  }	}
||	k(  r||
k(  rT |d||	||
        y ||fD ]  }|j                         s y | j                  |j                         ||      }| j                  |j                         ||      }| j                  |j                         |j                         z   ||      }t        j                  j                  rVd}t!        |      dkD  r%t!        |      dkD  r||cxk(  xr |k(  nc }n||k(  }nt!        |      dkD  r||k(  }|s |d|||       yy|j                         s|j                         r|d	k(  rd	k7  sJ |z  k(  rt#        fd
|j                         D              s	 |d       yt        j                  j$                  r\|j                         sLt'        | j                  |j                         |      j)                               |d	fd	ffv }|s |d       |S y|k7  r |d       |k(  S |j                         r|j                         rJ | j+                  ||      S )z
        Hook called by Scheduler to determine if the Triton backend
        can fuse node1 and node2.  These nodes might already be
        FusedSchedulerNodes.
        z&Split scan cannot fuse with reductionsz1numel/rnumel mismatch (reduce) (%s, %s), (%s, %s)z5numel/rnumel mismatch (non-reduce) (%s, %s), (%s, %s)Fz:numel/rnumel mismatch prologue mismatch (%s, %s), (%s, %s)Tr   ztiling mismatch (%s, %s, %s)r8   c              3  j   K   | ]*  }t         j                  f|j                                , y wrQ   )rw   r  
get_ranges)r   r   numel2rnumel2s     rW   r   z*SIMDScheduling.can_fuse.<locals>.<genexpr>  s1       ,,fg->Os   03z"nodes numel/rnumel incompatibilityzinvalid tiling for reductionznodes numel incompatibility)r   r   ForeachKernelSchedulerNodecan_fuser  r)   is_split_scanrl   is_template	get_nodesused_buffer_namesget_buffer_namesselect_tilingr   rT    tiling_prevents_pointwise_fusionr   r    tiling_prevents_reduction_fusionr  r   can_fuse_horizontal)rh   node1node2r  numel1rnumel1whyreduction_can_fuser   	pro_numel
pro_rnumelr   tiling1tiling2tiling3condis_reduction_tiling_validrX  rY  s                    @@rW   r[  zSIMDScheduling.can_fusee  s    eYAABj977G
 77@@NN${{FG${{FGu% )<)<)>!!#<=  "5+>+>+@!!#<=E$6$6$8!'6!1!Hg6H%G &%!!#E,>,>,@f$G);((*O ! !& 1 )++-!  $557%:P:P:RR$59ZZ22Iz &) 3:8M \ & ) ' * $)#)& U^  ==? 
 (():FGLG(():FGLG((!EOO$55vwG }}==w<!#7|a'&'<W<&'1\A%"g-D6	 !!!#(:(:(<a<GqL00')) "__.  <= MMBB!--/05**5??+<fELLN1  !,1- 5:;4412V##!!#E,>,>,@@@''u55rY   c           
     z   g t        t        j                            t               t               d fd}fd}fd}fd}t        j                  fd       }fd}	|D ]  }
|
v rj                  |
        ||
      r? |	|
      r |       5  	 d d d        r ||
      sxs t              nd  ||
       ` ||
      r" |       5  j                  |
       d d d        t        d d d	|
j                  d
           S # 1 sw Y   |xY w# 1 sw Y   xY w)Nc                b    | j                   \  }\  }}|k(  xr |k(  xs |z  k(  xr |dk(  S Nr8   r  r   r  
node_numelnode_rnumelrd   r0  s       rW   fits_in_main_bodyz@SIMDScheduling.generate_node_schedule.<locals>.fits_in_main_body  sH    +,77(A(
K%'AK6,A efn,A1ArY   c                N    | j                   \  }\  }}|k(  xr |dk(  xr dk7  S rt  ru  rv  s       rW   fits_outside_reductionzESIMDScheduling.generate_node_schedule.<locals>.fits_outside_reduction  s4    +,77(A(
K&K;!+;K!KrY   c                \    | j                   j                  D ]  }|j                  v s y yr  )read_writesreadsra   )r   readcurrent_loop_buffer_usages     rW   expect_improved_memory_usagezKSIMDScheduling.generate_node_schedule.<locals>.expect_improved_memory_usage  s1    ++  99 99  rY   c                   j                  |        j                  |        j                  | j                  j                  D cg c]  }|j
                   c}       | j                         rt        | t        j                        rrt        | j                  t        j                        rNt        | j                  j                  t        j                        s j                  | j                                y j                  | j                  j                   D cg c]  }|j
                   c}       y c c}w c c}w rQ   )r   r   updater}  r~  ra   rl   r   r   SchedulerNoder   r   ComputedBufferdataScanget_namewrites)r   rM   r  donenode_schedulenot_ready_yet_nodess     rW   schedule_node_in_loopzDSIMDScheduling.generate_node_schedule.<locals>.schedule_node_in_loop  s    HHQK  #%,,amm>Q>Q-Raff-RS
  q)"9"9:qvvr'8'89"166;;8#''

5)00!--BVBV1WQ!&&1WX .S 2Xs   D; E c               3  L  K   rd   t         u rj                          nj                  t               r1j	                  t               j	                  dz   t                d d  j                  t                j                           j                          y w)Nr  r8   )r@   popr   r?   insertclear)r  maybe_split_indexr  r  s   rW   end_current_reduction_loopzISIMDScheduling.generate_node_schedule.<locals>.end_current_reduction_loop  s      r!2o!E!!#$$%56 $$%68HI$$%6%:OL$(!  1%%'%++-s   B!B$c                    dk(  ry| j                   z  sy|rt        |d   t        t        f      rJ t	              S )Nr8   Fr  )	ancestorsr   r@   r?   r|   )r   r  r  r0  s     rW   #requires_closing_previous_reductionzRSIMDScheduling.generate_node_schedule.<locals>.requires_closing_previous_reduction#  sN    {&7 b!O5E#F*   +,,rY   zunexpected group: (r   z) != r8   )
r   r   r'   r  r  r   r   r   r*  r  )rh   r   rd   r0  ry  r{  r  r  r  r  r   r  r  r  r  r  s     ``       @@@@@rW   generate_node_schedulez%SIMDScheduling.generate_node_schedule  sW   #%)5568 0:|5?\!+/		L		Y" 
	"	"	. 
#	.	-  	Dt|HHTN &6t]K35  -5QRV5W(9(OS=O% )-%%d+'-/1 /!((./ / *)%6(%

1O -	4 ' / /s   (D%&D1%D.	1D:	c                b   |j                         }t        j                  j                  j                  j
                  rt        |      }nd}t        |d       j                  \  }\  }}| j                  |||      }t        j                  d|       | j                  t        ||||            S )zK
        Given a set of pre-fused nodes, generate a Triton kernel.
        Nc                4    t        | j                               S rQ   r   rl   rM   s    rW   r   z-SIMDScheduling.codegen_node.<locals>.<lambda>V  s    c!..:J6K rY   r   zSchedule:
 %s)r^  rR   rS   r   rT   coalesce_tiling_analysisr   r7  r  r  schedule_logdebugcodegen_node_schedulerC   )rh   r   r   coalesce_analysisr  rd   r0  r  s           rW   codegen_nodezSIMDScheduling.codegen_nodeI  s     04~~/???!!((AA 9$ ? $ ,KLRR?E633E5&I+];))}eV=NO
 	
rY   c                   t        j                  t         j                        j                  }t	        |       sy|D cg c]0  }|j                         r|j                         j                         2 }}t        d |D              syt        j                  j                  j                  | |       |D ],  }t        j                  j                  j                  ||       . yc c}w )NFc              3  2   K   | ]  }t        |        y wrQ   )r+   )r   r  s     rW   r   z8SIMDScheduling.can_use_32bit_indexing.<locals>.<genexpr>s  s     FD)$/Fr%  T)rR   iinfoint32r7  r+   has_tensor_outputr  storage_sizer   r7   r   r   	guard_leq)rd   buffersint_maxr   	buf_sizesr  s         rW   can_use_32bit_indexingz%SIMDScheduling.can_use_32bit_indexing_  s     ++ekk*..%e, 
$$& NN))+
	 
 FIFF 	
""5'2 	6DGG&&tW5	6
s   5C$c                   |j                   }| j                  ||j                  |j                  |j                        \  }}| j                  ||g||d      }|D ]  }| j                  ||        t        j                  |       |D ]  }t        j                  |      5  |j                         }d d d        | j                  ||      }t        j                  j                  rt!        ||       t"        j%                  d|       ||_        t)        |      |_         ~t+        |      dkD  rt        |      }	n|\  }	t        j                  |	      5  |j-                         D ]  }
|
j/                           	 d d d        | j1                  |       |	j3                  |	j&                         t        j4                  r|	j7                          t        j8                  r|	j9                  |d   j&                         t        j:                  xj<                  |	j<                  z  c_        t        j:                  xj>                  |	j>                  z  c_        t        j:                  j@                  jB                  rt        jD                  r|d   jF                  jI                         }|j-                         D ]  }
|
jK                         }||vr|
jL                  J |
jL                  jO                         }|CtP        d   dxx   dz  cc<   t        j:                  j@                  jS                  d|jT                  d| d	        | jW                          y # 1 sw Y   xY w# 1 sw Y   xY w)
N)r  r  z+Generating kernel code with kernel_name: %sr8   r   inductorintermediate_hookszrun_intermediate_hooks(r   r   ),r  get_tiling_and_scoresrd   r  r  create_kernel_choices!codegen_node_schedule_with_kernelr>   merge_workspaces_inplacer7   set_kernel_handlerr=  define_kernelr   traceenabledr0   r  r  r  r   r   scheduler_nodesmark_runcodegen_commentr  nan_assertsr  r(  r   removed_buffersinplaced_to_removewrapper_codesupports_intermediate_hooksgenerate_intermediate_hooksr   live_output_buffersr  r   get_origin_noder   	writelinera   free_buffers_in_scheduler)rh   kernel_featuresr  r  tiling_scorekernelsrf   src_coder  final_kernelr   	live_outsra   origin_nodes                 rW   r  z$SIMDScheduling.codegen_node_schedule}  s   '55#99!!++--	 
 ,,H(<H

  	JF22=&I	J,,W5 	3F%%f- 3!0023,,X}fMK||##7! IIC[Q!,F(2F	3  w<!&w/L%O\!!,/ 	 '779   	  	]+  !9!9:**,!!(()?)?@	<#?#??	""l&E&EE" GG  <<22  
;;=I'779 
}}y(yy,,,"ii779*Z()=>!C>GG((221+2B2B1ERvQO
 	&&(k3 3&	  	 s   !M&M+M(	+M5c                (     | j                   |i |gS rQ   )rQ  )rh   r  kernel_argskernel_kwargss       rW   r  z$SIMDScheduling.create_kernel_choices  s)     D
 	
rY   c           	     <   |5  t        j                         }i }|D ]  }|t        u r |j                  |j	                                +|t
        u r|j                          D|j                          |j                  |j                               }|j                  t        j                  |j                  j                  |      j                                       |j!                  |j#                                |D ]  }|t        u r |j                  |j	                                +|t
        u r|j                          Dt%        |j                         |j                  |j                               }|j'                  |        	 d d d        y # 1 sw Y   y xY wrQ   )r  	ExitStackr?   enter_contextr  r@   closedecide_inplace_updater  rW  r  rZ  fromkeys_bodyindexing_from_argsr   rP  keysr$   r   )rh   r  rf   stackall_indexingr   r   s          rW   r  z0SIMDScheduling.codegen_node_schedule_with_kernel  sS    	-((*EL & ++''(@(@(BC_,KKM..0!'!<!<T__=N!OJ '' JJ99*ELLN $$\%6%6%89 & 	-++''(@(@(BC_,KKM 6djjA!'!<!<T__=N!OJLL,	--	- 	- 	-s   FFFFonly_gen_src_codec               f
   |j                   \  }\  }}|dk(  sJ |j                  j                  |j                        \  }}	i }
|j                         }g }|D ]  }|j	                         }|j                  |       ||z  s*t        |      dk(  sJ ||
t        t        |            <   |j                  j                  t        t        |                   g } t        |      dk(  sJ |5  |s|g|D ]  }|j                            |	       }|j                  d      5  |D ]0  }|j                  |j                  |j                                      2 |j                   j#                  t%                      ddd       |j&                  j)                         D ]+  \  }}d| d}|
j+                  |j-                         g       x}s0t/        d |D              }t1        j2                  d|       5  |j                  |      5  |D ]  }t        |j	                               dk(  r<t        |      dk(  r.t5        |      r#|xj6                  |j	                         z  c_        |j                  |j                  |j                                       |j                   j#                  t%                      ddd       ddd       . 	 ddd       t9        t:              s$|j=                  d	       |j=                  d
d       t?        j@                  |      5  |j&                  jC                         D ]  }d| d}|j=                  |d        |j                  d      5  t9        |t:              r|}n|j=                  d       |jD                  }ddd       g |||}t0        jF                  rH|jI                         dz  }|jK                          d d|jM                  |      jO                          }|rcddd       S | jQ                  ||      }t0        jR                  jT                  rtW        ||       ddd       | jY                         |j[                  |j                         t>        j\                  xj^                  |j^                  z  c_/        t>        j\                  xj`                  |j`                  z  c_0        | jc                          y# 1 sw Y   nxY w# 1 sw Y   =xY w# 1 sw Y   nxY w# 1 sw Y   CxY w# 1 sw Y   xY w# 1 sw Y   xY w)z
        Codegen a triton template

        If `only_gen_src_code` the src code will be returned instead of codegen'd into the wrapper
        r8   r   z<STORE_OUTPUT>Nz<LOAD_INPUT_>c              3  <   K   | ]  }|j                           y wrQ   )can_codegen_without_upcasts)r   p_ns     rW   r   z2SIMDScheduling.codegen_template.<locals>.<genexpr>!  s      5>A7795   ztriton.codegen_upcast_to_fp32z<DEF_KERNEL>z	<ARGDEFS>F)strictg    eAr  )2r  r   make_kernel_renderr_  r`  r   r   r   iterprologue_fused_inputsr   r  set_subgraph_bodyr   r  rW  cse
invalidater   named_input_nodesrp   r   r  r   r   patchr   #prologue_fused_inputs_preserve_zeror   rs   finalize_hookr7   r  r  codebenchmark_kernelr  imports_for_benchmark_kernelcodegen_kernel_benchmarkgetvaluer  r  r  r0   r  r  r   r  r  r  )rh   template_nodeepilogue_nodesprologue_nodesr  r  _numelr0  rf   renderbuf_name_to_prologue_grouptemplate_readsprologue_groupprologuenamesr   partial_code
input_namebuffersubgraph_namecan_codegen_without_upcastprologue_noder  r  num_gbr  s                             rW   codegen_templatezSIMDScheduling.codegen_template  s    ,11FF{{&++>>}?Q?QR%'"&88:& 	$H--/E!!(+~%5zQ&@N*4U+<=,,00d5k1BC!#	$ >"a''' ,	@$ +<^< $DMMO$ "8L))*:; 4* QDLL!<!<T__=N!OPQ

%%jl34
 '-&>&>&D&D&F @"
F".zl! <%?%C%COO%r& >  25 5ES5 2.  7=W9W @ $55mD @1? "$'(F(F(H$IQ$N(+N(;q(@'CM'R(.(R(R,9,J,J,L)*(R !. 5 5$*$?$?(5(@(@(B%&!"" #JJ11*,?!@@ @@,	@\ ,,&&~6&&{5&A !!&) 	T %66;;= H
".zl! <**=*GH ))*:; 1lC0+H ../?@+00H1 NnMmMnMM&&99;cA::<=Rj66v>GGIJL  !1	T 	T4 ,,X}fMK||##7{S;	T> 	]+;(:(:;	6#9#99	""f&?&??"&&(_4 4&@ @@ @3,	@ ,	@t1 1	T 	Ts   ,5T!AS&:AT)T5T B1S3	8T  TA
T''1TA+T'9T'&S0	+T3S=8T  T
TTT$	T''T0c                    t         j                  j                  j                  t         j                  j                  j                                y rQ   )r7   r   r  r  
device_opssynchronizerk   s    rW   codegen_synczSIMDScheduling.codegen_synch  s-    	&&qww'9'9'E'E'GHrY   c           
        ddl m} |D cg c]  }|j                          }}i i }
}	t        ||      D ]u  \  }}t	        |d       j
                  \  }\  }}| j                  |||      }| j                  |||      }||||f|
|<   |j                  |t        |||      |       |	|<   w |j                  || ||	|
      }t        j                  dt        |      |D cg c]  }t        |       c}       g }|D ]>  }|D cg c]  }|j                          }} |||      }t        ||      D ]  \  }}| j                  |
|   d	   |j                  |	|                |	|   }|
|   d	   }|sIt!        j"                  |      5  t%        j&                  |      D ]  }|j)                           	 d d d        t         j*                  xj,                  |j,                  z  c_        t         j*                  xj.                  |j.                  z  c_         |j1                         }|j3                  |||f       A |S c c}w c c}w c c}w # 1 sw Y   xY w)
Nr8   )ComboKernelc                4    t        | j                               S rQ   r  r  s    rW   r   z;SIMDScheduling.generate_combo_kernel_code.<locals>.<lambda>x      #ann>N:O rY   r   )r  optimize_mask)r   triton_schedulingcustom_algorithm
kernel_mapnode_info_mapz1ComboKernels: %d nodes partitioned into %s groups)enable_autotunemixed_sizesr   )triton_combo_kernelr	  r^  ry  r7  r  r  ra  create_triton_kernelrC   horizontal_partitionr  r  r   r  create_sub_kernelr7   r  rB   
only_nodesr  r   r  r  r=  r   )rh   subkernel_nodescustom_part_algorithmr  r  r  r	  r   fused_node_listssubkernel_mapnode_schedule_mappnr   r  rd   r0  r  r  
partitionspkernel_code_list
node_grouprf   	subkernelr  s                            rW   generate_combo_kernel_codez)SIMDScheduling.generate_combo_kernel_codek  s    	59HIDNN,II+-r(_.>? 		IB!$U0O!P!V!VAv 77ufMM''ufEF$165&$Hb! + @ @+M5&I"-o !A !M"		 !55!"2$+ 6 

 			? '(SV(	

 $ 	DJ=GHT 0HH  /'F
 !-=> K	E66%b)!,,,]2->? *"-	 1" 5a 8(--i8 ,$6$A$A-$P ,D MMO,, ''9+D+DD'**i.J.JJ*K ,,.H##Xvz$BC-	D.  c J. )  I, ,s   I II+IIc                6   |j                         }|j                  }|j                  }t        j                  dkD  xs t        j                  dk(  xr |}| j                  ||||      }|D ]  \  }}}	| j                  ||g|      }
t        j                  j                  rt        |j                  |
       | j                  |g       t        j                  d|
       |j                  t        j                   j"                  |
        | j%                          y )Nr8   z"ComboKernels: generated kernel %s.)get_subkernel_nodesuse_custom_partition_algor  r   combo_kernel_allow_mixed_sizesr#  r  r  r  r0   snodesr  r  r  r  r7   r   r  r  )rh   combo_kernel_noder  r  r  r  r   r  rf   r  r  s              rW   codegen_combo_kernelz#SIMDScheduling.codegen_combo_kernel  s   +??A 1 K K+;;;;a? 
11Q6P;P 	  ::2O[
 $4 		BHfa,,X8I7JFSK||##7%,,k   "3!45II:KHqww33[A		B 	&&(rY       c           
        
 dk(  }d 
fd}|j                         \  }
t        |      dk  rt        
      dk  st        |
z         rg S |j                         \  }
 |||r|n
|j                  |            }|D cg c]?  }t	         j                  |j                  |      |j                  |j                        A }	}|	S c c}w )Nr8   c                d   t        |j                        t        |      k(  sJ d|j                  d|       |j                  |j                  g}t	        d t
        j                  j                  |      D              sJ t
        j                  j                  |      D cg c]:  }|j                  t        j                  j                  vrt        |t              r|< }}t        |j                  D cg c]  }|j                   c}      }dd}t        j!                   ||      g|       dd      g}|D ]  }t        j                  j"                  j%                  |j&                  |j                        }	t        |	      t        |      k(  sJ 	 |	j'                  d      dz   }
|
t        |      k(  rt	        d	 |	|
d
 D              r	  ||d
|
        |||
d
       f}t        j                  j"                  j+                  t-        d t/        ||	      D                    }|j                  |v r|dz  }t        j1                  |d         r|dz  }t        j1                  |d         r|dz  }t        j                  j"                  j+                  |t-        t        j                  |            z
        dk\  s|j3                  t        j!                   ||d
|
        |||
d
       g      ||j                                |S c c}w c c}w # t(        $ r Y w xY w)zX
            Compute tiling candidates by dividing up the iteration ranges.
            zrw.range_vars=z ranges=c              3  H   K   | ]  }t        |t        t        f        y wrQ   )r   r    r!   )r   r	  s     rW   r   zHSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<genexpr>  s$       3G 45s    "c                f    t         j                  j                  j                  t	        |             S rQ   r  )r  s    rW   collapse_rangeszNSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.collapse_ranges  s"    ww''00v1FGGrY   noner   )r  ra   scorer8   c              3  &   K   | ]	  }|d k(    ywr  r   r   s     rW   r   zHSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<genexpr>  s     ;a16;s   Nc              3  2   K   | ]  \  }}|d k7  s|  ywr  r   )r   r  r  s      rW   r   zHSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<genexpr>  s       "!-vST"s   r   r  r2  ra   )r  rF  ry   rv   )r   
range_varsr~  r  r   r  r[  r\  ra   r7   r   r  r   r    r   CandidateTilingcreate_partial_tilingr   stride_hintsr   
ValueErrorr   r2   ry  is_good_sizer   )is_pointwiser  rwdep_sourcesr	  depswrite_namesr0  tilingsr  splittiled_groupsr2  r  r  reduction_rangess                rW   tile_rangesz5SIMDScheduling.candidate_tilings.<locals>.tile_ranges  s    r}}%V4S8H	&6SS4 88RYY/K $??88E    %??88E88177#:#::sI. D  %"))%D3chh%DEKH
  44(01<  G  4''**77		2==Q7|s6{222
#MM!,q0EF+ ;756?;; ! < $F6EN3#F56N3  ((22! "14VW1E" 
 88{*QJE"//Q@QJE"//Q@QJE GG$$..ioofFV.W XX 
 NN'#&#<#<$3F6EN$C$3F56N$C!" !0$ #(!$
Q4l N[ &E: " s$   $?L8L"L"=L""	L/.L/r5  )r<  r|   ry   list[CandidateTiling])	rW  r   r   "pointwise_or_reduction_read_writesr7  complete_partial_tilingr  r2  ra   )r  r   rd   r  r<  rE  pointwise_rangespartial_tilingsr  full_tilingsrD  s   `  `      @rW   candidate_tilingsz SIMDScheduling.candidate_tilings  s     '!+\	| .2__->** !Q&$%*$%58H%HII .2__->**% ,2B33LA
 *	
  22MM5/ ll[[	
 	
 	
s   ACc                    g dt        |       d }ddgdt        |       }t        g t        ||      t        ||            S )zK
        Create a tiling dict from pointwise and reduction splits.
        )rK   rL   rM   NrN   rO   )r   r   ry  )r  	pw_tilingreduction_tilingpw_prefixesreduction_prefixess        rW   create_tilingzSIMDScheduling.create_tilingB  sY     &s9~o&78#U^,Cc2B.CDVc+y)VC0BDT,UV
 	
rY   c                >    | j                  |r|ng |s|      S g       S rQ   )rR  )r  r  r<  s      rW   r8  z$SIMDScheduling.create_partial_tilingO  s0       "F&F
 	
,.
 	
rY   c                    t        |j                               }d|v }||z  }|t        |      z  g}|r||fn||f} | j                  | S )zb
        Given a tiling for only pointwise or reduction dimensions, adds the missing one.
        rM   )rA  r   r2   rR  )	r  r  rd   r  splitsr<  total_numelmissing_tilingtiling_argss	            rW   rH  z&SIMDScheduling.complete_partial_tilingZ  sf     fmmo&f}o-%f(==> )5V^$>6:R 	 !s  +..rY   c           
     z   |dk(  }t        t        t        t        j                  f             }t        j                  |      D ]  }t        |t        j                        s|j                         }|st        |d         dk(  rC||rdnd   }|g}	|j                  j                         D 
cg c],  }
t        |
t              rt        |
j                        dkD  r|
. }}
|D ]  }
g |
j                  j!                         }t        j"                  j$                  }t&        j(                  j*                  }t-        |      D ]!  \  }\  }}||z  }|j/                  ||      s! n |j1                  ||      sdz   }|r|d| n||d }g }|D ]  \  }}t3        j4                  |
j6                  |      }t9        d|j;                  t<              |j;                  t>              z   t        |            }t3        j@                  ||||      }||d   n|g}|jC                  |        |D cg c]F  }t&        j(                  j*                  j1                  |t        j"                  j$                        s|H }}t        |      dkD  s|	jE                  |        |	D ]z  }t9        dt        |      tG        d      z
        }|dz   }tI        |d|       }|ftK        ||d       z   }|jM                  | jO                  | jQ                  ||      ||             |  tS        |t        d      }|S c c}
w c c}w )z
        Creates N-dimensional tiling candidates, attempting to simplify loads/stores
        by tiling the kernel into higher dimensions.

        Returns a list of tilings ranked by dimensionality.
        r8   r   Nr   T)r   reverse)*r   rZ  rs   r   Exprr@   filterr   r   r  rW  r   r}  reads_and_writesr    r  rp   r   r   r7   r   r   r<  statically_known_geqr   r9   get_subexpr_involving_symbolr   r7  r  r   r   match_mod_div_block_exprrM  r   rX   r2   r  r   rH  r8  r  ) r  r  pointwise_numelr  r<  rA  r   node_rangesranges_to_tilenode_tilingsr	  memory_depsall_var_rangespointwise_vars_numelr   pointwise_end_idxvarrd   reduction_start_idxrc   index_tilingr   num_dimsmatch_resultdimsdimnode_tilingnum_leading_dimsfirst_trailing_dimcollapsed_leading_dimcollapsed_splitsranked_tilingss                                    rW   get_nd_tilingszSIMDScheduling.get_nd_tilingso  sy    '!+T#uzz/235#**=9 _	DdI$;$;< //+KCA$71$< )lBN*+L  ++<<>c9-#cjj/A2E K 
 # 96 "73::#3#3#5!6',ww{{$77++7@7P 3%|U(E1(44,o   77(/  '8!&;# $ ##7$78'(;(<=   "", .JC/LL		3E
  #H-O0LLN+ H $7#O#OsE8$L /;.F<?UGD ''-%..  , 77++CCCU     |$q( ''5s96x  , #&q#k*:]1=M*M#N %5%9"(5kBUCU6V(W%$9#;e 2 34? $  //112BLQ''e_	F  
 qn s   .1L3,AL8c                   j                   sdnj                   j                  j                  j                  j                  j                  j                  j
                  }D cg c]  }||   	 c}D cg c]  }||   	 c}t        j                  t              k(  fd       t        j                  t              k(  fd       i g }	 	 	 d	 	 	 	 	 	 	 df	d}|j                   |d       |d      f       r$|j                   |fdd       |d      f       j                  j                         z  }	|	D ]%  }|j                   ||fd       |d      f       ' t        d	
      d	k(  rBdk(  r=t        j                  |	d      D ]$  }
|j                   ||
d       |d      f       & g }|D ]b  \  \  }}\  }}t        | j!                  ||      t#        |      t#        |      z         }| j!                  ||      }|j                  ||f       d | j!                  gg      }ddfd}t%        ||      D ]  \  }}| j'                  |j(                        rt+        |j(                        dk(  rdndz
  }|t        d	
      kD  rDt,        j/                  d|t        j0                  j2                  j4                  j6                         |j(                  |fc S |j(                  |k(  s|j(                  |fc S  |dfS c c}w c c}w )zr
        Generates a tiling, and a score of each tile according to each tile's coalesced memory accesses.
        Nc                      d d  S Nr   r   )r  ra  	pw_rangess   rW   r   z8SIMDScheduling.compute_tiling_strategy.<locals>.<lambda>  s    ykO#4B}oF rY   c                      d d  S ry  r   )r  
red_rangesr  s   rW   r   z8SIMDScheduling.compute_tiling_strategy.<locals>.<lambda>  s    zl"_$5RG rY   Fc                B  	 |rn}|rn}|s|r|gg fS g g fS t        |       ||f}j                  |d      x}r|S |rn}g }g }	d}
d}t        ||      D ]  \  }}|| vr"|
|z  }
j                  j                  |d      }-|r|k(  rj                  }|J |j
                  }t        ||j
                        }|j                  |
|z         |	j                  |j                         |j                  |       |	j                  j                  j                  |d             d}
d}|
|z  }
|j                  |
       |	j                  j                  j                  |d             d}
 |
dk7  s|r0t        |      dk(  r"|j                  |
       |	j                  |       t        t        |            D ]S  }t        j                  j                  j                  ||   d      }t        |d      }t!        |	|   |z  dz        |	|<   U ||	f|<   ||	fS )z]
            Generate a tiling, and a tiling score, given vars to use as splits.
            Nr8   r   r+  r      )r   r   ry  coalesced_by_varsuggested_splittiling_factorr   r   r2  r   ranger7   r   r   r   minr   )vars_to_useuse_split_varr<  r  target_numelr   outsplitting_varsrU  split_scoresprodprev_var_coalesced_scorer  v_range
var_tilingtile	remainderrJ  r   all_iter_varsall_red_varsr  ra  rz  r|  r  scored_sub_split
tiling_vars                      rW   process_node_varszASIMDScheduling.compute_tiling_strategy.<locals>.process_node_vars  sZ    #/YJF.:?L)NB//8O$m\BC&**355s5
.:]NFLD'($ ".&9 
7K'GOD/@/Q/Q/U/U10,  Q*_!2!B!BJ%111%33D (*2J2J KIMM$"23 ''
(8(89MM$' ''(9(J(J(N(NqRS(TUD/0,d###$5$F$F$J$J1a$PQ;> qy\c&kQ.>d###$<= 3v;' ?GG$$..vay2.F1I"%l1o&9A&=">Q?
 &,\$:S!L))rY   T)r<  )r  r<  r   rV   r8   r   )r2  gffffff?gGz?c                    d}| d   j                   j                         D ]"  }t        j                  |      s|z  }|z  }$ | d   j                   |z  S )Ng      ?r   )r  r   r7  r;  r2  )r  score_factor	tile_size"bad_size_additional_tiling_penaltygood_size_tiling_penaltys      rW   	score_modz9SIMDScheduling.compute_tiling_strategy.<locals>.score_mod  se    LqT[[//1 K	&33I>#/2T#TL#/2J#JL	K aDJJ;--rY   r   r   zmFound optimal tiling with %s tiles but torch._inductor.config.triton.max_tiles set to %s. Consider increasing)r   FF)r  ztuple[sympy.Expr, ...]r  r|   r<  r|   ry   ztuple[list[int], list[int]])r  ri  norm_read_writesr   reduce_varsrc   rR   _checkr2   r   r  r  rX   r  combinationsr7  rR  r&  r  tiling_is_compatibler  r   perf_hint_loginforS   r   rT   rU   )r  r  ra  r  r  r  r  score_splitr  overlapping_iter_varsr  rA  pw_splitpw_score	red_split	red_score	candidater  default_tilingr  cand
tiling_lenr  r  r  r  rz  r|  r  r  s    ````                 @@@@@@@@rW   compute_tiling_strategyz&SIMDScheduling.compute_tiling_strategy  s    %44 "2266 	 *::EE(99EE"33>>(561VAY6	)56AfQi6
)$7F	
 	*%8G	
 DF  	
 35"'!&K	*/K	*K	* K	* )	K	* K	*\ 	!t4!u5	
 %#T &59	 ->>CCEE 	 ' 	A%qd>%59	 #q(_-A(556KQO "")+DI)u= HJ<G 	68 Xx"89i'!!(I6(mc)n4I ,,XyALNNI|45	6 **O+<>OP .3*#( 	. #)i"@ 	1D,'' !-o6JPQR
a 88!&&9"..55??	 {{L00 {{n,{{L00)	1, t##s 76s   7M
Mc                T    t        t              sJ t        fd|D              S )Nc              3     K   | ]R  }t        |t        j                        r6t        j	                  j                         |j                                 T yw))r  N)r   r   r  rw   r  r   rW  )r   r   r  r  s     rW   r   z6SIMDScheduling.tiling_is_compatible.<locals>.<genexpr>  sO      
 $	 7 78	 $$!2O % 
s   AA)r   rZ  r   )r  r  rd   r  r  s      ``rW   r  z#SIMDScheduling.tiling_is_compatible  s1     &$''' 
 &	
 
 	
rY   c                B    |D ]  }| j                  ||||      s|c S  y rQ   )r  )r  r  rd   r  ru  r  s         rW   get_first_compatible_tilingz*SIMDScheduling.get_first_compatible_tiling  s1     % 	F''uovV	 rY   Nc                0    | j                  ||||      d   S ro  )r  )r  r  rd   r  r  s        rW   ra  zSIMDScheduling.select_tiling  s)     ((5/3D

 	rY   c                   |dk(  }| j                  |g|g      }t        j                  j                  j                  j
                  r0|r.t        j                  j                  s| j                  ||||      S |st        j                  j                  rt        d      dk  rt        j                  t        j                  k  rt        j                  |      D ]i  }t        j                  j                  rt!        | j#                  |||            dkD  s>t        j%                  t'        j(                  d              |dfS  |dfS t+               }t-        j.                         }	t        j                  |      D ]g  }| j#                  |||      D ]O  }
|
j0                  |v r|
j0                  |j3                  |
j0                         |	|
xx   |
j4                  z  cc<   Q i |	j7                         D 
cg c]  \  }
}|
j8                   }}
}t        d      dk\  r?|r=	 	 	 	 	 	 d
d}t;        dt!        |            D ]  } ||d   ||         }||g|z   } n t!        |      dkD  rt        j%                  d	|       t        j                  j                  r| j=                  |||      |z   }| j?                  ||||      x}r|dfS |dfS c c}}
w )z
        Heuristics to decide how to tile kernels.
        Currently, we tile based on stride-1 dimensions.

        Returns:
            `(tile1, tile2, reduction_numel)` s.t. `tile1 * tile2 == numel`

        r8   r   r  r   z
                                Reduction over non-contiguous dims.
                                Consider setting config.triton.tile_reductions to True.
                                Nr   c                8   | d   | j                  dd      }}|d   |j                  dd      }}t        ||g      s/t        j                  j                  j                  ||z
        dk(  ry t        j                  j                  j                  ||z
        dk  r||f||fc\  }}\  }}t        j                  j                  j                  ||z
        dkD  sJ t        j                  j                  j                  ||      sy |t        ||      || d   d}|S )NrM   rL   r8   r   rN   )rK   rL   rM   rN   )r   r   r7   r   r   r   r  r   )tiling0rm  a0a1b0b1
new_tilings          rW   convert_tiling_to_3dzBSIMDScheduling.get_tiling_and_scores.<locals>.convert_tiling_to_3d0	  s    !w{{3':B w{{3':B *2r(3ww''11"r':a?77##--b2g6:*,bB8&HRhr2ww''11"r':Q>>>ww''DDRL !"b)"5>	
 "!rY   zpossibly bad tiling: %s)r  rA  rm  rA  ry   rC  ) rR  rR   rS   r   rT   r  prefer_nd_tilingr  tile_reductionsrX   r  levelloggingWARNINGr@   r\  r   rL  r  textwrapdedentr   collectionsr   ra   r   r2  most_commonr  r  rv  r  )r  r  rd   r  r  r<  r  r   
seen_namescandidate_tilescandidate_tilingr2  ru  r  rJ  new_3d_tilingr  s                    rW   r  z$SIMDScheduling.get_tiling_and_scores  s    " '!+ **E7_4EF OO""))BB!MM22..uo7H  V]]%B%B}H
H ""goo5+22=A D"MM99 5 5dE? STWXX%**$OO!$ !4'' "4''&0l
4?4G4G4I#**=9 	LD$'$9$9$$W L #((J6%**6NN#3#8#89 015E5K5KK1L	L ,;+F+F+H7
' % ##7
 7

 #q(\"."9N"0"8 1c.12  4"1%~a'8! !,&3_~%EN ~"8.I ==))""=%I ! 
 445/>
 
6 
 4<t##7
s   9Kc                     y rQ   r   rk   s    rW   flushzSIMDScheduling.flushe	  r?  rY   c                     yr4  r   rk   s    rW   ready_to_flushzSIMDScheduling.ready_to_flushh	  r5  rY   c                   t        d |D              st        |d       j                  \  }\  }}| j                  |||      }| j	                  |||      }| j                  |t        |||            }| j                  ||       t        j                  d|      5  t        j                  |      5  |j                         }	d d d        d d d        nL|d   j                  |      \  }
}}t        j                  d|      5  | j                  |||
d      }	d d d        	j                  t!        t"        j$                        d	      }	|	S # 1 sw Y   xY w# 1 sw Y   @xY w# 1 sw Y   LxY w)
Nc              3  <   K   | ]  }|j                           y wrQ   )r]  )r   r   s     rW   r   zASIMDScheduling.generate_kernel_code_from_nodes.<locals>.<genexpr>l	  s     2q1==?2r  c                4    t        | j                               S rQ   r  r  s    rW   r   z@SIMDScheduling.generate_kernel_code_from_nodes.<locals>.<lambda>m	  r  rY   r   )r  r  r   Tr  triton_)r  r7  r  r  ra  rQ  rC   r  r   r  r7   r  r=  get_prologue_template_epiloguer  replacers   r.   KERNEL_NAME)rh   r   r  r  rd   r0  r  r  rf   r  r  templateepilogues                rW   generate_kernel_code_from_nodesz.SIMDScheduling.generate_kernel_code_from_nodesk	  ss   2E22!$U0O!P!V!VAv 77ufMM''ufEF%%+M5&I & F 22=&I/1AB3$$V,3 "002	3 3 3 ,18+R+R,(Hh 02BC 00&*	 1  ##C(?(?$@)L%3 3 3 3 s0   E3EEE$E	EE!$E-c                     y rQ   r   )rh   r  s     rW   r  zSIMDScheduling.codegen_comment	  r?  rY   c                    t         rQ   r)  )rh   r  r  rf   s       rW   r  zSIMDScheduling.define_kernel	  r-  rY   )r   z<Union[scheduler.FusedSchedulerNode, scheduler.SchedulerNode])rd   rv   r  zGIterable[Union[ir.Buffer, ir.TensorBox, ir.TorchBindObject, ir.IRNode]]ry   r|   )r  rC   )r  rC   ry   zlist[SIMDKernel])ry   Optional[str])F)r  zlist[BaseSchedulerNode]r  r|   r  r|   r  r|   r  r|   ry   zlist[tuple[str, Any, Any]])ry   rF  )rN  rF  rO  rF  ry   rA  )r  rF  r<  r|   ry   rA  )r  rA  rd   rv   r  rv   ry   rA  )ry   z"list[dict[str, tuple[sympy.Expr]]])
r  list[NodeScheduleEntry]ra  rv   r  rv   r  rG   ry   =tuple[dict[str, sympy.Expr], Optional[dict[str, sympy.Expr]]])r  r  rd   rv   r  rv   r  rA  )r  r  rd   rv   r  rv   ru  zlist[dict[str, sympy.Expr]])r  Optional[CoalesceVarAnalysis]ry   rA  )r  r  ry   r  r{   )+r~   r   r   r   rw   rQ  rL  rT  r[  can_fuse_verticalrd  r  r  rM  r  r  r  r  r  r  r#  r*  rN  r   r   rL  rR  r8  rH  rv  r  r  r  r   r   r   ra  r  r  r  r  r  r  r   rY   rW   rP  rP  Z  s/   
 (K'Q6B !"^@
P
, 

 
 :G)R
1
	
 -F SXt	tlI #(; 0;   $;  	; 
 ;   ;  
$; z)2 Y}  }~ 

,

@T

	

 

 
$
 
 
	
 
 /%/ / $	/
 
/ /( w
 
,w wr M$.M$ $M$ $	M$
 /M$ 
GM$ M$^ 
.
 
 $	

 &
 
  .  $	
 4  
 ;?	
 9	 
	 	 
 ;?~$
 9~$ 
G~$ ~$@<"rY   rP  T)frozenc                  @    e Zd ZU ded<   ded<   dZded<   ed        Zy)	r7  rA  r  r   r2  Nr  ra   c                r    t         j                  j                  j                  |       } | dk\  xr | dz  dk(  S )z@Somewhat arbitrary heuristic used to boost scores for some sizesr+  r   r  )r   s    rW   r;  zCandidateTiling.is_good_size	  s5     GG&&q)Bw(AFaK(rY   )r~   r   r   rL  ra   rM  r;  r   rY   rW   r7  r7  	  s)    !!JD-) )rY   r7  c                      e Zd Zy)r  N)r~   r   r   r   rY   rW   r  r  	  s    rY   r  )r   )rV   r   ry   r   )r   rJ  ry   rs   )|
__future__r   r  r  dataclassesr   r  r  r   r  r  r   typingr   r   r   r   r	   r
   r   typing_extensionsr   r   rR   torch._loggingtorch._inductor.tiling_utilsr   %torch.fx.experimental.symbolic_shapesr   torch.fx.immutable_collectionsr   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._sympy.symbolr   r   r   r   _dynamo.utilsr    r   r   r   analyze_preserves_zero_maskr   	codecacher   dependenciesr    r!   r"   r#   optimize_indexingr$   runtime.runtime_utilsr%   r&   r'   r(   r)   utilsr*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   virtualizedr5   r6   r7   block_analysisr9   commonr:   r;   r<   r=   multi_kernelr>   simd_kernel_featuresr?   r@   rA   rB   rC   collections.abcrD   rE   rF   rG   	getLoggerr~   r  _logginggetArtifactLoggerr  r  
fusion_logdoprintrK  r@  rX   	dataclassr[   rx   r   r   r   rw   rP  r7  	Exceptionr  r   rY   rW   <module>r     s   "           X X X %    B G 9 / L L  & $ $ F ! 6 6  A ; D D    - , / P P %  <<@ g!00<H~~//*E^^--hA
 	78;
 5+ 5+ 5+pN;/ N;b;'? ;'| +;Td('/*B dNs"^ s"l) d#	) 	) $	)		 	rY   