
    rh                      d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlmZmZ d dlmZmZmZmZmZmZmZ erd dlmZ d dlmZ d dlZd dlZd dlZd dlm Z m!Z! d dl"m#Z#m$Z$ d d	l%m&Z&m'Z' d d
l(m)Z) d dl*m+Z+ d dl,m-Z-m.Z.m/Z/ d dl0m1Z1 ddl2m3Z3m4Z4m5Z5m6Z6m7Z7 ddl8m9Z9 ddl:m;Z;m<Z<m=Z= ddl>m?Z? ddl5m@Z@mAZAmBZBmCZC ddlDmEZEmFZF ddlGmHZHmIZI ddl6mJZJmKZKmLZLmMZMmNZN ddlOmPZP ddlQmRZRmSZS ddlTmUZUmVZV ddlWmXZX ddlYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZemfZfmgZgmhZh ddlimjZj  ej                  el      Zmej                  j                  eld      Zpej                  j                  eld      Zqerd   Zsej                   G d  d!             Zuej                   G d" d#eu             Zv G d$ d      Zw G d% d&      Zxd@d'Zy G d( d)      Zz	 	 	 	 	 	 	 	 dAd*Z{ G d+ d,ew      Z| G d- d.ew      Z} G d/ d0ew      Z~	 	 	 	 dBd1Z	 	 	 	 	 	 	 	 dCd3Z G d4 d5ew      Z G d6 d7e      Z G d8 d9ew      Z	 dD	 	 	 	 	 	 	 dEd:Zej                   G d; d<             Z ej                         Z G d= d2      Z G d> d?      Zy)F    )annotationsN)Counterdefaultdict)AnyCallableGenericOptionalTYPE_CHECKINGTypeVarUnion)Sequence)
ModuleType)countersdynamo_timed)LambdaFuturePyCodeCache)get_metric_tableis_metric_table_enabled)free_symbols
OrderedSet)free_symbol_is_typesymbol_is_typeSymT)
has_triton   )commsconfigdependenciesirmetrics)can_codegen_without_upcasts)BackendFeatureget_scheduling_for_deviceKernel) estimate_nccl_collective_runtime)Dep	MemoryDepStarDepWeakDep)GPUTooOldForTritonTritonMissing)count_flops_fxcountable_fx)get_device_typeGraphPartitionSignatureMultiOutputMultiOutputLayout
NoneLayout)LoopBody)MemoryPlanningInfoForBufferMemoryPlanningInfoForNode)
green_textred_text)SimplifyIndexing)cache_on_selfcmpdevice_need_guardget_device_tflopsget_dtype_sizeget_gpu_dram_gbpsGraphPartitionMapIndentedBufferis_collectiveis_cudagraph_unsafe_opis_gpuis_multi_outputs_template#is_output_of_multi_outputs_templateis_waitsympy_product)Vfusionloop_orderingBaseSchedulerNodec                      e Zd ZU ded<   ded<   ded<    ej
                  e      Zded	<    ej
                  e      Z	d
ed<   ddZ
ddZddZddZddZddZddZddZddZddZy)SchedulerBuffer	Scheduler	schedulerz	ir.BuffernodeOptional[BaseSchedulerNode]defining_op)default_factorylist[NodeUser]usersr5   
mpi_bufferc                B    | j                   }|J |j                         S N)rS   get_name)selfops     l/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/torch/_inductor/scheduler.pydefining_op_namez SchedulerBuffer.defining_op_name[   s#    ~~{{}    c                @    t        | j                  j                        S rY   )hashrQ   namer[   s    r]   __hash__zSchedulerBuffer.__hash__`   s    DIINN##r_   c                v   t               }| j                         }|j                  | dt        | j                        j
                          |j                  | d| j                  j                          | j                         r-|j                  | dt        | j                                       | j                         r-|j                  | dt        | j                                       t        | j                        dk  r0|j                  | d| j                          |j                         S |j                  | d       |j                  d      5  | j                  D ]  }|j                  | d        	 d d d        |j                  d	       |j                         S # 1 sw Y   *xY w)
N: z
.layout = z.aliases = z.mutations = r   z	.users = z
.users = [,])rA   rZ   	writelinetyperQ   __name__layoutget_aliasespformatget_mutationslenrV   indentgetrawvalue)r[   resultrb   users       r]   	debug_strzSchedulerBuffer.debug_strc   s   !}}D6DO$<$<#=>?D6DII,<,<+=>?v[9I9I9K1L0MNOv]74;M;M;O3P2QRStzz?avYtzzl;< !!## vZ01q! 1 JJ 1D$$vQZ011 S!!!##	1 1s   &F//F8c                6    | j                   j                         S rY   rQ   rZ   rc   s    r]   rZ   zSchedulerBuffer.get_namew       yy!!##r_   c                   | j                   J | j                   j                         sy | j                   j                         sL| j                   j                         s2t	        | j                   j                         t        j                        r4t        j                  j                  j                  | j                          y t        t        j                  d      r| j                         t        j                  j                  v rt        j                  j                  | j                            }|| j                   j"                  v r$| j                   j"                  |   j                   }n#| j                   j$                  |   j                   }t        j                  j                  j'                  || j                          y t        j                  j                  j                  | j                          y )Nargs)rQ   should_allocateget_inputs_that_alias_outputget_mutation_names
isinstanceget_output_specr    CommBufferLayoutrI   graphwrapper_codecodegen_allocationhasattrkernelrZ   inplace_update_buffersrP   name_to_donated_buffername_to_bufcodegen_inplace_reuse)r[   input_buffer_nameinput_buffers      r]   allocatezSchedulerBuffer.allocatez   sV   yy$$$yy((* II224yy++-$))335r7J7JKGG  33DII> AHHf%188#B#BB !" ? ? P DNN$I$II#~~DD% $   $~~99:KLQQGG  66		
 GG  33DII>r_   c                   | j                   J t        | j                   j                  t        j                        st        | j                         ry| j                  D ]  }t        |j                   t              s y yNFT)rQ   r~   rl   r    r3   rE   rV   
OutputNode)r[   uses     r]   can_freezSchedulerBuffer.can_free   sg    yy$$$dii&&6:SII;
 :: 	C#((J/	 r_   c                ,   i }|D ]o  }t        |j                        |v r>|j                  |t        |j                                 |t        |j                        <   X||t        |j                        <   q t        |j	                               | _        y rY   )idrQ   mergelistvaluesrV   )r[   rV   rs   r   s       r]   	set_userszSchedulerBuffer.set_users   st    &( 	+C#((|v%'*yy3881E'Fr#((|$'*r#((|$		+
 &--/*
r_   c                R    | j                   J | j                   j                         S rY   )rQ   r|   rc   s    r]   rm   zSchedulerBuffer.get_aliases   s%    yy$$$yy5577r_   c                R    | j                   J | j                   j                         S rY   )rQ   r}   rc   s    r]   ro   zSchedulerBuffer.get_mutations   %    yy$$$yy++--r_   c                R    | j                   j                         j                         S rY   )rQ   r   
get_devicerc   s    r]   r   zSchedulerBuffer.get_device   s    yy((*5577r_   Nreturnstrr   intr   Noner   bool)rV   rU   r   r   r   zSequence[str]r   Optional[torch.device])rk   
__module____qualname____annotations__dataclassesfieldr   rV   r5   rW   r^   rd   ru   rZ   r   r   r   rm   ro   r    r_   r]   rN   rN   Q   sz    
O,,-K--dCE>C.?k.?.?3/J+ 
$$($?B
+8.8r_   rN   c                      e Zd ZU dZded<   y)SchedulerDonatedBufferNrR   rS   )rk   r   r   rS   r   r   r_   r]   r   r      s    /3K,3r_   r   c                     e Zd ZU ded<   ded<   ded<   ded<   ded	<   d
ed<   d@dZdAdZdBdZdBdZdBdZdCdZ	dBdZ
dDdZ	 	 	 	 	 	 dEdZdFdZdGdZdHdZdIdZ	 	 	 	 	 	 dJdZdDdZdKdZdKdZdDdZdDdZ	 	 	 	 dLdZdBd ZdBd!ZedKd"       ZedKd#       ZedHd$       ZedHd%       ZdMd&ZdNd'Z dOd(Z!dPd)Z"dHd*Z#dHd+Z$dHd,Z%dHd-Z&dHd.Z'dHd/Z(dHd0Z)dQd1Z*dHd2Z+dDd3Z,	 dR	 	 	 	 	 dSd4Z-edTd5       Z.edTd6       Z/edTd7       Z0	 	 	 	 	 	 dUd8Z1	 	 	 	 	 	 dVd9Z2edWd:       Z3edXd;       Z4dYd<Z5dZd=Z6e7	 	 	 	 d[d>       Z8y?)\rL   z7tuple[torch.device, tuple[tuple[sympy.Expr, ...], ...]]groupdependencies.ReadWritesread_writesOrderedSet[Dep]unmet_dependenciesr   	min_order	max_orderr6   mpi_nodec                "    || _         d | _        y )Nc                     g S rY   r   )rz   kwargss     r]   <lambda>z,BaseSchedulerNode.__init__.<locals>.<lambda>   s    B r_   )rP   debug_device_str)r[   rP   s     r]   __init__zBaseSchedulerNode.__init__   s    $-& 	r_   c                L   || _         t               | _        t        t                  | _        d| _        |j                         D cg c]  }t        | j                  ||        c}| _	        | j                  D ci c]  }|j                         | c}| _        y c c}w c c}w )NF)rP   rQ   rS   )rQ   r   	ancestorsr   
last_usagewrittenget_outputsrN   rP   outputsrZ   outputs_by_name)r[   rQ   outputbufs       r]   _init_from_nodez!BaseSchedulerNode._init_from_node   s    ,0	*4,$
   **,/
  .. /
 ,0<<<
$'CLLNC<
/
<
s   B;B!c                T    t        |       j                   d| j                         dS )Nz(name=)rj   rk   rZ   rc   s    r]   __repr__zBaseSchedulerNode.__repr__   s'    t*%%&fT]]_,?qAAr_   c                H   | j                         }t               }|j                  | dt        |       j                   dt        t        | dd            j                   d| dt        | j                  j                         d| dt        | j                         d| d	t        | j                  j                  | j                  z
         d| d
       |j                         5  | j                         D ]!  }|j                  |j                                # 	 ddd       |j                  d       	 |j                  | j                                |j'                         j)                         S # 1 sw Y   XxY w# t         $ r t"        j%                  dd       Y Lw xY w)#Longer form printout for trace logsrf   (rQ   N)

.writes = 
.unmet_dependencies = .met_dependencies = z.outputs = [
        rh   Ignoring error in debug_str()Texc_info)rZ   rA   splicerj   rk   getattrrn   r   writesr   readsrq   r   ru   ri   debug_str_extra	Exceptionlogwarningrr   rstrip)r[   rb   r   outs       r]   ru   zBaseSchedulerNode.debug_str   s   }}

bd		QtGD&$$?@IIJ Kj))0012 3WT%<%<=> ?74#3#3#9#9D<S<S#STU V 		
 ZZ\ 	,'') ,

3==?+,	, 	c	HJJt++-.  ''))	, 	,  	HKK7$KG	Hs   %5E25E> 2E;> F! F!c                     y)N r   rc   s    r]   r   z!BaseSchedulerNode.debug_str_extra      r_   c                $    | j                  |       S rY   )r   rc   s    r]   _debug_str_for_devicez'BaseSchedulerNode._debug_str_for_device  s    $$T**r_   c                   t        | j                  dd       }d}t        |t        j                  j
                  j                        r'd|j                  |j                         gdd      z   }nct        |t        j                  j
                  j                        r5d|j                  |j                         |j                         gdd      z   }|  | S )Ndatar   z, F)shorten	multiline)r   rQ   r~   torch	_inductorr    	Pointwise
str_helperget_size	Reductionget_reduction_sizeget_reduction_type)r[   
maybe_datadata_strs      r]   debug_str_shortz!BaseSchedulerNode.debug_str_short
  s    TYY5
j%//"4"4">">?j33$$&'% 4  H 
EOO$6$6$@$@Aj33..0*2O2O2QR 4  H
 z""r_   c                p    t         j                  d| | j                  | j                  j                         y )Nz(%s: unmet_dependencies = %s, writes = %s)r   infor   r   r   rc   s    r]   log_detailszBaseSchedulerNode.log_details  s,    6####		
r_   c                     y rY   r   )r[   self_dep	other_deps      r]   reorder_loops_by_dep_pairz+BaseSchedulerNode.reorder_loops_by_dep_pair!  s     	r_   c                X    | j                  | j                  j                  |             y rY   )set_read_writesr   renamer[   renamess     r]   update_mutated_namesz&BaseSchedulerNode.update_mutated_names&  s!    T--44W=>r_   c                X    | j                  | j                  j                  |             y rY   )r   r   	with_readr[   deps     r]   add_fake_depzBaseSchedulerNode.add_fake_dep)  s!    T--77<=r_   c                B    t        d | j                         D              S )Nc              3  `   K   | ]&  }|j                         xs |j                          ( y wrY   )rm   ro   ).0r   s     r]   	<genexpr>z=BaseSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>-  s-      
9<COO4!2!2!44
s   ,.)anyr   rc   s    r]   has_aliasing_or_mutationz*BaseSchedulerNode.has_aliasing_or_mutation,  s%     
@D@P@P@R
 
 	
r_   c                h    || _         | j                   j                  | _        | j                          y rY   )r   r   r   
prune_deps)r[   rws     r]   r   z!BaseSchedulerNode.set_read_writes1  s(    "&"2"2"8"8r_   c                b    | j                         }t        fd|D              }||z
  | _        y )Nc              3  B   K   | ]  }j                  ||        y wrY   )get)r  kmutation_real_names     r]   r  z3BaseSchedulerNode.set_last_usage.<locals>.<genexpr>:  s     !U1"4"8"8A">!Us   )used_or_aliased_buffer_namesr   r   )r[   future_used_buffersr  used_bufferss     ` r]   set_last_usagez BaseSchedulerNode.set_last_usage6  s0     88:!!U!UU&)<<r_   c                F    | j                   D ]  }|j                           y rY   )r   r   )r[   r   s     r]   mark_runzBaseSchedulerNode.mark_run=  s    << 	CLLN	r_   c                    t        d t        j                  | j                  j                  | j                  j
                        D              S )Nc              3  4   K   | ]  }|j                     y wrY   rb   r  r  s     r]   r  z6BaseSchedulerNode.used_buffer_names.<locals>.<genexpr>B  s      
 HH
   )r   	itertoolschainr   r   r   rc   s    r]   used_buffer_namesz#BaseSchedulerNode.used_buffer_namesA  s?     
 t'7'7'='=t?O?O?V?VW
 
 	
r_   c                $   t               t        j                  | j                  j                  | j                  j
                        D cg c]  }|j                   }}t        |      dkD  r|j                         }j                  |       t        j                  j                  j                  |      rC|j                  fdt        j                  j                  |   j                         D               t        |      dkD  rS c c}w )Nr   c              3  *   K   | ]
  }|vr|  y wrY   r   )r  alias
used_namess     r]   r  zABaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<genexpr>R  s#       J.	 s   )r   r"  r#  r   r   r   rb   rp   popaddrI   r   name_to_bufferr  extendr|   )r[   r  depsr(  s      @r]   r  z.BaseSchedulerNode.used_or_aliased_buffer_namesG  s    &0l
 !t'7'7'='=t?O?O?V?VW
 HH
 
 $i!m((*CNN3ww%%))#. !"!7!7"224	 	 $i!m 
s   Dc                L     t         fd j                  D               _        y )Nc              3  f   K   | ](  }|j                   j                  j                  vr| * y wrY   )rb   rP   available_buffer_namesr  r  r[   s     r]   r  z/BaseSchedulerNode.prune_deps.<locals>.<genexpr>\  s/      -
xxt~~DDD -
s   .1r   r   rc   s   `r]   r  zBaseSchedulerNode.prune_deps[  s#    ", -
..-
 #
r_   c                     d fdt        fd j                  j                  D              } j                   j                  j	                  |             y )Nc                    t        | t              syj                  j                  | j                     j                         }|t        j                  j                  v S NF)	r~   r*   rP   r   rb   r^   rI   r   removed_operations)r  op_namer[   s     r]   should_prunez7BaseSchedulerNode.prune_weak_deps.<locals>.should_pruned  sF    c7+nn00:KKMGagg8888r_   c              3  4   K   | ]  } |      s|  y wrY   r   r  r  r8  s     r]   r  z4BaseSchedulerNode.prune_weak_deps.<locals>.<genexpr>j  s      
\#5FC
   r  r'   r   r   )r   r   r   r   remove_reads)r[   	to_remover8  s   ` @r]   prune_weak_depsz!BaseSchedulerNode.prune_weak_depsb  sN    	9  
++11
 
	 	T--::9EFr_   c                F    t        | || j                  j                         y rY   )_prune_redundant_depsrP   r   )r[   name_to_fused_nodes     r]   prune_redundant_depsz&BaseSchedulerNode.prune_redundant_depso  s     	d$68R8RSr_   c                R    | j                   J | j                   j                         S rY   )rQ   get_operation_namerc   s    r]   rZ   zBaseSchedulerNode.get_namet  r   r_   c                "    | j                         S rY   rZ   rc   s    r]   get_first_namez BaseSchedulerNode.get_first_namex  s    }}r_   c                B    t        d | j                         D              S )Nc              3  <   K   | ]  }|j                           y wrY   rG  r  rQ   s     r]   r  z8BaseSchedulerNode.get_operation_names.<locals>.<genexpr>}  s     Gd$--/G   )r   	get_nodesrc   s    r]   get_operation_namesz%BaseSchedulerNode.get_operation_names{  s    Gdnn6FGGGr_   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wrY   rG  r  r   s     r]   r  z5BaseSchedulerNode.get_buffer_names.<locals>.<genexpr>  s     AS#,,.ArL  )r   r   rc   s    r]   get_buffer_namesz"BaseSchedulerNode.get_buffer_names  s    ADLLAAAr_   c                B    t        d | j                         D              S )Nc              3  Z   K   | ]#  }t        |t              xr t        |d        % yw)T)disallow_fp32_opsNr~   SchedulerNoder"   r  ns     r]   r  zABaseSchedulerNode.can_codegen_in_low_precision.<locals>.<genexpr>  s7      
  q-( G+AFG
s   )+allrM  rc   s    r]   can_codegen_in_low_precisionz.BaseSchedulerNode.can_codegen_in_low_precision  s%     
 ^^%
 
 	
r_   c                B    t        d | j                         D              S )Nc              3  V   K   | ]!  }t        |t              xr t        |       # y wrY   rV  rX  s     r]   r  z@BaseSchedulerNode.can_codegen_without_upcasts.<locals>.<genexpr>  s-      
 q-(K-H-KK
s   ')rZ  rc   s    r]   r"   z-BaseSchedulerNode.can_codegen_without_upcasts  s#     
^^%
 
 	
r_   c                    | gS rY   r   rc   s    r]   rM  zBaseSchedulerNode.get_nodes  s	    vr_   c                    | j                   S rY   )r   rc   s    r]   r   zBaseSchedulerNode.get_outputs  s    ||r_   c                     | j                   |   S rY   )r   )r[   buf_names     r]   
get_outputzBaseSchedulerNode.get_output  s    ##H--r_   c                R    | j                   J | j                   j                         S rY   )rQ   r   rc   s    r]   r   zBaseSchedulerNode.get_device  s%    yy$$$yy##%%r_   c                L    | j                         }|d uxr |j                  dk(  S Ncpu)r   rj   r[   devices     r]   is_cpuzBaseSchedulerNode.is_cpu  s'    "T!:fkkU&::r_   c                X    | j                         }|d uxr t        |j                        S rY   )r   rD   rj   rh  s     r]   rD   zBaseSchedulerNode.is_gpu  s'    "T!9fV[[&99r_   c                     yr5  r   rc   s    r]   is_reductionzBaseSchedulerNode.is_reduction      r_   c                     yr5  r   rc   s    r]   is_split_scanzBaseSchedulerNode.is_split_scan  rn  r_   c                     yr5  r   rc   s    r]   is_templatezBaseSchedulerNode.is_template  rn  r_   c                     yr5  r   rc   s    r]   	is_externzBaseSchedulerNode.is_extern  rn  r_   c                     yr5  r   rc   s    r]   
is_foreachzBaseSchedulerNode.is_foreach  rn  r_   c                     yr5  r   r[   read_deps     r]   can_inplacezBaseSchedulerNode.can_inplace  rn  r_   c                     yr5  r   rc   s    r]   has_side_effectsz"BaseSchedulerNode.has_side_effects  rn  r_   c                \
    ddl m} t         t              rt        j
                  rt        j                  j                   j                         t        j                        r{t        t        j                  t        j                  j                  j                   j"                        rt%        t        j                  dd      t'        t        j                  d      sy j(                  t        j                  j*                  z   j,                  j.                  z  }d fd} j1                         D ]  }|j2                  }|J |j5                         rJ|j7                         s:|j9                         s*|j;                         t        j                  j<                  v ro j>                  j@                  D ]h  }|jB                   j,                  jD                  v r$ j,                  jD                  |jB                     }n/ j,                  jF                  jI                  |jB                        }|s|t        j                  jJ                  jM                  |       st        |jN                  tP              r|jR                  J |jR                  D cg c]   }|j2                  j;                         |vr|" }	}tU        |	      dk(  s|	d   jV                  s&|	d   j2                   u s9|j2                  Gt        |j2                  jY                         tZ        j\                  tZ        j^                  tZ        j`                  f      r|jN                  rft        |jN                  j2                  tZ        jb                  tZ        jd                  f      r(tU        |j2                  j7                               dkD  r ||j2                  |j2                        s+ ||      s5t        j                  jf                  ji                  |j;                         |j;                                t        t        j                  t        j                  j                  j                   j"                        rnt        j                  jj                  jm                  |j;                                t        j                  jj                  jm                  |j;                                |j;                         t        j                  jn                  |j;                         <      yc c}w )	z~
        Decide if there should be inplace updates for the node
        and record the decision in the active kernel.
        r   )can_match_buffer_size	mutationsNrz   c                   | j                   j                        }| j                         t               }| j                  D ]  }|j
                  }t        |t              s |j                         | j                   j                  vs| j                   j                  |      |urd|fd|j                  j                         D        z  }t        |      dkD  s y y)Nc              3  @   K   | ]  }|j                   k(  r|  y wrY   r  )r  orb  s     r]   r  z^BaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node.<locals>.<genexpr>  s%      vv)    r   FT)rP   get_fused_noderZ   r   rV   rQ   r~   rL   rH  rB  r   reads_and_writesrp   )buf_to_be_inplaced
fused_noder-  rt   	user_noderb  r[   s        @r]   single_index_in_fused_nodezKBaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node  s    
 ,55DDTJJ)224H %/LD*00 ! II	!)->? ,,.-77JJK)33BB9M%&  &22CCE 
 t9q= '!* r_   r   )r  rN   r   r   )8codegen.wrapperr~  r~   rW  r   inplace_buffersrI   r   has_featurer   r#   INPLACE_BUFFERSr   r   r   codegensimd
SIMDKernelr   r   r   r6  rP   completed_operationsr   rQ   r{   r|   r}   rZ   removed_buffersr   r   rb   r   r   r  r   	can_reuserS   NopKernelSchedulerNoderV   rp   rz  r   r    r3   r2   MutationLayoutSHOULDREMOVEFallbackKernelr1   rz   make_inplacer  r*  r   )
r[   r~  inconsequential_nodesr  r   buf_noderead	input_bufxremaining_usess
   `         r]   decide_inplace_updatez'BaseSchedulerNode.decide_inplace_update  s   
 	; t]+&&##DOO$5~7U7UVqxx)@)@)E)E)P)PQ188[$7C &) NNgg(()nn112 	 	D ##% C	CxxH''',,.88:..0<<>QWW%<%<<((.. 899 E EE $ E Edii PI $ : : > >tyy II ,,66y$G&y'<'<>TU$??666 "+&66??,4II &N & N+q0*1-99*1-22d:%NN6 *%NN::< " " 4 4 " = =! &11 * ) 5 5 : :!#!2!2BNN C! !$INN$O$O$Q RUV V1)..#((K6yA
 2293E3E3GX%HHeoo&=&=&B&B&M&M HH..2293E3E3GHHH..223<<>B &..0 77G q8C	0&s   %T)c                .   t         j                  sy |r| j                  ry | j                  J | j                  j	                         }g }|D ]  }|j
                  dk(  r|j                  d       |j                  d       d|j
                   d|j                   }d|j                  v r|d|j                  d    z   }|j                  |       d|j                  v s|j                  d    }|j                  d	      d
   }|j                  d|j                  dd      j                  dd      j                  dd      z          |j                  d       |j                  d       ! t        |      dk(  ry |j                  |       d| _        y )Nr   r   z#pragma CMT ORIGIN:z#pragma CMT  seq_nrz seq_nr:stack_trace|{z{{}z}}r   \z#pragma CMT END ORIGINr   T)r   comment_originr   rQ   get_originsr\   appendtargetmetasplitreplacerp   
writelines)	r[   buffer	only_onceorigins	out_linesr  op_info_strr  stack_trace_last_lines	            r]   codegen_originating_infoz*BaseSchedulerNode.codegen_originating_info>  s    $$yy$$$))'')	 	%AttxR 23(az:K166!)hqvvh7G6H,II[)&!"!6 7(3(9(9#(>r(B%  "+33C>WS$'WT4()   !9:  $-	%0 y>Q 	)$r_   c                (    | j                  dd      S )NTinclude_readsinclude_writes!get_read_write_buffers_sizes_implrc   s    r]   get_read_write_buffers_sizesz.BaseSchedulerNode.get_read_write_buffers_sizesj  s    55t 6 
 	
r_   c                (    | j                  dd      S )NTFr  r  rc   s    r]   get_read_buffer_sizesz'BaseSchedulerNode.get_read_buffer_sizesp  s    55u 6 
 	
r_   c                (    | j                  dd      S )NFTr  r  rc   s    r]   get_write_buffer_sizesz(BaseSchedulerNode.get_write_buffer_sizesv  s    55 6 
 	
r_   c                Z    t        | j                  ||      j                         d      S )Nr  r   )start)sumget_read_write_buffer_accessesr   )r[   r  r  s      r]   r  z3BaseSchedulerNode.get_read_write_buffers_sizes_impl|  s3     //+N 0 fh	
 	
r_   c                    t         t              ri S t         t              rt         j                  t              ri S t         t              r`t         j                  t
        j                        r< j                  j                  t        j                  j                  j                  u ri S ddt         t              r@ t         j                         d         t         j                         d         z        nt        d      t!        j"                  t$              }|r9 j&                  j(                  D ]   }||j*                     j-                  |       " |r9 j&                  j.                  D ]   }||j*                     j-                  |       " |r&t1        d  j&                  j(                  D              n	t1               }|r&t1        d  j&                  j.                  D              n	t1               }d fdt         t2              rt1         fd|D              }||z
  }||z
  }i }||z  D ]  }	t5        fd	||	   D              |	t6        j8                  j:                  v rt6        j8                  j:                  |	   }
n;|	t6        j8                  j<                  v rt6        j8                  j<                  |	   }
n	 	 	 	 d fd
 |
      }|	|vr|||	<   ||	xx   |z  cc<    |S )az  
        Counting the number of bytes accessed for a kernel is
        surprisingly tricky. In particular, there is a differentiation
        between 'theoretical' memory accesses and practical memory
        accesses. For example, a layernorm kernel may actually access an
        input 3 times, but in theory, it only needs to access its input
        once (and may be optimized to do so through say, persistent
        reductions)

        Another example is that even though a buffer is passed in, we may
        not access the entire buffer. This may occur if we are accessing
        a slice of the buffer. Another tricky case is for indirect
        indexing, where the amount of bytes accessed depends on the
        values of the input.

        What this function aims to compute is the memory accesses for
        worst-case inputs, best-case optimization. What this means is
        that for each buffer we compute the amount of potential accesses in two ways and take the minimum.

        1. Numel in ranges multiplied by number of deps the buffer has
        2. The buffer size

        Returns memory accesses per buffer.
        c                X    t         j                  j                  j                  | d      S )Nr   fallback)rI   r   sizevars	size_hint)ss    r]   try_size_hintzGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.try_size_hint  s"    77##--a!-<<r_   r   r       eAc              3  4   K   | ]  }|j                     y wrY   r  r   s     r]   r  zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>  s     BCsxxBr!  c              3  4   K   | ]  }|j                     y wrY   r  r   s     r]   r  zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>  s     CCsxxCr!  c                    j                   j                  |    j                  }t        d |D              }t	        |t        |      z
        dkD  S )Nc              3  4   K   | ]  }|j                     y wrY   rQ   )r  rt   s     r]   r  z\BaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized.<locals>.<genexpr>  s     !>$))!>r!  r   )rP   r   rV   r   rp   )r   snodesrV   buf_usesr[   s       r]   is_materializedzIBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized  sG    NN..s399E!!>!>>Hx*V"44599r_   c              3  J   K   | ]  } |j                         r|  y wrY   r  )r  r  r  r[   s     r]   r  zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>  s#      )_S$++-N)s   ##c              3  "   K   | ]  }  y wrY   r   )r  r  
node_numels     r]   r  zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>  s     $RCZ$Rs   c                   | syt        | t        j                        r| j                         S t        | j                  t
              rj                  j                  | j                            j                  }d}|D ]x  }t        |j                  t              sJ t        |j                  j                  t              r5|j                  j                         D ]  }| |j                        z  } x y |S t        | j                  t        j                        r"t        fd| j!                         D              S  	t#        | j%                                     }t'        | j)                               t+        |      z  S )Nr   c              3  h   K   | ])  } t         j                  j                  |             + y wrY   )rI   r   
get_buffer)r  mut_nameget_buf_bytess     r]   r  zZBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes.<locals>.<genexpr>   s-      $ &agg&8&8&BCs   /2)r~   r    TorchBindObjectr  rl   r2   rP   r   rZ   rV   rQ   rL   r1   r   r3   r  r}   rH   r   r>   	get_dtypemin)
r   rV   totrt   	sched_buf	buf_elemsbuf_accessed_elemsr  r[   r  s
         r]   r  zGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes  sC    c2#5#56,,..

,=> !NN66s||~FLLEC % 	%)$))5FGGG%diinnkB-1YY-B-B-D E	 #}Y^^'D DE $%	% J

BMM: (+(>(>(@  
 !.mCLLN.K LI)#--/:S*I>  r_   )r  z
sympy.Exprr   r   )r   r   r  Sequence[BaseSchedulerNode]r   r   )r   z<Optional[Union[ir.Buffer, ir.TensorBox, ir.TorchBindObject]]r   r   )r~   r  ExternKernelSchedulerNoderQ   r1   r    r  op_overloadr   _prims	rng_primsgraphsafe_run_with_rng_staterW  rH   
get_rangesr   collectionsr   r   r   r   rb   r  r   r   FusedSchedulerNoder  rI   r   r+  graph_inputs)r[   r  r  buf_accessesr  r   r   r  buf_byte_accessesrb  r   	buf_bytesr  r  r  r  r  s   `           @@@@@r]   r  z0BaseSchedulerNode.get_read_write_buffer_accesses  s   6 d23Id56:II{<
 It67499b&7&78		%%||%%BBC I	= dM*&doo/23 1! 456J
 SJ"..t4''-- 3SXX&--c23 ''.. 3SXX&--c23
  B4+;+;+A+ABB 	  C4+;+;+B+BCC 		:
 d./( )%) O o-FO+E,. 1	9H!$$R<;Q$R!R177111gg,,X6QWW111gg**84!Q!! !F &c*I00.7!(+!(+y8+c1	9f ! r_   c                   | j                   y | j                   j                         }|y t        |      sy t        |      }t        j
                  j                  j                  |fd      d   }t        d   dxx   |z  cc<   |S )Nr   r  inductor
flop_count)	rQ   get_origin_noder.   r-   rI   r   r  
size_hintsr   )r[   fx_nodeflopsresolved_flopss       r]   estimate_flopsz BaseSchedulerNode.estimate_flops  s    99))++-?G$w'))44eX4J1M\*n<*r_   c                   | j                         d   j                         d   }|j                  j                         }t	        t        |            syt        | j                        r<t        | j                  t        j                        sJ 	 t        | j                        S t        | j                        ry|j                  j!                         }	 t#               }t%        |      dz  }|dk  rt'        d|       |dk  rt'        d|       	 | j+                         }|dk(  s|| j-                         |z  S d}| j-                         }	|	dn|	}	||z  |z  dz  }
|	|z  }t/        |
|      S # t        $ r}t        j                  |       Y d}~yd}~wt        $ r}t        j                  |       Y d}~yd}~ww xY w# t(        $ r Y yw xY w)zB
        Returns estimated op runtime in nanoseconds (ns)
        r   Nl    J)z-gpu_memory_bandwidth cannot be <= 0, but got z"gpu_flops cannot be <= 0, but got g      ?r  )rM  r   rQ   r   rD   r/   rB   r~   r    IRNoder&   
ValueErrorr   r   	TypeErrorrG   maybe_get_dtyper?   r=   AssertionErrorr   r  r  max)r[   r   rl   edtypegpu_memory_bandwidth	gpu_flops	flops_estfactorcounted_bytescompute_timetransfer_times               r]   get_estimated_runtimez'BaseSchedulerNode.get_estimated_runtime"  s   
 nnq!--/2))+of-. #dii333
7		BB TYY
 ((*	#4#6 )%069I $q($CDXCYZ  A~$'I)%UVV 
 '')	>Y.4469MMM 99;*2*Y6#=%(<< <//_    2  		s6   E3 >G 3	G<FG"F<<G	GGc                     y rY   r   rc   s    r]   get_template_nodez#BaseSchedulerNode.get_template_nodec      r_   c                .    | j                         }|J |S rY   r  )r[   templates     r]   get_template_node_or_throwz,BaseSchedulerNode.get_template_node_or_throwf  s!    ))+###r_   c                f    t        d t        |       D              }| d| }| |   }| |dz   d }|||fS )zQ
        For the list of nodes, get the prologue, template, and epilogue
        c              3  H   K   | ]  \  }}|j                         s|  y wrY   rr  )r  irY  s      r]   r  zCBaseSchedulerNode.get_prologue_template_epilogue.<locals>.<genexpr>r  s     PDAqaPs   ""Nr   )next	enumerate)nodestemplate_indexprologuetemplate_nodeepilogues        r]   get_prologue_template_epiloguez0BaseSchedulerNode.get_prologue_template_epiloguek  sN     PIe,<PP.)n-!+-.00r_   N)rP   rO   r   r   )rQ   ir.Operationr   r   r   )r   z	list[str]r   r   r(   r   r(   r   r   r  dict[str, str]r   r   )r  r'   r   r   r   )r  r   r   r   r  OrderedSet[str]r  r$  r   r   r   r&  rB  dict[str, BaseSchedulerNode]r   r   r   r  )r   zSequence[SchedulerBuffer])rb  r   r   rN   r   ry  zdependencies.Depr   r   T)r  rA   r  r   r   r   r   )r  r   r  r   r   r   )r  r   r  r   r   zdict[str, int]r   z
int | None)r   floatr   zOptional[ir.TemplateBuffer])r   zir.TemplateBuffer)r  list[BaseSchedulerNode]r   zJtuple[list[BaseSchedulerNode], BaseSchedulerNode, list[BaseSchedulerNode]])9rk   r   r   r   r   r   r   ru   r   r   r   r   r   r  r  r  r   r  r  r$  r  r  r?  rC  rZ   rH  r:   rN  rR  r\  r"   rM  r   rc  r   rj  rD   rm  rp  rr  rt  rv  rz  r|  r  r  r  r  r  r  r  r  r  r  r  staticmethodr   r   r_   r]   rL   rL      sj   BB(('' NN''

&B*2+#
!.7	
?>


=#2=HV=	=
(
GT">T	T
. H H B B 
 
 
 
.&;:@F 9=*$*15*	*X 
 

 
 

 
 


!
37
	
J!!J!37J!	J!X   >0 >0@
 1&1	S1 1r_   c                  B    e Zd ZU g dZded<   ded<   d
dZddZddZy	)	WhyNoFusename1name2reasonrz   r   r7  ztuple[Any, ...]rz   c                X    |j                         | _        |j                         | _        y rY   )rZ   r5  r6  r[   node1node2s      r]   r   zWhyNoFuse.__init__  s    ^^%
^^%
r_   c                J    || _         || _        t        j                  |        y rY   )r7  rz   
fusion_logdebug)r[   r7  rz   s      r]   __call__zWhyNoFuse.__call__  s    	r_   c                p    d| j                    d| j                   d| j                  | j                  z  z   S )Nzcannot fuse z with rf   r4  rc   s    r]   __str__zWhyNoFuse.__str__  s6    djj\

|2>KK$))#
 	
r_   Nr:  rL   r;  rL   r   r   )r7  r   rz   r   r   r   r   )rk   r   r   	__slots__r   r   r?  rA  r   r_   r]   r3  r3  z  s#     5IK
&

r_   r3  c                    t        | t        t        f      rt        | t              } t        j                  | d      }d|v rdt        j                  |d       S |S )Nkey   )rq   r       )	r~   r   setsortedr   pprintrn   textwraprq   )objrs   s     r]   rn   rn     sR    #
C()Sc"^^C*Fv~HOOFG4566Mr_   c                  0    e Zd ZddZddZddZd	dZeZy)
r   c                &    t        |g      | _        y rY   r2  r  s     r]   r   zOutputNode.__init__  s    ",cU"3r_   c                     yr5  r   rc   s    r]   rm  zOutputNode.is_reduction  rn  r_   c                     y)Nr   r   rc   s    r]   r|   z'OutputNode.get_inputs_that_alias_output  r   r_   c                     y)NOUTPUTr   rc   s    r]   rZ   zOutputNode.get_name  s    r_   N)r  r)   r   r   r   r   r   )rk   r   r   r   rm  r|   rZ   r   r   r_   r]   r   r     s    4 Hr_   r   c                    t        j                          j                  D ]N  }t        |t              r|j
                     j                         }|   j                         xx   dz  cc<   P d fdt        fd j                  D              }|r? j                  |z
   _         j                   j                  j                  |             yy)am  
    Prunes weakdeps intended for mutation ordering
    on an upstream fused node if after fusion there is another dependency
    on the fused upstream node, making the weakdep redundant

    In essence this enforces an ordering on fusions. As fusions occur, weakdeps will
    be incrementally removed, enabling other fusions, ensuring they are fused in order.
    r   c                    t        | t              rD| j                     j                         }|   j	                            dkD  }|   k(  }|xs |S y)Nr   F)r~   r*   rb   r^   rZ   )r  r7  is_redundantis_self_depr   name_to_dep_countrB  rQ   s       r]   r8  z+_prune_redundant_deps.<locals>.should_prune  sb    c7#!#((+<<>G,-?-H-Q-Q-STWXXL -W5=K.;.r_   c              3  4   K   | ]  } |      s|  y wrY   r   r:  s     r]   r  z(_prune_redundant_deps.<locals>.<genexpr>  s      ,s2Cr;  Nr<  )r  r   r   r~   r*   rb   r^   rZ   r   r   r   r=  )rQ   rB  r   r  r7  deps_to_prunerX  r8  s   ```   @@r]   rA  rA    s     '2&9&9&;&& K#w'!#((+<<>G09BBDEJEK

 
  .. M "&"9"9M"IT--::=IJ r_   c                  8     e Zd Zd fdZddZddZddZ xZS )r  c                    t         |   |       | j                  |       | j                  |j	                                y rY   superr   r   r   get_read_writesr[   rP   rQ   	__class__s      r]   r   z"ExternKernelSchedulerNode.__init__  5    #T"T1134r_   c                V    | j                          dt        | j                  dd        S )Nz.node.kernel = python_kernel_name)rZ   r   rQ   rc   s    r]   r   z)ExternKernelSchedulerNode.debug_str_extra  s*    --/"/'$))EY[_2`1abbr_   c                     yNTr   rc   s    r]   rt  z#ExternKernelSchedulerNode.is_extern  r  r_   c                    | j                   J t        | j                   d      xr | j                   j                         S )Nr|  )rQ   r   r|  rc   s    r]   r|  z*ExternKernelSchedulerNode.has_side_effects  s6    yy$$$tyy"45V$)):T:T:VVr_   rP   rO   rQ   r!  r   r   r   r   )rk   r   r   r   r   rt  r|  __classcell__ra  s   @r]   r  r    s    5
cWr_   r  c                        e Zd Zd fdZ xZS )r  c                    t         |   |       | j                  |       | j                  |j	                                y rY   r]  r`  s      r]   r   zNopKernelSchedulerNode.__init__  rb  r_   rh  )rk   r   r   r   ri  rj  s   @r]   r  r    s    5 5r_   r  c                  J    e Zd ZU ded<   ded<   	 	 	 	 	 	 d fdZ	 	 d	 	 	 	 	 ddZ	 	 d	 	 	 	 	 ddZ	 	 	 	 	 	 ddZdd	Zdd
Z		 	 	 	 	 	 d dZ
d!dZd"dZd#dZd#dZd#dZd$dZd%dZ	 	 	 	 d&dZd'dZ	 d(	 	 	 d)dZed*d       Zed*d       Zd+dZed,d       Z xZS )-rW  z tuple[Sequence[sympy.Expr], ...]_sizesr4   _bodyc                f    t         |   |       | j                  |       | j                          y rY   )r^  r   r   _compute_attrsr`  s      r]   r   zSchedulerNode.__init__  s,    
 	#T"r_   c                   t        | j                  t        j                  t        j                  f      sJ | j                  j                  ||      \  | _        | _        | j                  j                         }| j                  j                  |      j                  }| || j                        f| _        t        j                   xs t        |j                          }t        | j                  t        j                        r,| j#                  | j                  j%                  |             y | j#                  t'        j$                  | j                  g| j                  d|i       y )Nextra_indexing_constraintsrecompute_sizes_body_func)	normalizerv  )r~   rQ   r    ComputedBufferTemplateBuffersimplify_and_reorderrn  ro  get_device_or_errorrP   get_backendgroup_fnr   r   loop_ordering_after_fusionrD   rj   r   extract_read_writesr   )r[   rt  ru  ri  r|  should_normalizes         r]   rq  zSchedulerNode._compute_attrs  s3   
 $))b&7&79J9J%KLLL"&))"@"@'A&? #A #
TZ
 ..0>>--f5>>ht{{34
  &@@@ 
KKI
 E
 dii!2!23  		--8H-I   00JJ!%8Hr_   c                *    | j                  ||       y )Nrs  )rq  )r[   rt  ru  s      r]   recompute_size_and_bodyz%SchedulerNode.recompute_size_and_body  s    
 	'A&? 	 	
r_   c                b   t        d | j                  j                  D              }| j                  t	        j
                  | j                  g| j                  d|ij                  |             | j                  j                  |        |r!ddlm} |j                  j                          y y )Nc              3  N   K   | ]  }t        |t        t        f      s|  y wrY   )r~   r*   r)   r   s     r]   r  z5SchedulerNode.refresh_dependencies.<locals>.<genexpr>'  s#      0
ZgwEW5XC0
s   %%rv  r   SIMDScheduling)r   r   r   r   r   r~  ro  rn  r  pointwise_read_writesclear_cachecodegen.simdr  candidate_tilingscache_clear)r[   rv  need_clear_tiling_cache	fake_depsr  s        r]   refresh_dependenciesz"SchedulerNode.refresh_dependencies"  s    
 &0 0
++110
 &
	 	,,

![[4=i	"	
 	""..t4"4 ,,88: #r_   c                    | j                   j                  |      | _         | j                   j                  | _        | j	                  dd       y )NFTrv  r  )ro  reorder_iter_loopssizesrn  r  )r[   	new_orders     r]   apply_new_loop_orderz"SchedulerNode.apply_new_loop_order>  sA    ZZ22

 jj&&!!E4!Pr_   c                    | j                   j                         | _         | j                   j                  | _        | j	                  dd       y )NTFr  )ro  merge_loopsr  rn  r  rc   s    r]   r  zSchedulerNode.merge_loopsF  s<    ZZ++-
jj&& 	!!D%!Pr_   c                   d }| j                   d   }t        |      |j                  cxk(  r|j                  k(  rn n|j                  |      }|rPt        xj
                  dz  c_        t        j                  d| j                         |       | j                  |       y t        j                  d| j                                y )Nr   r   z"Reorder loops for %s with order %szEDon't reordering %s because we can not decide the suitable loop order)
rn  rp   num_varsdecide_loop_order_to_matchr!   num_loop_reorderingloop_ordering_logr>  rZ   r  )r[   r   r   r  
self_sizess        r]   r   z'SchedulerNode.reorder_loops_by_dep_pairR  s     	[[^
z?h//E93E3EE ;;IFI''1,'##4dmmoy %%i0##Wr_   c                $   | j                         }| d| j                  d    | d| j                  d    | d| j                   g}| j                  j	                         D ]  }t        |t              r|j                  }t        j                  j                  |      }t        |t        j                        rZ|j                  | dt        |j                                 t        | j                   t"              rR|j                  d| d       |j                  t%        j&                  | j                   j)                         d	             | j*                  J |j-                  | j/                                d
j1                  |      S )Nz.group.device = r   z.group.iteration = r   z	.sizes = z
_layout = zclass z_loop_body:rH  r   )rZ   r   rn  r   r  r~   r*   rb   rI   r   r  r    r  r  rn   rl   ro  r4   rL  rq   ru   rQ   r,  r   join)r[   rb   linesr  rb  r   s         r]   r   zSchedulerNode.debug_str_extraf  sK   }}f$TZZ]O4f'

17fIdkk]+

 ##446 	OCc7+88gg((2!#r'9'9:LLH:Z

8K7L!MN	O djj(+LL6${34LL)=)=)?HIyy$$$T//12yyr_   c                    | j                   S rY   )rn  rc   s    r]   r  zSchedulerNode.get_ranges|      {{r_   c                    t        | j                  t        j                  t        j                  f      sJ dt        | j                               t        | j                  j                               S Nztype(self.node)=)r~   rQ   r    rw  rx  rj   r   r   rc   s    r]   rm  zSchedulerNode.is_reduction  s[    $))b&7&79J9J%KL 	
tDII !	
L DII00233r_   c                L   t        | j                  t        j                  t        j                  f      sJ dt        | j                               t        | j                  t        j                        xr. t        | j                  j                  t        j                        S r  )r~   rQ   r    rw  rx  rj   r   	SplitScanrc   s    r]   rp  zSchedulerNode.is_split_scan  sy    $))b&7&79J9J%KL 	
tDII !	
L $))R%6%67 
JIINNBLL=
 	
r_   c                J    t        | j                  t        j                        S rY   r~   rQ   r    rx  rc   s    r]   rr  zSchedulerNode.is_template  s    $))R%6%677r_   c                f    t        | j                  t        j                        r| j                  S d S rY   r  rc   s    r]   r  zSchedulerNode.get_template_node  s$    &tyy"2C2CDtyyN$Nr_   c                f    | j                          | j                          | j                  |       y rY   )r  r  r  )r[   
index_varss     r]   runzSchedulerNode.run  s#    ""$Z r_   c                &   | j                   }t        t        t        |            t        t        t        |            k(  sJ t	        t        t        j                  j                  |      t        j                  j                  |                  }|S rY   )	rn  r  maprp   dictzipr"  r#  from_iterable)r[   r  r  
var_rangess       r]   ranges_from_index_varsz$SchedulerNode.ranges_from_index_vars  sp     3sE?#s3sJ+?'@@@@--j9--e4

 r_   c                   | j                  |      }	 t        j                  t        t        j                         |            5  t        j
                  j                  |       5   | j                  |  d d d        d d d        y # 1 sw Y   xY w# 1 sw Y   y xY w# t        $ r" t        j                  d| j                          w xY w)NzError in codegen for %s)r  rI   set_ops_handlerr9   get_ops_handlerr   set_current_nodero  r   r   fatalrQ   )r[   r  r  s      r]   r  zSchedulerNode.codegen  s    00<
	!!"213D3D3F
"ST())$/( 

J'	( ( ( ( ( (
  	II/;	sA   1B  B$B4B<B B	
BBB B +Cc                    |r| j                   nt        | j                         \  }}t        j                  | j                  |t
        j                  j                  gt        |      z  g      S )z\
        Get the memory dependencies in either the pointwise or the reduction axes.
        )hidden_args)	rn  reversedr   r~  ro  sympySZerorp   )r[   	pointwise
keep_sizesignore_sizess       r]   "pointwise_or_reduction_read_writesz0SchedulerNode.pointwise_or_reduction_read_writes  sT     3<4;;$++AV 
L//JJ
%'',,#lBS1S0T
 	
r_   c                &    | j                  d      S )zH
        Get the memory dependencies in the non-reduction axes.
        Tr  r  rc   s    r]   r  z#SchedulerNode.pointwise_read_writes  s    
 666FFr_   c                &    | j                  d      S )zD
        Get the memory dependencies in the reduction axes.
        Fr  r  rc   s    r]   reduction_read_writesz#SchedulerNode.reduction_read_writes  s    
 666GGr_   c                   | j                         ryt        d | j                         D              ryt        | j                  j
                        dk(  rt        |t        j                        rt        t        | j                  j
                              }t        |t        j                        sJ dt        |             |j                  |j                  k(  xr |j                  |j                  k(  S y)NFc              3  <   K   | ]  }|j                           y wrY   )rm   rQ  s     r]   r  z,SchedulerNode.can_inplace.<locals>.<genexpr>  s     ?Ss ?rL  r   ztype(write_dep)=)rr  r  r   rp   r   r   r~   r   r(   r  iterrj   indexsize)r[   ry  	write_deps      r]   rz  zSchedulerNode.can_inplace  s    ?D,<,<,>??t&&'1,l,,2
 T$"2"2"9"9:;Ii)?)?@WEUT)_DVBWW@>>Y__4X)..9XXr_   c                   t               }t        | j                  t              r| j                  j	                         D ]  }|j
                  dk(  s|j                  dk(  s#d|j                  v r|j                  d   dk(  s,t        |j                        dk(  s\|j                  d   dk(  so|j                  d|j                  v r|j                  d   n(t        |j                        dk\  r|j                  d	   nd
        |S )Ncall_methodstoremode
atomic_add   rG  rb      r   r   )r   r~   ro  r4   rM  r\   r  r   rp   rz   r*  )r[   buffers_store_as_atomic_addrQ   s      r]   _get_atomic_add_buffersz%SchedulerNode._get_atomic_add_buffers  s    7A|#djj(+

,,. GG},w.4;;.4;;v3F,3V		Na/DIIaLL4P 033!T[[0 F+.1$))n.Adiilr +*r_   )rP   rO   rQ   z+Union[ir.ComputedBuffer, ir.TemplateBuffer]r   r   NN)rt  z*Optional[tuple[dict[Any, Any], list[Any]]]ru  zOptional[Callable[..., Any]]r   r   )rv  r   r  r   r   r   )r  zSequence[int]r   r   r   r"  r   )r   Sequence[Sequence[sympy.Expr]]r   r/  )r  Sequence[sympy.Expr]r   r   )r  r  r   zdict[sympy.Expr, sympy.Expr])r  r  r   r   r,  )r  r   r   r   )r   r   r+  r'  )rk   r   r   r   r   rq  r  r  r  r  r   r   r  rm  rp  rr  r  r  r  r  r  r:   r  r  rz  r  ri  rj  s   @r]   rW  rW    sk   ,,O : 
	 RVBF$N $@ 
	D RVBF
$N
 $@
 
	
;;8<;	;8Q
Q!.7	( ,4
8O!
8	%
 !%	
	
	 	
 G G H H + +r_   rW  c           	     n     j                   } j                  t        j                  j	                  |D cg c]  }|j
                   c}             t         fdt        j                  |D cg c]  }|j                   c} D               j
                  j                  z
   _        y c c}w c c}w )Nc              3  Z   K   | ]"  }|j                   j                         vr| $ y wrY   rb   rR  )r  r  group_snodes     r]   r  z2refresh_group_node_dependencies.<locals>.<genexpr>  s.      
xx{;;== 
   (+)
r  r   r   
ReadWrites
merge_listr   r   unionr   r   )r  r  r  s   `  r]   refresh_group_node_dependenciesr    s     F**6+JaAMM+JK
 	 
!'')O1!*>*>)OP
 	

 
!
!
(
(	) " ,K *Ps   B-0B2rO   c                   t        | t        t        f      sJ || _        || _        d | _        t        j                  |D cg c]  }|j                  |j                   c} | _        t        |        t        d | j                  D              | _        t        d | j                  D              | _        | j                         D ci c]  }|j                         | c}| _        y c c}w c c}w )Nc              3  4   K   | ]  }|j                     y wrY   r   r  r  s     r]   r  z"init_group_node.<locals>.<genexpr>       HHr!  c              3  4   K   | ]  }|j                     y wrY   )r   r  s     r]   r  z"init_group_node.<locals>.<genexpr>  r  r!  )r~   r  GroupedSchedulerNoder  rP   rQ   r   r  r   r  r  r   r  r   r   rZ   r   )r  rP   r  r  r   s        r]   init_group_noder    s    
 k$68L#MNNNK%KK&,,%	A!)@!++	AK $K0H[5G5GHHKH[5G5GHHK'2'>'>'@# ##K 
B#s   C*C*	C/c                      e Zd ZU dZded<   e	 	 	 	 	 	 dd       Zedd       Z	 	 	 	 	 	 ddZ	d  fdZ
ed!d       Zd!d	Zed"d
       Zd#dZd!dZd!dZ	 	 	 	 	 	 d$ fdZed"d       Zed"d       Zd%dZd!dZed&d       Zed&d       Zed&d       Zed'd       Zd(dZed&d       Zd)dZd*dZd+dZd!dZ xZ S ),r  z
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be fused together. The way it does this is by maintaining
    its unmet dependencies as the union of its constituent nodes.
    r0  r  c           	        |j                   |j                   u sJ t        |t        t        f      sJ |j	                         rt        |t
              rt        |j                  t              sJ t        |j                  j                        dk(  sJ t        t        t        |j                  j                              t              sJ t        t        |j                  j                              j                  }|j                         D cg c]  }|j	                         s| }}t        |      dk(  sJ |d   }t        |j                  j                        dk(  sJ t        t        |j                  j                              }t        |t               sJ t#        t!        ||j$                  |j&                  |j(                  |j*                        g      |j                  _
        nt        |t        t        f      sJ t-        t/        j0                  |j                         |j                                     } | |j                   |      S c c}w )Nr   r   )rP   r~   rW  r  rr  r  rQ   r1   rp   r   r   r  r  r)   rb   rM  r(   r   r  	var_namesr  r  r   r"  r#  )	clsr:  r;  rb   rQ   template_nodesr  writer  s	            r]   fusezFusedSchedulerNode.fuse  s    %//111%-1C!DEEE:e5N#O ejj+666u((//0A555d4(9(9(@(@#ABGLLLU..5567<<D/4/@WtDDTDTDVdWNW~&!+++*1-M}00778A===m77>>?@EeY///'1ekk5??EJJ

(E$ em5G%HIIIY__U__%68IJK5??E**! Xs   I'Ic                    t        t        d d | j                         D                    }t        |      dk(  ry t	        |      }|S )Nc              3  |   K   | ]4  }|j                         s|j                         r|j                          6 y wrY   rr  rt  r  rK  s     r]   r  z4FusedSchedulerNode.estimate_flops.<locals>.<genexpr>E  6      '')T^^-= '')   :<r   r   filterrM  rp   r  r[   fpsrets      r]   r  z!FusedSchedulerNode.estimate_flops?  K      $ 0	
 s8q=#h
r_   c                   | j                         ry d }| j                  D ]`  }t        |t              sJ |;t	        |      t	        |j
                  d         k7  rt        j                  d        y |j
                  d   }b d }|J t        |      |j                  cxk(  r|j                  k(  rn n|j                  |      }|s%t        j                  d| j                                y t        xj                  dz  c_        t        j                  d| j                         |       | j                  D ]%  }t        |t              sJ |j                  |       ' t        |        y )Nr   z1Can not reorder fused node due to different sizeszODont reordering fused node %s because we can not decide the suitable loop orderr   z-Reorder loops for fused node %s with order %s)rr  r  r~   rW  tuplern  r  r>  rp   r  r  rZ   r!   r  r  r  )r[   r   r   r  snoder  s         r]   r   z,FusedSchedulerNode.reorder_loops_by_dep_pairQ  sI    
[[ 	)Ee]333%%
*;uU\\RS_?U*U!''G aJ	) 	%%%z?h//E93E3EE ;;IFI##a ##q(#;T]]_i	
 [[ 	2Ee]333&&y1	2 	(-r_   c                    t         |   |       t        | ||       g | _        t	        |d       j
                  | _        y )Nc                4    t        | j                               S rY   )r   rm  r  s    r]   r   z-FusedSchedulerNode.__init__.<locals>.<lambda>z  s    s1>>3C/D r_   rE  )r^  r   r  rV   r  r   r[   rP   r  ra  s      r]   r   zFusedSchedulerNode.__init__v  s8    #i0%'
%DEKK
r_   c                z    dj                  | j                  D cg c]  }|j                          c}      S c c}w N_r  r  rZ   r[   r  s     r]   rZ   zFusedSchedulerNode.get_name|  )    xxt{{;!;<<;   8c                <    | j                   d   j                         S Nr   r  rZ   rc   s    r]   rH  z!FusedSchedulerNode.get_first_name      {{1~&&((r_   c                |    t        j                  | j                  D cg c]  }|j                          c} S c c}w rY   r   r  r  rR  r  s     r]   rR  z#FusedSchedulerNode.get_buffer_names  .    !L1!"4"4"6!LMM!L   9c                j    g }| j                   D ]!  }|j                  |j                                # |S rY   r  r,  r   r[   rs   rQ   s      r]   r   zFusedSchedulerNode.get_outputs  4    (*KK 	.DMM$**,-	.r_   c           
     ~   t        | j                        D cg c]+  \  }}| j                          d| d|j                          - }}}| j                  d   j                  }||j                  | j                                t        j                  dj                  |      j                         d      S c c}}w )Nz.snodes[z] =
r   r   rH  )r  r  rZ   ru   rQ   r,  r   rL  rq   r  r   )r[   r  rQ   r  s       r]   r   z"FusedSchedulerNode.debug_str_extra  s     %T[[1
4 }}xs%0@/AB
 
 {{1~""LL3356tyy/668&AA
s   0B9c                h    | j                   D cg c]  }|j                          }}|  d| S c c}w )Nz
, snodes: )r  r   )r[   rQ   
snodes_strs      r]   r   z"FusedSchedulerNode.debug_str_short  s9    9=Ed**,E
Ez*.. Fs   /c                    t         |   ||       t               }t        | j                        D ]/  }|j                  ||       |j                  |j                         1 y rY   )r^  r  r   r  r  updater   )r[   r  r  rQ   ra  s       r]   r  z!FusedSchedulerNode.set_last_usage  s\    
 	24FG 0:|T[[) 	8D 35GH&&t7	8r_   c                |    t        j                  | j                  D cg c]  }|j                          c} S c c}w rY   )r   r  r  r$  r  s     r]   r$  z$FusedSchedulerNode.used_buffer_names  s.    !MA!"5"5"7!MNN!Mr  c                |    t        j                  | j                  D cg c]  }|j                          c} S c c}w rY   )r   r  r  r  r  s     r]   r  z/FusedSchedulerNode.used_or_aliased_buffer_names  s3    8<D1a,,.D
 	
Dr  c                    | j                   S rY   r  rc   s    r]   rM  zFusedSchedulerNode.get_nodes  r  r_   c                T    t        |       j                   d| j                          dS )Nz(nodes=r   r   rc   s    r]   r   zFusedSchedulerNode.__repr__  s'    t*%%&gdmmo->a@@r_   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wrY   )rm  r  s     r]   r  z2FusedSchedulerNode.is_reduction.<locals>.<genexpr>  s     91>>#9rL  r  r  rc   s    r]   rm  zFusedSchedulerNode.is_reduction  s    9T[[999r_   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wrY   )rp  r  s     r]   r  z3FusedSchedulerNode.is_split_scan.<locals>.<genexpr>  s     :1??$:rL  r   rc   s    r]   rp  z FusedSchedulerNode.is_split_scan  s    :dkk:::r_   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wrY   r  r  s     r]   r  z1FusedSchedulerNode.is_template.<locals>.<genexpr>  s     8q1==?8rL  r   rc   s    r]   rr  zFusedSchedulerNode.is_template  s    8DKK888r_   c                j    | j                   D ]$  }|j                         s|j                         c S  y rY   )r  rr  r  r[   rQ   s     r]   r  z$FusedSchedulerNode.get_template_node  s5    KK 	0D!--//	0 r_   c                     | j                   d   S r
  )r   rc   s    r]   r   zFusedSchedulerNode.get_device  s    zz!}r_   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wrY   )r  r  s     r]   r  z>FusedSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>  s     EA1--/ErL  r   rc   s    r]   r  z+FusedSchedulerNode.has_aliasing_or_mutation  s    EEEEr_   c                    t         rY   NotImplementedErrorr  s     r]   r  z'FusedSchedulerNode.update_mutated_names      !!r_   c                    t         rY   r+  )r[   rb   s     r]   r  zFusedSchedulerNode.add_fake_dep  r-  r_   c                    t         rY   r+  rx  s     r]   rz  zFusedSchedulerNode.can_inplace  r-  r_   c                P   | j                         }dj                  d | j                  D              }t               }|j	                  | dt        |       j                   d| d| dt        | j                  j                         d| dt        | j                         d| d	t        | j                  j                  | j                  z
         d| d
       |j                         5  | j                         D ]!  }|j	                  |j                                # 	 ddd       |j                  d       	 |j	                  | j!                                |j)                         j+                         S # 1 sw Y   XxY w# t"        $ r t$        j'                  dd       Y Lw xY w)r   rg   c              3  F   K   | ]  }t        |      j                    y wrY   )rj   rk   rX  s     r]   r  z/FusedSchedulerNode.debug_str.<locals>.<genexpr>  s     FQQ 0 0Fs   !rf   r   r   r   r   r   r   z.outputs = [
            Nrh   r   Tr   )rZ   r  r  rA   r   rj   rk   rn   r   r   r   r   rq   r   ru   ri   r   r   r   r   rr   r   )r[   rb   node_typestrr   r   s        r]   ru   zFusedSchedulerNode.debug_str  s   }}xxF$++FF

bd		Q|n -j))0012 3WT%<%<=> ?74#3#3#9#9D<S<S#STU V 	
 ZZ\ 	,'') ,

3==?+,	, 	c	HJJt++-.  ''))	, 	,  	HKK7$KG	Hs   )5E69F 6E? F%$F%r:  rL   r;  rL   r   r  r-  r"  rP   rO   r  r0  r   r   r   r'  r   zlist[SchedulerBuffer]r%  r*  r   r/  )r   torch.devicer#  )rb   r'   r   r   r+  )!rk   r   r   __doc__r   classmethodr  r:   r  r   r   rZ   rH  rR  r   r   r   r  r$  r  rM  r   rm  rp  rr  r  r   r  r  r  rz  ru   ri  rj  s   @r]   r  r    s    $#+%+.?+	+ +B  "#.!#..7#.	#.JL = =) N N	B/8#28HV8	8 O O 
 

A : : ; ; 9 9   F F
"""*r_   r  c                  N    e Zd ZU dZ	 	 	 	 ddZ	 	 	 	 ddZedd       Ze	 	 	 	 	 	 dd       Z	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZ	e	 	 	 	 dd       Z
e	 	 	 	 dd       ZeZd	ed
<   e	 	 	 	 dd       Ze	 	 	 	 dd       ZddZddZddZddZd dZd!dZ	 	 	 	 d"dZ xZS )#ForeachKernelSchedulerNodez
    This is a schedular node that consists of a set of scheduler nodes that
    has no data dependencies among them and can be executed in parallel.
    c                    |j                         D ]=  }|j                         | j                  v s | j                  |j                            c S  y rY   )r   rZ   read_to_node)r[   producerr   s      r]   get_consumer_subnode_forz3ForeachKernelSchedulerNode.get_consumer_subnode_for  sL     '') 	9C||~!2!22((88	9 r_   c                   t        t                  }|j                  j                  D ]  }|j                  | j
                  j                  vr&| j
                  j                  |j                     j                         }|| j                  v sf|j                  | j                  |           t        |      dk(  rt        t        |            S y Nr   )r   rL   r   r   rb   rP   r   r^   name_to_noder*  rp   r  r  )r[   consumer	producersrd	node_names        r]   get_producer_subnode_forz3ForeachKernelSchedulerNode.get_producer_subnode_for  s     013	&&,, 	<Bwwdnn88822277;LLNID---d//	:;	< y>QY((r_   c                   t        |      }j                         r|j                         rt        j                  t              t        j                  t        |      }t        j                        t        |j                        k(  }|s |d       |xr2 t        fdt        j                  |j                        D              S |j                         rkj                         r	 |d       yt        j                  t        |      }|j                        }||j                  j                  |      S  |d       yj                         rk|j                         r	 |d       yt        j                  t              j                  |      }|j                  j                  ||      S  |d       yt        d      )	Nzforeach do not have same lengthc              3  \   K   | ]#  \  }}j                   j                  ||       % y wrY   )rP   can_fuse)r  lrr=  s      r]   r  z6ForeachKernelSchedulerNode.can_fuse.<locals>.<genexpr>$  s0      )Aq ""++Aq1)s   ),zXcandidate producer is a reduction, foreach ops cannot be fused with reductions currentlyFz5candidate producer is not dep of any foreach consumerzXcandidate consumer is a reduction, foreach ops cannot be fused with reductions currentlyz5candidate consumer has no dep in any foreach producerzXAt least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node)r3  rv  typingcastr:  rp   r  r[  r  rm  r>  rP   rI  rF  r  )r  r=  rB  whyforeach_matchconsumer_subnodeproducer_subnodes    `     r]   rI  z#ForeachKernelSchedulerNode.can_fuse  s   (+ X%8%8%:{{#=xHH{{#=xHH0C4HHM 56  S )A) &    "$$&n {{#=xHH'@@J+))228=MNNGH  "$$&n {{#=xHH'@@J+))223CXNNGHf
 	
r_   c                
   |j                         s|j                         sJ |j                         r3t        j                  t        |      }|j                  }|j
                  }n2t        j                  t        |      }|j                  }|j
                  }d }d }|j                         r|j                         r|t        j                  t        |      }t        j                  t        |      }t        |j                  |j                        D cg c]  \  }}t        j                  ||       }	}}n/|j                         rt        j                  t        |      }|j                  |      }
g }	|}d }|j                  D ]A  }||
u r*t        j                  ||      }|}|	j                  |       1|	j                  |       C n|j                         rt        j                  t        |      }|j                  |      }g }	|}d }|j                  D ]A  }||u r*t        j                  ||      }|}|	j                  |       1|	j                  |       C nt        d       | |j                  |	||||      S c c}}w )NzTAt least one node passed to ForeachKernelSchedulerNode.fuse should be a foreach node)use_custom_partition_algoprev_node_1prev_node_2enable_autotune)rv  rL  rM  r:  rS  rV  r  r  r  r  rF  r  r>  r  rP   )r  r=  rB  rS  rV  rT  rU  rJ  rK  fused_nodesrQ  rQ   new_noderP  s                 r]   r  zForeachKernelSchedulerNode.fuseJ  sZ    ""$(;(;(=== {{#=xHH(0(J(J%&66O{{#=xHH(0(J(J%&66O X%8%8%:{{#=xHH{{#=xHH  AAq #''1-K    "{{#=xHH'@@JK"KK  -++166tXFH"*K&&x0&&t,-   "{{#=xHH'@@JK"KK  -++166xFH"*K&&x0&&t,- !f  &?##+
 	
Ks    I?c                    i  _         i  _        ||qt           ||       |D ]Z  }|j                  j
                  D ]  }| j                   |j                  <    |j                         D ]  }	| j                  |	<    \ n| _        | _	        d  _
        g  _         j                  t        j                  j                  |j                  |j                  g             t!         fdt!        j"                  |j$                  |j$                        D               j                  j&                  z
   _        t)        |j*                  |j*                  g       _        t-        |j.                  |j.                  g       _        |j1                         rt3        |t4              sJ ||}}
nt3        |t4              sJ ||}}
|
j6                   _         j6                  j9                  |j6                         |
j                   _        |j                         D ]  }	| j                  |	<     j                  D ci c]'  }|j:                  j=                         D ]  \  }}||
 ) c}}} _        | _        |d   jA                         }|sJ |tC        jD                  d      fff _#        t!        tH        jJ                  jL                             _'        | _(        y c c}}}w )Nc              3  Z   K   | ]"  }|j                   j                         vr| $ y wrY   r  r1  s     r]   r  z6ForeachKernelSchedulerNode.__init__.<locals>.<genexpr>  s0       xxt'<'<'>>	 r  r   combo_kernel))r<  rA  r^  r   r   r   rb   rN  rP   r  rQ   rV   r   r   r  r  r   r  r   r   r  r   r  r   rv  r~   r:  r   r  r   itemsrS  r   r  Exprr   r   fxNoder  rV  )r[   rP   r  rS  rT  rU  rV  rQ   r  rb   foreach_node
other_noder  r  vri  ra  s   `               r]   r   z#ForeachKernelSchedulerNode.__init__  s    +"5GY/ 3 ,,22 8D37D%%dii08 !446 3D.2D%%d+3	3 'DN DKDI)+DJ  ''22 ,,k.E.EF  )//#668V8V   ""))* # !+"7"79N9N!OPDN +"7"79N9N!OPDN%%'!+/IJJJ+6j!+/IJJJ+6j)33DNNN!!*"6"67 , 9 9D"668 5*4!!$'5 #'++@ @%:O:O:U:U:W@26!Q1@@D  *C&%%'v

> :<>?
!%((--02.@s   ,K&c           	        |D cg c]  }t        |t              s| }}|rSt        j                  dt	        |      |D cg c])  }|j
                  |j
                  j                         + c}       |D cg c]  }t        |t        t        f      s| }}|D cg c]  }t        |t              s| }}|rt        j                  dt	        |             |D cg c]  }t        |t              r| }}|D cg c]  }|j                         s| }}|r t        j                  dt	        |      |       |D cg c]	  }||vs| }}|S c c}w c c}w c c}w c c}w c c}w c c}w c c}w )Nz/ComboKernels: %d external nodes are filtered %sz+ComboKernels: %d foreach nodes are filteredz0ComboKernels: %d template nodes are filtered: %s)
r~   r  r   r>  rp   rQ   r  r  r:  rr  )r  r  r  externrQ   filtered_nodesforeach_nodesr  s           r]   combinable_nodesz+ForeachKernelSchedulerNode.combinable_nodes  st    #Oj4M&N!OOIIAF5;UTtyy?T&&(U 
a"8:S!TU 
 
 &
A7Q)RA
 
 IICSEWX%
Z;U-VA
 
 &4Gq}}!GGIIBN#
 &4Oq7N!OO9 P
 V




 H PsL   EEEE:EE#5E# E(6E( E-E-	E2E2c           
         | j                         }g }d}|D ];  }|j                  t        dt        |      |      D cg c]
  }||||z     c}       = |S c c}w )zS
        Returns a list of lists of nodes that are to be grouped together.
           r   )_topological_sort_nodesr,  rangerp   )rP   sorted_nodesgrouped_nodesmax_num_nodesr  r  s         r]   &_default_group_nodes_for_combo_kernelszAForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels  sw     !88:! 	E   #1c%j-@ !a-/0	 s   A
4Callable[[Scheduler], list[list[BaseSchedulerNode]]]!group_algorithm_for_combo_kernelsc                    | t         _        y rY   r:  rq  )custom_group_algorithms    r]   %set_group_algorithm_for_combo_kernelsz@ForeachKernelSchedulerNode.set_group_algorithm_for_combo_kernels  s    
 # 	#Dr_   c                ,    t         j                  |       S rY   rs  rP   s    r]   group_nodes_for_combo_kernelsz8ForeachKernelSchedulerNode.group_nodes_for_combo_kernels  s     *KKIVVr_   c                    t         rY   r+  rc   s    r]   r  z#ForeachKernelSchedulerNode.mark_run  r-  r_   c                    t         rY   r+  rc   s    r]   r  z"ForeachKernelSchedulerNode.codegen  r-  r_   c                     yrf  r   rc   s    r]   rv  z%ForeachKernelSchedulerNode.is_foreach!  r  r_   c                ,    t        | j                        S )zeReturns a list of nodes which comprise the combo kernel.
        These nodes may be vertically fused.)r   r  rc   s    r]   get_subkernel_nodesz.ForeachKernelSchedulerNode.get_subkernel_nodes$  s     DKK  r_   c                t    t        t        j                  j                  d | j                  D                    S )zqReturns all nodes contained in this kernel, unpacking fused nodes
        into their constituent scheduler nodes.c              3  <   K   | ]  }|j                           y wrY   )rM  r  s     r]   r  z7ForeachKernelSchedulerNode.get_nodes.<locals>.<genexpr>,  s     1UA!++-1UrL  )r   r"  r#  r  r  rc   s    r]   rM  z$ForeachKernelSchedulerNode.get_nodes)  s(     IOO111U1UUVVr_   c                <    | j                   d   j                         S r
  )r  rH  rc   s    r]   rH  z)ForeachKernelSchedulerNode.get_first_name.  s    {{1~,,..r_   c                    t        | || j                  j                         | j                  D ]  }|j	                  |        y rY   )rA  rP   r   r  rC  )r[   rB  rQ   s      r]   rC  z/ForeachKernelSchedulerNode.prune_redundant_deps1  s=     	d$68R8RSKK 	:D%%&89	:r_   )r=  rL   r   rR   )rB  rL   r   rR   r=  rL   rB  rL   r   r   )r=  rL   rB  rL   r   r:  )NNF)rP   rO   r  r0  rS  r   rT  rR   rU  rR   rV  r   r   r   r  r0  r   r0  )rP   rO   r   list[list[BaseSchedulerNode]])rt  rp  r   r   r   r   r   r0  r*  r   r(  )rk   r   r   r7  r>  rF  r8  rI  r  r   rg  r1  ro  rq  r   ru  rx  r  r  rv  r}  rM  rH  rC  ri  rj  s   @r]   r:  r:    s   
)	$)	$& ,
 ,
\ >
(>
4E>
	#>
 >
J 4837 %F/F/ (F/ $(	F/
 1F/ 1F/ F/ 
F/P +	  B 	& * 	/ & ( / 
 T
	
 
 WW	&W W
""!
W
/:">:	:r_   r:  c                       e Zd ZU dZded<   edd       Zd fdZddZddZ	e
dd       Zdd	Ze
dd
       ZddZe
dd       ZddZedd       Z xZS )r  aC  
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be *grouped* together (it does not allow another node to be scheduled
    in between its constituent nodes, nor does it allow another node to fuse into any of its constituent nodes).
    The way it does this is by maintaining its unmet dependencies as the union of its constituent nodes.
    Fusion will still happen among the nodes within each GroupedSchedulerNode.
    At codegen time, this scheduler node will be unpacked and codegen is called on each constituent node.
    r0  r  c                    |d   j                   t        fd|D              sJ  | |      }|D ]  }|j                  |j                         <   ! |j                  |j                         <   |S )Nr   c              3  :   K   | ]  }|j                   u   y wrY   rw  )r  rQ   rP   s     r]   r  z.GroupedSchedulerNode.create.<locals>.<genexpr>I  s     B44>>Y.B   )rP   r[  rB  rZ   )r  r  grouped_snoder  rP   s       @r]   createzGroupedSchedulerNode.createF  sy    1I''	B6BBBBIv. 	KE=JI(()9:	KAN	$$]%;%;%=>r_   c                >    t         |   |       t        | ||       y rY   )r^  r   r  r  s      r]   r   zGroupedSchedulerNode.__init__P  s    #i0r_   c                   | j                   D ])  }|| j                  j                  |j                         <   + | j                  j                  | j                         = | j                  j	                  | j                         S )z
        Do fusion among nodes within this GroupedSchedulerNode,
        and then unpack this GroupedSchedulerNode into regular nodes.
        )r  rP   rB  rZ   
fuse_nodes)r[   r  s     r]   unpackzGroupedSchedulerNode.unpackT  se    
 [[ 	HEBGDNN--enn.>?	HNN--dmmo>~~((55r_   c                    | j                  | j                  j                  |             | j                  j	                  |       y rY   )r   r   r  r   r*  )r[   fake_deps     r]   r  z!GroupedSchedulerNode.add_fake_dep^  s5    T--77AB##H-r_   c                z    dj                  | j                  D cg c]  }|j                          c}      S c c}w r  r  r  s     r]   rZ   zGroupedSchedulerNode.get_nameb  r  r  c                <    | j                   d   j                         S r
  r  rc   s    r]   rH  z#GroupedSchedulerNode.get_first_namef  r  r_   c                |    t        j                  | j                  D cg c]  }|j                          c} S c c}w rY   r  r  s     r]   rR  z%GroupedSchedulerNode.get_buffer_namesi  r  r  c                j    g }| j                   D ]!  }|j                  |j                                # |S rY   r  r  s      r]   r   z GroupedSchedulerNode.get_outputsm  r  r_   c                    t        t        d d | j                         D                    }t        |      dk(  ry t	        |      }|S )Nc              3  |   K   | ]4  }|j                         s|j                         r|j                          6 y wrY   r  rK  s     r]   r  z6GroupedSchedulerNode.estimate_flops.<locals>.<genexpr>y  r  r  r   r  r  s      r]   r  z#GroupedSchedulerNode.estimate_flopss  r  r_   c                    | j                   S rY   r  rc   s    r]   rM  zGroupedSchedulerNode.get_nodes  r  r_   c                     yr5  r   )r  r=  rB  s      r]   rI  zGroupedSchedulerNode.can_fuse  s     r_   )r  r0  r   r  r4  r  )r  r'   r   r   r   r'  r5  r-  r*  r  )rk   r   r   r7  r   r8  r  r   r  r  r:   rZ   rH  rR  r   r  rM  rI  ri  rj  s   @r]   r  r  :  s     $# 16. = =) N N  "  r_   r  c           
          t         j                  d fd       }t        t        t	        t         d                           }t        |      dkD  r|D cg c]  } |   	 c} t        j                  r|j                  |       |S c c}w )z
    A heuristic to decide loop iteration orders.  This has not been well
    tuned and may be something we should autotune.
    c                t   |    dk(  s|   dk(  rt        |    dk(  |   dk(        S D cg c]  }t        ||           }}D cg c]  }t        ||          }}t        d t        ||      D              }t        d t        ||      D              }||kD  ry||kD  ryt        ||       S c c}w c c}w )Nr   c              3  :   K   | ]  \  }}|d k(  xs ||k    ywr   Nr   r  sl_asl_bs      r]   r  z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>  )      
)3tDAI$$
   c              3  :   K   | ]  \  }}|d k(  xs ||k    ywr  r   r  s      r]   r  z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>  r  r  r  )r;   absr  r  )	abslstride_len_astride_len_ba_firstb_firstr  stride_lengthss	          r]   	index_cmpz"pick_loop_order.<locals>.index_cmp  s    8q=E!HMuQx1}eAh!m44 .<<rBqE
<<-;<rBqE
<<  
7:<7V
 
  
7:<7V
 
 WW 1ay# =<s   B0	B5r   rE  )r  r   r  r   r   r   )		functools
cmp_to_keyr   r  rk  rp   r   pick_loop_orderssort)r  r  priority_idxr  orderpis   ``    r]   pick_loop_orderr    s      4 %N1$5 6789E
<17CD.,D

y
!L Es   Bc                  T    e Zd ZU ded<   dZded<   dZded<   ddZddZdd	Zdd
Z	y)NodeUser$Union[BaseSchedulerNode, OutputNode]rQ   Fr   rz  is_weakc                v    t        | j                  j                         | j                  | j                  f      S rY   )ra   rQ   rZ   rz  r  rc   s    r]   rd   zNodeUser.__hash__  s+    TYY'')4+;+;T\\JKKr_   c                    t        |t              xrW | j                         |j                         k(  xr4 | j                  |j                  k(  xr | j                  |j                  k(  S rY   )r~   r  rZ   rz  r  r[   others     r]   __eq__zNodeUser.__eq__  s[    uh' .5>>#33.  E$5$55. -		
r_   c                6    | j                   j                         S rY   rw   rc   s    r]   rZ   zNodeUser.get_name  rx   r_   c                    | j                   |j                   u sJ t        | j                   | j                  xr |j                  | j                  xr |j                        S rY   )rQ   r  rz  r  r  s     r]   r   zNodeUser.merge  sP    yyEJJ&&&II2!2!2LL*U]]
 	
r_   Nr   )r  objectr   r   r   )r  r  r   r  )
rk   r   r   r   rz  r  rd   r  rZ   r   r   r_   r]   r  r    s3    
..K GTL
$
r_   r  c                      e Zd ZU dZded<   dJdZdJ fdZdKdZedLd       Z	e	j                  dMd       Z	dNd	ZdOd
ZdPdZdNdZdNdZdNdZ	 	 	 	 dQdZdRdZdSdZdNdZdNdZdQdZdNdZ	 	 	 	 dTdZ	 	 	 	 	 	 dUdZ	 	 	 	 	 	 dVdZdNdZdWdZ	 	 	 	 	 	 dXdZdYdZ	 	 	 	 dQdZdZd[dZ d\dZ!	 	 	 	 d]d Z"	 	 	 	 	 	 d^d!Z#	 	 	 	 	 	 d^d"Z$	 	 	 	 	 	 d^d#Z%	 	 	 	 	 	 	 	 d_d$Z&	 	 	 	 	 	 d`d%Z'dad&Z(	 	 	 	 	 	 	 	 dbd'Z)d^d(Z*	 	 	 	 	 	 d^d)Z+	 	 	 	 	 	 	 	 dcd*Z,ddd+Z-ded,Z.	 	 	 	 	 	 d`d-Z/	 	 	 	 dfd.Z0	 	 	 	 dgd/Z1dNd0Z2dNd1Z3dNd2Z4dhd3Z5did4Z6djd5Z7dkd6Z8	 	 	 	 	 	 dld7Z9dad8Z:	 	 dmd9Z;	 	 	 	 dnd:Z<	 	 	 	 	 	 dod;Z=	 	 	 	 	 	 dpd<Z>	 	 	 	 dqd=Z?	 	 	 	 dQd>Z@	 	 	 	 dQd?ZA	 	 	 	 dQd@ZB	 	 drdAZCdNdBZD	 	 	 	 	 	 dsdCZEdNdDZFd\dEZG	 	 	 	 dtdFZHdudGZIdvdHZJdNdIZK xZLS )wrO   z
    A Scheduler is a graph of BaseSchedulerNodes. It is responsible for
    optimizations such as fusion, reorder, and graph partition.
    zdict[Dep, int]_Scheduler__dep_size_hint_cachec                f    t        d      5  | j                  |       d d d        y # 1 sw Y   y xY w)NzScheduler.__init__)r   _initr[   r  s     r]   r   zScheduler.__init__  s,    ./ 	JJu	 	 	s   '0c           
         t                    i  _         t        j                  _        i  _        t        t               _	        t        j                          _        t                _        t        g t        j                  j                  j!                         t        j                  j"                  j!                         t        j                  j$                  j!                                _        |D cg c]  } j)                  |       c} _         j-                           j&                  j/                  t        j                  j"                  j!                                 j*                  D ]  }|j1                            j3                          _         j*                  D ci c]  }|j7                         | c} _         j*                  D ci c](  }|j;                         D ]  }|j7                         | * c}} _         j8                  j?                          _         i  _!        i  _"        tG        jH                   j*                   j<                   j@                         _         jK                           jM                   j*                         _         jO                           j*                  D ci c]  }|j7                         | c} _          jQ                          tR        xjT                  tW         j*                        z  c_*        ddl,m-}m.}  | j*                         tW         j*                         _/         ja                           jM                   j*                         _        t        tb        td        td        f              _3        th        jj                  $ti        jj                   j*                         _         jm                   j*                         _        th        jn                  $ti        jn                   j*                         _         jq                           js                          th        jt                  r jw                  d        th        jx                  rddl=m<}  | j*                   j<                   j@                  t        t        j                  j                  j!                               t        t        j                  j}                                      _        th        j~                  r$tG        j                   j*                         _         j                          t        j                  jh                  j                  r@ j                   j*                         _         j                   j*                         _         j                           | j*                         t        j                  j                   j*                          j                          t                _K        i  _L        t        d      j                   fd       y c c}w c c}w c c}}w c c}w )Nr   )log_ir_post_fusionlog_ir_pre_fusion)num_ck_nodesr   )reorder_for_peak_memorygraph_statsc                 ^     j                    j                  t         j                        dS )N)graph_idnum_nodes_before_fusionnum_nodes_after_fusion)post_grad_graph_idnum_orig_nodesrp   r  rc   s   r]   r   z!Scheduler._init.<locals>.<lambda>`  s'     33+/+>+>*-djj/ r_   )Or^  r   r  rI   r   rP   backendsr  _post_grad_graph_counterr  r"  count_graph_partition_counterr   r  r  keys	constantstorchbind_constantsr0  create_scheduler_noder  update_zero_dim_cpu_tensorr  r  get_donated_buffersr   rZ   rA  r   r   copyrB  r  mutation_renamesr   decide_global_ordering_of_commscompute_dependenciestopological_sort_scheduledead_node_eliminationcompute_ancestorsr!   ir_nodes_pre_fusionrp   torch._inductor.debugr  r  r  create_foreach_nodesr  r   logged_slow_fusionr   _pre_fusion_custom_passr  _post_fusion_custom_passr  finalize_multi_template_bufferscombo_kernelscreate_combo_kernel_nodesr  memoryget_output_names reorder_for_compute_comm_overlap$reorder_compute_and_comm_for_overlapprocess_grouped_nodesr   r   graph_partition&maybe_reorder_for_minimizing_partition,reorder_for_partition_with_simple_dependencycompute_last_usager>  graph_diagramdebug_draw_graphbuffer_names_to_freeorigin_to_indexr   add_row)	r[   r  rY  rQ   r   r  r  r  ra  s	   `       r]   r  zScheduler._init  s   %'" <>"&'?"@(1(9%5?\!&0%%**,""'') ,,113'
# >CCd003C
'')##**177+<+<+A+A+CDJJ 	DOO	 $$& 	# &*ZZ;
 !AJJL!O;
 -1JJ8
$($BRBRBT8
;>CLLNC8
8
 AE@Q@Q@V@V@X 35 13 ::JJ##

 	!!#33DJJ?
""$<@JJ"Gq1::<?"G ##s4::6#O$**%!$**o!!#33DJJ?
",U38_"="?))577

CDJ__TZZ0
**688DDJ,,.***= ))70

  ''177//44671773356DJ 22CCDJJODJ""$??!!11DDTZZPDJJJ4::VDJ!4::&	djj) 6@\! :<'//	
G D;
8
B #Hs   <Y1Y -YYc                   i }t         j                  j                  D ]d  }t        t         j                  j                  |   t        j
                        s9t        | t         j                  j                  |   d       ||<   f |S )N)rS   )rI   r   graph_inputs_originalr~   r    DonatedBufferr   )r[   name_to_donated_bufrb   s      r]   r  zScheduler.get_donated_buffersg  sp     GG11 	D!''77=r?O?OP,BGG11$7 $-#D)	 #"r_   c                6    t         j                  j                  S rY   rI   r   current_devicerc   s    r]   r  zScheduler.current_devicer  s    ww%%%r_   c                .    |t         j                  _        y rY   r   rh  s     r]   r  zScheduler.current_devicev  s    !'r_   c                    t         j                  j                  dd      dk(  rddlm}  || j
                  d       yy)z,Generate an image of the graph for debuggingINDUCTOR_WRITE_SCHEDULER_GRAPHN1r   )draw_buffersT)print_graph)osenvironr  r>  r  r  )r[   r  s     r]   r  zScheduler.debug_draw_graphz  s1    ::>>:DASH+6 Ir_   c                    t         j                  t        j                        r8t         j	                  d|       | j
                  D ]  }|j                           y y )Nz%s:)r   isEnabledForloggingINFOr   r  r   )r[   labelrQ   s      r]   debug_print_nodeszScheduler.debug_print_nodes  sF    GLL)HHUE"

 #  "# *r_   c                6   |j                         J d       |j                         rt        | |      S t        |t        j
                  t        j                  f      rt        | |      S t        |t        j                        rt        | |      S t        |      )Nz2All nodes passed to scheduling must have an origin)r  is_no_opr  r~   r    rw  rx  rW  ExternKernelr  r,  r&  s     r]   r  zScheduler.create_scheduler_node  s    !- 	
@	
- ==?)$55r00"2C2CDE t,,boo.,T488%d++r_   c                   t               }g }| j                  j                         }t        j                  j
                  j                         D ]  }|D cg c]%  }||v rt        | j                  |   t              s|' }}|s6|j                  |       |D cg c]  }| j                  |    }}t        j                  dkD  }t        | |d|      }|j                  |       |D ]  }|| j                  |<     | j                  D 	cg c]  }	|	j!                         |vs|	 c}	t#        |      z   | _        y c c}w c c}w c c}	w )Nr   FrS  rV  )r   rB  r  rI   r   listsr   r~   rA  r  r  r   combo_kernels_autotuner:  r  r  rZ   r   )
r[   removed_node_namesfe_nodeskept_node_namesnamesrb   r  rV  fe_noderQ   s
             r]   r  zScheduler.create_foreach_nodes  sN   .8l11668WW]]))+ 	8E "?*"4#4#4T#:<RS E  %%e,:?@$d''-@F@$;;a?O0*/ /	G OOG$ 807''-81	88 "ZZ
4==?BT+TD
N
5 A
s   *D<EE#Ec                    ! t        d      } G fddt        |         t        j                          j                  D ]  }|j                         D ]}  }|j                         }|j                         D ]X  }| v r=| v r9 |   } |   }||z   } j                         D ]  }	 |	   |u s |	   |u s| |	<    D| v r	 |    |<   Q |    |<   Z   d! fd!	 	 d	 	 	 	 	 	 	 	 	 d !fd}
i }t        j                  j                  j                         D ]  \  }}t        |t        j                        r|j                   D ]  }d||<   	 7t        |t"        j$                        sR|j'                         D cg c]  }t        |t        j                        s|! }}|D ]  }|j                   D ]  }d||<   	    j                  D ]  }t(        j+                  d|j,                         |j,                  J t/        |j,                  j1                         d 	      }|D ]6  }t        |t        j2                        sJ ||vs$|j                         ||<   8 t/        |j,                  j5                  d
      d 	      }|D ]d  }||v sJ | d|        ||   x} j6                  |   j                         D ]*  }|j9                  t;        |j                                      , f t=        |j>                  j@                        dk(  rGtC        tE        |j>                  j@                              x}rt        |tF              r|jH                  }nd}|j                         D ]  }t=        |jK                               dk  sJ |jK                         D ]  } !|      } |
||       |j9                  t;        ||              |   j                  D ]  }|j                         |j                         k(  r%t        |j,                  tL              sJ |j,                  jO                         D ]?  } !|      }|j9                  tQ        ||j                                       |
||d
       A    |j>                  jR                  D ]6  }t        |tP              r |
|jT                  ||jW                  |             8 |jY                   jZ                         |j                         D ]  }|jK                         D ]y  }|j                          jZ                   !|      <   |j                          jZ                  |<    j\                  j_                  ||       j\                  |j                         <   {   t        j                  ja                         D ]3  }t(        j+                  d|        |
|tc        t;        |                   5 t        j                  jd                  D ]  }|j5                  d
      D ]|  }||v sJ | d|j                                 ||   x}s) j6                  |   jO                         D ]4  }t(        j+                  d||        |
|tc        t;        |                   6 ~   jZ                  D ]  }|t        j                  j                  v rE |
|tc        t;        |                   t        j                  jf                  ji                  |       d|t        j                  jj                  v s |
|tc        t;        |                    tm        t        j                  j                  j                               D ci c]  \  }}||
 }}}t        j                  jf                  D cg c]  }||   	 c}t        j                  _7         j                  D ]C  }|j                         D ].  }|jq                   |j                            j                         0 E  jr                  D ]-  } jr                  |   jq                   |   j                         / yc c}w c c}}w c c}w )zi
        Create dependency edges between nodes, handling aliasing and
        mutation properly.
        Tc                  >    e Zd ZdZ	 	 d	 	 	 	 	 ddZddZd	 fdZy)
1Scheduler.compute_dependencies.<locals>.DedupListan  
            This data structure behaves like a list except it makes sure the
            elements remain unique.
            Normally one could use a OrderedSet/dict for this purpose however
            the list in question gets elements appended as it is being
            iterated over which means that we need to keep the list
            semantics.
            Nc                @    |xs g | _         |xs
 t               | _        y rY   )r\  r   
membership)r[   r\  r!  s      r]   r   z:Scheduler.compute_dependencies.<locals>.DedupList.__init__  s    
 #[b
","<
r_   c                    || j                   v ry | j                  j                  |       | j                   j                  |       y rY   )r!  r\  r  r*  )r[   	node_users     r]   r  z8Scheduler.compute_dependencies.<locals>.DedupList.append  s5    /

!!),##I.r_   c                    t        j                  | j                  |j                        }| j                  |j                  D cg c]  }|| j                  vs| c}z   } ||      S c c}w rY   )r   r  r!  r\  )r[   r  new_membershipr  	new_items	DedupLists        r]   __add__z9Scheduler.compute_dependencies.<locals>.DedupList.__add__  sc    !+!1!1$//5CSCS!T JJ${{*at.FA* 	 !N;;*s   A+A+r  )r\  zOptional[list[T]]r!  zOptional[OrderedSet[T]]r   r   )r#  r  r   r   )r  DedupList[T]r   r)  )rk   r   r   r7  r   r  r(  )r'  s   r]   r'  r    s;     ,06:=(= 4= 	=/<r_   r'  c                N    | j                   v r j                   |          S | S rY   )r  )rY  r   r[   s    r]   r   z.Scheduler.compute_dependencies.<locals>.rename  s,    D)))d33A677Hr_   c                P     |          j                  t        |||             y rY   )r  r  )used_by_namer  rz  r  name_to_usersr   s       r]   add_userz0Scheduler.compute_dependencies.<locals>.add_user  s)     &./66K9r_   Nzscheduling %sc                    | j                   S rY   r  r   s    r]   r   z0Scheduler.compute_dependencies.<locals>.<lambda>!	  s
    AFF r_   rE  T)unbacked_onlyc                    | j                   S rY   r  r   s    r]   r   z0Scheduler.compute_dependencies.<locals>.<lambda>,	  s    RSRXRX r_   z not in r   )r  )mutating_buf)r  zscheduling output %sz+scheduling output %s for unbacked symint %s)rY  r   r   r   )FF)
r,  r   r  r  rz  r   r  r   r   r   ):r   r   r  r   r  r   rZ   rm   r  rI   r   r  r\  r~   r  r]  r   r    	TensorBoxr   r   r>  rQ   rJ  get_unbacked_symbol_defsSymbolget_free_symbol_usesrA  r  r)   rp   r   r   r  r  r(   r  ro   rL   rR  r*   r   rb   rz  r  r  r  r  r  r   graph_outputsmutated_inputsr*  r  r  mutated_input_idxsr   r   )"r[   r  rQ   buf1	buf1_name	buf2_namelist1list2combinedrF  r.  unbacked_symbol_to_origin_noderb   valfsr  sym_sizeunbacked_symbol_defsunbacked_symbol_usesrK  r   r  	node_modealt_namert   
other_namer  rb  r   r  	inp_namesr'  r-  r   s"   `                              @@@r]   r  zScheduler.compute_dependencies  sP    CL	<
 	<> @K?V?V@
 JJ 	LD((* L MMO	!%!1!1!3 LI M1i=6P -i 8 -i 8#(5=#0#5#5#7 >C -c 2e ;#0#5#>5=c 2> #m33@3Ki03@3Ki0LL	L(	 !&!			;	 	 		
 	 MO&
 --335 
	BID##uzz*** >B9=226>C. (+||~S!Auzz9RASS! BAnn B=A6r:BB
	B JJ J	DIIotyy1 99(((#)		224:J$  * H!!U\\222 ::8<215H $*		..T.BHX$  * C:: c"@!AB: 8::AG#003??A C))'#,,.*ABCC D$$++,1 d&6&6&=&=!>??S?sI.HH	 	 '') E3,,./1444 # 1 1 3 EH%h/HXt,%%ghY&GH -h 7 = = E==?dmmo=$)$))5FGGG*.))*D*D*F EJ)/
);J -- '
 P %ZtDEEEE, ((.. F!$0TYYd.>.>t.DEF %%d&;&;< '')  # 1 1 3 H>AllnD))&*:;69llnD))(3//33HhG ++CLLN;IJ	Z 002 	>HII,h7Xz'(*;<=	>
 77(( 
	JC--D-A 	J:: c"@"E"E"G!HI: 7q9919$($5$5a$8$I$I$K J		I8UV !:gh6G+HI	J	J
	J )) 	:Dqww+++z'$-89&&**40***z'$-89	: ,5QWW5I5I5N5N5P+Q
'E4D%K
	 
 )*(>(>&
 $IdO&
"
 JJ 	CD'') CmCLLN;AABC	C // 	SD''-77d8K8Q8QR	Sq TX
&
s   a"a7a#ac                Z  	 g }t        | j                        D ]  }dd	d}|j                         D ]  }t        	fd|j                  D              }|r\t
        j                  d|j                                t        j                  j                  j                  |j                                d} |j                          xr | }|s|j                  |       t
        j                  d|j                                t        j                  j                  j                  |j                                |j                  j                   D ]  }|j"                  | j$                  v s| j$                  |j"                     j                  }|D cg c]0  }|j&                  j                         |j                         k7  s/|2 c}| j$                  |j"                     _          t)        t        |            | _        | j                  D ]  }|j+                           yc c}w )	z0
        Remove any nodes without users
        c                r    | j                   xs* | j                         t        j                  j                  v S rY   )r  rZ   rI   r   r6  )rt   s    r]   can_eliminate_userz;Scheduler.dead_node_elimination.<locals>.can_eliminate_user	  s&    ||Tt}}!'':T:T'TTr_   Fc              3  .   K   | ]  } |        y wrY   r   )r  urL  s     r]   r  z2Scheduler.dead_node_elimination.<locals>.<genexpr>	  s     #Ma$6q$9#M   zremoved dead buffer: %sTzremoved dead operation: %sN)rt   r  r   r   )r  r  r   r[  rV   r   r>  rZ   rI   r   r  r*  r|  r  r6  r   r   rb   r   rQ   r   r?  )
r[   updated_nodesrQ   active_buffersr   can_eliminater  rV   rN  rL  s
            @r]   r  zScheduler.dead_node_elimination	  s    TZZ( 	DU #N'') * ##M399#M M II7HGG++//?%)N* !% 5 5 77N<NM $$T* 		6H**..t}}? ,,22 DyyD$4$44 $ 0 0 ; A A',="#0AT]]_0TA=((39-	8 (=12
 JJ 	#D  "	#=s   0H(H(c                    t        t                  t               g dfd|D ]  }|j                         D ]  }||<   	  |D ]
  } |        S )z?
        Ensure nodes is in topologically sorted order
        c                    | vrdj                  |        t        | j                  d       D ]&  }|j                  vr |j                            ( j	                  |        y y )Nc                    | j                   S rY   r  )ds    r]   r   zDScheduler.topological_sort_schedule.<locals>.visit.<locals>.<lambda>	  s
    aff r_   rE  )r*  rJ  r   rb   r  )rY  r  rA  rs   seenvisits     r]   rX  z2Scheduler.topological_sort_schedule.<locals>.visit	  se    }!!"6"6<LM 2Cxx|3 ,sxx01	2
 a  r_   )rY  rL   r   r   )r   rL   r  rR  )r[   r  rQ   rb   rA  rs   rW  rX  s       @@@@r]   r  z#Scheduler.topological_sort_schedule	  sy     +,.59V*,	! 	!  	*D--/ *%)T"*	*  	D$K	r_   c                2    t               }t        |t        t        t        t
        f      r-|j                  D ]  }|j                  |j                          nt        dt        |       d       fd|D        }t        t         fd|D                    S )Nz+get_unmet_dep_nodes is not implemented for .c              3  X   K   | ]!  }j                   |   j                          # y wrY   )r   r^   r1  s     r]   r  z1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>	  s%     Xc))#.??AXs   '*c              3  <   K   | ]  }j                   |     y wrY   rB  )r  rY  r[   s     r]   r  z1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>	  s     Qat66q9Qs   )r   r~   rW  r  r  r  r   r*  rb   RuntimeErrorrj   r   )r[   r  
unmet_depsr  unmet_dep_opss   `    r]   _get_unmet_dep_nodeszScheduler._get_unmet_dep_nodes	  s    &0l
)&"	
 // )sxx() =d5k]!L  YZXJQ=QQRRr_   c                z   g }t         j                  | j                  d      }i }| j                  D ]P  }| j                  |      }t	        |      ||<   |D ]*  }|j                  |g       }|j                  |       |||<   , R |j                         D 	cg c]  \  }}	|	dk(  s| }
}}	|
rx|j                  |
       |
D ]7  }|j                  |g       D ]  }||xx   dz  cc<    |j                  |       9 |j                         D 	cg c]  \  }}	|	dk(  s| }
}}	|
rx|rJ d       |S c c}	}w c c}	}w )zU
        Sort nodes by their topological order, return a list of node lists.
        r   r   zTopological sort failed!)	r  fromkeysr  ra  rp   r  r  r\  r)  )r[   r  r  childrenrQ   r-  r  crY  rb  zero_deg_nodesrt   s               r]   rj  z!Scheduler._topological_sort_nodes	  sF    djj!,#%JJ 	"D,,T2Dd)E$K "LLb) !"	" ).@1a!@@LL(# $LLB/ %D$K1$K%		! -2KKMDDAqQ!VaDND  444y A Es   D1%D1D7D7c                j   i }| j                   D ]w  }t               }|j                  D ]B  }| j                  |j                     j                         }|j                  |       |||   z  }D |||j                         <   ||_        y t        | j                         D ]  \  }}||_
        ||_         y)z.
        Populate each node.ancestors
        N)r  r   r   r   rb   r^   r*  rZ   r   r  r   r   )r[   name_to_ancestorsrQ   r   r  dep_node_namer  s          r]   r  zScheduler.compute_ancestors
  s    
 9;JJ 	'D)3I.. > $ 0 0 : K K Mm,.}==	> 2;dmmo.&DN	' %TZZ0 	#KE4"DN"DN	#r_   c                H   | j                   D ]  }t        j                  st        |t        t
        f      r#|j                         st        j                  dk7  rN|j                         D ]3  }t        |t              r|j                         r$|j                          5  y )Nhalide)r  r   r}  r~   rW  r  rD   cpu_backendrM  rr  r  )r[   rQ   r  s      r]   r  zScheduler.merge_loops
  s    JJ 	$D44 d]4F$GHKKMf&8&8H&D) $!%75;L;L;N!!#$	$r_   c                p   t        ddd      5  t        d      D ]  }t        |      }t        j	                  d|dz   |       | j                  |      }t        |      }t        j	                  d|dz   ||       ||k(  s|dk(  sjt        j	                  d|dz           n |cd	d	d	       S # 1 sw Y   y	xY w)
zB
        Combine eligible nodes into FusedSchedulerNodes.
        zScheduler.fused_nodesT)log_pt2_compile_eventlog_waitcounter
   z/===== attempting fusion (%d/10): %d nodes =====r   z=completed fusion round (%d/10): fused %d nodes into %d nodes
z+===== fusion complete (%d iterations) =====N)r   rk  rp   r=  r>  fuse_nodes_once)r[   r  r  old_lennew_lens        r]   r  zScheduler.fuse_nodes1
  s     #4QU
 	 2Y e*  EE
 ,,U3e*  TE	 g%A$$Eq1u '( /	 	 	s   A5B,B,,B5c                    g }| j                   D ]4  }|j                  t        |t              r|j	                         n|g       6 || _         y)zA
        Unpack GroupedSchedulerNode into regular nodes.
        N)r  r,  r~   r  r  )r[   	new_nodesrQ   s      r]   r  zScheduler.process_grouped_nodesN
  sJ     .0	JJ 	D!+D2F!GdV	 
r_   c                    t        |      dkD  sJ |d   j                         }|| _        | j                  |      }t	        ddd      5  |j                  |      cddd       S # 1 sw Y   yxY w)
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        r   benchmark_fused_nodesTcompile_time_autotune_time_us)rn  dynamo_compile_column_usN)rp   r   r  r{  r   rx  )r[   r  ri  backends       r]   rx  zScheduler.benchmark_fused_nodesY
  st     5zA~~q$$&$""6*#"&%D
 	8
 007	8 	8 	8s   
A%%A.c                    t        |      dkD  sJ |d   j                         }|| _        | j                  |      }t	        d      5  |j                  ||      cddd       S # 1 sw Y   yxY w)rw  r   rx  N)rp   r   r  r{  r   generate_kernel_code_from_nodes)r[   r  benchmark_kernelri  r{  s        r]   r}  z)Scheduler.generate_kernel_code_from_nodesk
  sq     5zA~~q$$&$""6*12 	T::5BRS	T 	T 	Ts   A##A,c                    || _         | j                  |      }t        d      5  |j                  |      cddd       S # 1 sw Y   yxY w)rw  rx  N)r  r{  r   benchmark_codegened_module)r[   moduleri  r{  s       r]   r  z$Scheduler.benchmark_codegened_moduley
  sH     %""6*12 	>55f=	> 	> 	>s	   ?Ac                D   	 	 	 	 	 	 dd}t        | j                        D ]  \  }}t        |t              st        |j                  t
        j                        s=|j                  }t        j                  j                  s|j                         \  }}nt        d |j                  D              }t        |t        j                  j
                  j                        r|j                  j!                  |       |j#                         }|j$                  }t        |t
        j&                        sJ |j$                  }	t        |	t
        j(                        sJ |j*                  |	_         |||	       | j-                  |	      }
|
| j                  |<   |
| j.                  |j1                         <   |
| j2                  |j1                         <   i t5        j6                  |j8                  j:                  |j<                        D ]:  }| j>                  jA                  |jB                  d      x}s,|jB                  |<   < dfd} ||
j<                        |
_         ||
j8                  j:                        |
j8                  _        tE        |
jG                         |jG                               D ]3  \  }}|| jH                  |j1                         <   |jJ                  |_%        5 |jL                  |
_&        |jN                  |
_'        |jP                  |
_(         y)a  
        Finalize a backing choice for MultiTemplateBuffers which did not already have a
        choice finalized through fusion. In the case of an extern choice, this will result
        in replacing the SchedulerNode.

        If a MultiTemplateBuffer did not have any fusion opportunities, finalizing a choice
        will force completion of compilation and benchmarking.
        c                   |j                         }| j                         }t        |t              rt        |t              sJ |j                         }| j                         }t        |t              rt        |t              sJ t        j
                  j                  |= ||_        t        j
                  j                  |= ||_	        t        j
                  j                  j                  |       }t        j
                  j                  j                  |       |t        j
                  j                  |<   |t        j
                  j                  |<   t        j
                  j                  j                  |       }t        j
                  j                  j                  |       |t        j
                  j                  |<   |t        j
                  j                  |<   y rY   )rZ   r~   r   rE  rI   r   r+  rb   
name_to_opoperation_namebuffersr  remove
operations)	orig_noderX  replaced_buf_nameorig_buf_namereplaced_op_nameorig_op_nameorigs          r]   replace_operation_bufferzKScheduler.finalize_multi_template_buffers.<locals>.replace_operation_buffer
  sW    !) 1 1 3%..0MmS1jARTW6XXX'::<$779LlC0Z@PRU5VVV&&'89)HM""#34&2H#77??((3DGGOO""8,$,AGGOOD!4<AGG""=177%%++I6DGG%%h/'/AGGt$/7AGG|,r_   c              3  |   K   | ]4  }t        |t        j                  j                  j                        r| 6 y wrY   )r~   r   r   select_algorithmExternKernelCaller)r  timings     r]   r  z<Scheduler.finalize_multi_template_buffers.<locals>.<genexpr>
  s6       &) & % @ @ S S  #r  Nc                ,    t        fd| D              S )Nc              3  @   K   | ]  }|j                          y wrY   )r   )r  r  r  s     r]   r  zQScheduler.finalize_multi_template_buffers.<locals>.rename_deps.<locals>.<genexpr>
  s     %Sscjj1A&B%Sr  r   )r-  r  s    r]   rename_depsz>Scheduler.finalize_multi_template_buffers.<locals>.rename_deps
  s    %%Sd%SSSr_   )r  zir.MultiTemplateBufferrX  zir.OperationBufferr   r   )r-  r   r   r   ))r  r  r~   rW  rQ   r    MultiTemplateBufferr   test_configs%force_extern_kernel_in_multi_templateget_min_choicer  choice_timingsr   r   TritonTemplateCallerBasefinalize_as_triton_calleroutput_noder   
StorageBoxOperationBufferrl   r  rA  rZ   rB  r"  r#  r   r   r   r  r  rb   r  r   r   rV   r   r   r   )r[   r  r  rQ   
multi_nodemin_node_unfusedr  out_tensorboxout_storage
out_buffernew_scheduler_noder  	real_namer  new_outold_outr  s                   @r]   r  z)Scheduler.finalize_multi_template_buffers
  s   	8-	89K	8	86 !, B	@GAt$.:		2114 "YY
**PP*4*C*C*E'$a'+*4*C*C	($ $OO&&?? II778HI 0 < < >+00!+r}}===(--
!*b.@.@AAA$.$5$5
!(Z@%)%?%?
%K" 2

15G!!$--/2;M''8 $& $??$$**D,C,C ?C %)$;$;$?$?$$OOyO69hh(3	?T 9D&999"5 8C&22888"..4 ),&224d6F6F6H) 2$GW <CD$$W%5%5%78$+MMGM	2 04~~",/3~~",04"-EB	@r_   c                &    t        d |D              S )Nc              3     K   | ]q  }t        |j                  d       xrU |j                  duxrE t        |j                  j                  d      xr# |j                  j                  j                  dk(   s yw)r   Nscatter_moder  )r   rQ   r   r  rX  s     r]   r  z,Scheduler._any_atomic_add.<locals>.<genexpr>
  so      

 	 AFFF# 9d"9^49 ((L89
s   A7A9)r  r[   	node_lists     r]   _any_atomic_addzScheduler._any_atomic_add
  s     

 
 
 	
r_   c           	         t        d fD              }t        j                  s|syj                         r(t	        j                         t        j                        r j                         sj                         ryj                         }|d   j                         sJ j                  dk(  ryj                         }t        t        j                  ||            } j                  |      ryddlm t%              |d   j                         J dfdt&        j(                  j*                  j-                         	 	 	 	 d fd}|rt        d	 fD              rj                         durj                         nj                         t	        t        j.                        sJ j0                  }j3                         \  }	r j5                  |      n j5                  |      \  }
g d}t7        |j9                         t;        j<                  d
            D ]  \  }}t	        |t&        j(                  j                  j>                        s5s&tA        |d      r|jB                  jB                  k7  r]|z   k\  r nQ|d
z  }|t        jD                  kD  r n7jG                  |      5  jI                  |g ||             ddd        tK              dk(  ryd fd}|S  ||       ||       ||      d fd}|S # 1 sw Y   xY w)
        If config.benchmark_fusion is False, always return True.
        Otherwise, return True if fusion can brings speedup.
        c              3     K   | ]>  }|j                         xr( t        |j                         t        j                         @ y wrY   )rr  r~   r  r    r  rX  s     r]   r  z.Scheduler.speedup_by_fusion.<locals>.<genexpr>
  sE       
  MMO J1..0"2H2HIJ 
s   AATr   rg  CompilationErrorNc           
     t   t         j                  t        j                        r| ||z   k  rFt         j	                  dj                         j                         t        ||z   | z  d             y t         j	                  dj                         j                         t        | ||z   z  d             y y )Nz9can fuse (benchmark): fusing %s with %s cause %sx speedup.3fz=cannot fuse (benchmark): fusing %s with %s cause %sx slowdown)r=  r  r  DEBUGr>  rR  r7   r8   )ms_fusedms1ms2r:  r;  s      r]   
log_fusionz/Scheduler.speedup_by_fusion.<locals>.log_fusion(  s    &&w}}5cCi'$$S..0..0"sSyH&<S%AC	 $$W..0..0 Hc	$:3#?A	 6r_   c                    j                  | d      }t        j                  |      }j                         sd }||fS j	                  d|      }t        |t              sJ ||fS )NT)r~  triton_)kernel_namesource_code)r}  r   loaduse_process_pooltritonr~   r   )r  src_codemodfutasync_compiler[   s       r]   compile_kernelz3Scheduler.speedup_by_fusion.<locals>.compile_kernel;  s     ;; < H ""8,C 113
 : $**yh*W!#|444:r_   c              3  @   K   | ]  }|j                         d u  y wrY   r  rX  s     r]   r  z.Scheduler.speedup_by_fusion.<locals>.<genexpr>J  s#      %
23A!-%
s   r   rE  allowed_prologue_inpsFc            	        t        d      } d }i }D ]V  \  }}}	 ||j                          j                  |      5  j                  |	      \  }}|||<   || k  r|} |}d d d        X  |        | z   k  r|j                  |       |_        yy# t        $ rQ}t        j	                  t
        j                        r$t        j                  d
sdndt        |             Y d }~d }~ww xY w# 1 sw Y   xY w)NinfzException in compiling %s: %sr  r  TF)r.  rs   r   r=  r  r  r  r>  r   swap_as_triton_callerr  r  _choice_timings)min_ms_fusedms_fused_choicenew_timingschoicefuture	mod_fusedr  r  pathri  epilogue_fusionfuture_choicesr  r  r  r  r[   s            r]   benchmark_when_readyz9Scheduler.speedup_by_fusion.<locals>.benchmark_when_ready  s$   $U|"& 1? 5-FFI!!-"MMO $99&A 5)-)H)H%v*$ /7F+#l2+3L.4O5 550 <c239-/2M88I1<J. 1 % !%227==A&,, ?2A
z #A
 !!5 5s#   B$C7	C4#AC//C47D 	c                    ddl m}  	 d   d   d   fD ]  }||j                           j                  d   
      \  t	        j
                        r	 d       yj                  d   
      \  t	        j
                        r	 d       yj                  d   
      \  t	        j
                        r	 d       y        t        d      rWz   k\  rOfj                  vr?j                  j                  f       t        d      j                  fd	       z   k  S # | $ r Y y	$ r}d
t        |      v rY d }~y d }~ww xY w)Nr   )NoTritonConfigsErrorr   z%register spilling of the first kernelFz&register spilling of the second kernelz%register spilling of the fused kernelslow_fusionc            	     $      z   z  dS )N)kernel1_pathkernel1_latencykernel2_pathkernel2_latencyfused_kernel_pathfused_kernel_latencyslow_down_ratior   )r  r  r  path1path2
path_fuseds   r]   r   zKScheduler.speedup_by_fusion.<locals>.benchmark_when_ready.<locals>.<lambda>  s(    053605365?8@3;sSy3I% r_   Loop-carried variableT))torch._inductor.runtime.triton_heuristicsr  rs   r  mathisinfr   r  r*  r   r  r   )r  r  r  r  r  r  r  r  r  r  ri  future_and_mod_l1future_and_mod_l1_fusedfuture_and_mod_l2r  r[   rN  s      @@@@@@r]   r  z9Scheduler.speedup_by_fusion.<locals>.benchmark_when_ready  s   ; *!,)!,/2  )
 ?JJL) "&!@!@)!,f"JC zz#CD$!%!@!@)!,f"JC zz#DE$+/+J+J/2F,(Hj zz(+CD$xc2 0>$c	1"EN$2I2II//33UENC(7?? 
 $cCi//+ ! ' .#a&8#s<   E AE +5E !5E A3E E.E.E)(E))E.)r  r.  r  r.  r  r.  r   r   )r  r  r   z)tuple[Optional[LambdaFuture], ModuleType]r   )&r  r   benchmark_fusionrr  r~   r  r    TritonTemplateBufferrv  rM  r   rj   r   r"  r#  r  triton.compiler.errorsr  r3  r   r   r  AsyncCompiler  r  r  rx  rJ  r\  operator
itemgetterr  r   r   max_epilogue_benchmarked_choicesr  r  rp   )r[   r:  r;  is_multi_templatenode_list_1node_list_2node_list_fusedr  r  r  r  triton_choicesr  unfused_timer  r  r  ri  r  r  r  r  r  r  r  r  r  rN  s   ```            @@@@@@@@@@@@@r]   speedup_by_fusionzScheduler.speedup_by_fusion
  sV       
 U^ 
 

 &&/@ u668":Q:QR!! oo'Q**,v ;;%oo'y{KHI
 0;u% #..0!!!	" 55BBD	.	6	  %
8=u~%
 "
 $557tCO # ''),,. 
 j"*@*@AAA (66N..0FAs # **;7//< C TVNN(.$$&H,?,?,B) V$ "&%//*<*<*U*UV ((?@44
8X8XX39,!#!F$K$KK55f= V"))6*TN?4S*TUV V3V8 >"a'%! %!N (' !/{ ; .{ ;&4_&E#@ @D ('oV Vs   M		M	c                <    | j                   |j                            S )z0Look up the node in Scheduler name_to_fused_node)rB  rH  r&  s     r]   r  zScheduler.get_fused_node  s    &&t':':'<==r_   c                    t        |      t        j                  t        j                        r@t        j                  d       D ]&  }t        j                  d|j                                ( i 	 	 	 	 	 	 d fd	 	 	 	 	 	 d fd} j                  |      D ]  \  }} |||        j                  |      } j                  |      } j                  ||      sD j                  ||      rW j                  ||      }t        |      r|||f|<   |||f|<   |s ||        t               }j                         D ]j  \  }}	}
||v r|j                  |        j                  |	      |	u sJ  j                  |
      |
u sJ  |       sO j                  |	|
      rb |	|
       l t        d       } j!                  |      } j#                  |       |S )	a  
        Combine eligible nodes into FusedSchedulerNodes.

        This relies on two key functions to control the logic:
            - self.can_fuse(): checks if a fusion is legal
            - self.score_fusion(): assigns priority to a given fusion
        zfuse_nodes_once, candidates:z  %sc                   t         j                  d| j                         |j                                | j                         }|j                         |k(  sJ j	                  |      j                  | |      }j                  |        j                  |       j                  |       j                  j                  |j                         D ci c]  }|j                         | c}       |S c c}w )Nzfusing %s with %s)r=  r>  rZ   r   r{  r  r  r*  rB  r  rM  )r:  r;  ri  node3rY  rW  r[   s        r]   fuse_two_nodesz1Scheduler.fuse_nodes_once.<locals>.fuse_two_nodes  s     0%..2BENNDTU%%'F##%///$$V,11%?Eu%u%OOE"##**.3oo.?@u$@ L As   C6c                   j                  |       v sj                  |      v rj                  j                  |       j                  j                  |      d             }|J |\  }}}j                  |d        j                  |d        j                  |      |u sJ j                  |      |u sJ  |       rj                  | |      r ||       j                  |       v rωj                  |      v ry y rY   )r  r  r)  will_fusion_create_cycle)	r:  r;  pending_fusion
is_speedup	node_key1	node_key2r  pending_fusionsr[   s	         r]   resolve_pending_fusionsz:Scheduler.fuse_nodes_once.<locals>.resolve_pending_fusions  s    ##E*o=&&u-@!0!4!4''.#''(;(;E(BDI" &1113A0
Iy##It4##It4**95BBB**95BBB!|t'D'DUE'Ry)4' ##E*o=&&u-@r_   c                    | j                   S rY   r  r   s    r]   r   z+Scheduler.fuse_nodes_once.<locals>.<lambda>\  s
    !++ r_   rE  )r:  rL   r;  rL   r   rL   rB  )r   r=  r  r  r  r>  r   get_possible_fusionsr  rI  r  r  callabler   r*  rJ  r  rC  )r[   r  rQ   r  r:  r;  speedupseen_pair_speedup_fnis_speedup_fnr  r  r  rW  r  s   `          @@@r]   rq  zScheduler.fuse_nodes_once  s'    !'""7==1;<# A  )=)=)?@A  	
	$	->		 	5$	5->	5	52 !55e< 	-LE5 $E51''.E''.E}}UE*43P3Pu4 00>G$.5ue-DOE*.5ue-DOE*ue,)	-, @J|3B3I3I3K 	5/M9i 44 $$]3&&y1Y>>>&&y1Y>>>t'D'D9( y)4	5 {(=>..u5!!%(r_   c                   t        | j                        }d}t        | j                        }t        j	                  d|       t        t        j                  |             D ]  \  }}t        j                  |      }t        |      dk  r+|||kD  r n| j                  |      st        j	                  d|       \|dz  }t        j                  dkD  }t        |d   j                  |d|      }t        j                  d	t        |      |       |D ]  }	|j                  |	        |j                  |       | j                   j#                  |j%                         D 
ci c]  }
|
j'                         | c}
       ! t)        |d
       | _        | j+                  | j                        | _        t        j                  d||t        | j                               | j-                  | j                         yc c}
w )z'
        Groups parallel nodes
        r   z2ComboKernels: Generating with num_ck_nodes = %s...r  Nz)ComboKernels: Not speeding up %d-th groupr   Tr  z0ComboKernels: Combining %d nodes for %d-th groupc                    | j                   S rY   r  r   s    r]   r   z5Scheduler.create_combo_kernel_nodes.<locals>.<lambda>  s
    q{{ r_   rE  zDGenerated ComboKernel nodes: %d ComboKernels, totally %d -> %d nodes)r   r  rp   r   r>  r  r:  rx  rg  speedup_by_combo_kernelr   r  rP   r   r  r*  rB  r  rM  rZ   rJ  r  rC  )r[   r  rW  r  num_nodes_orignumr  rV  r  rQ   rY  s              r]   r  z#Scheduler.create_combo_kernel_nodesa  s    !,TZZ		FU'&DDTJ
 	NC 3CCINI9~!'EL,@//	:		EsKQJE$;;a?O4!&&*. /	K HHBI
 " )""4()OOK(##**4?4I4I4KLq{*L7	< K-BC
33DJJ?
R

O		
 	!!$**- Ms   !G=
c                H    |D ]  }|j                  | j                          y rY   )rC  rB  )r[   r  rQ   s      r]   rC  zScheduler.prune_redundant_deps  s%     	?D%%d&=&=>	?r_   c                   	
 g 	t        t        t        t        f             
d	
 fd}t        j                  t
              }|D ]=  } j                  |      r|j                         D ]  }||   j                  |        ? |j                         D ]
  } ||        t        j                  rat        j                  t
              }|D ]&  }t        |dd      }|s||   j                  |       ( |j                         D ]
  } ||         j                  	      		j                   j                  d       t         j#                  dt%        	             	S )z^
        Helper to find all legal fusion opportunities, sorted by self.score_fusion()
        c                x   t        |       D ]  \  }}| |dz   |dz   t        j                  z    D ]  }||f}|v rj                  |       j	                  ||      rj                  |       A|j                         s|j                         sbj	                  ||      suj                  ||f         y r@  )r  r   )max_fusion_buffer_group_pairwise_attemptsr*  rI  r  rr  rv  )r  node1_indexr:  r;  rF  possible_fusionsrW  r[   s        r]   check_all_pairsz7Scheduler.get_possible_fusions.<locals>.check_all_pairs  s    &/&6 @"U"!Ok'FF'G @E
 !%.Cd{ HHSM}}UE2(//4++-1A1A1CuJ )//?!@@r_   r   NT)rF  reversezfound %d possible fusionsr  r0  r   r   )r   r  rL   r  r   r   unfusable_noder$  r  r   r   aggressive_fusionr   *get_possible_fusions_with_highest_priorityr  score_fusion_keyr=  r>  rp   )r[   r  r  buffer_names_groupingrQ   r   node_groupinggroup_groupingr   r  rW  s   `        @@r]   r	  zScheduler.get_possible_fusions  sh    % 13D DEFH	@( !, 7 7 = 	8D""4(--/ 8%c*11$78	8
 399; 	+MM*	+ ##(44T:N 7gt4"5)0067 "0!6!6!8 /./  JJ
 	$"7"7F4c:J6KLr_   c                    t        t                  d fd|j                         j                  j	                         |j                         j                  j	                         z  |j
                  j                  j	                         |j
                  j                  j	                         z  z
  t         fdD              }|r t        ||      d       |S )z~
        Finds whether there's a path from node1 to node2 (or vice-versa)
        caused indirectly by other fusions.
        c                   t        | t              rq| vrmj                  |        | j                         j	                        ryt        | j                  z        xs" t        fd| j                  z
  D              S y)NFc              3  H   K   | ]  } j                   |           y wrY   r]  r  rY  
found_pathr[   s     r]   r  zIScheduler.will_fusion_create_cycle.<locals>.found_path.<locals>.<genexpr>  s+      H #4#:#:1#=>H   ")r~   r  r*  rN  issubsetr   r   r  )rQ   combined_ancestorscombined_namesr'  r[   visiteds    r]   r'  z6Scheduler.will_fusion_create_cycle.<locals>.found_path  s    $ 23G8KD!++-667IJ !   ?@ C H!%2D!DH E  r_   c              3  H   K   | ]  } j                   |           y wrY   r]  r&  s     r]   r  z5Scheduler.will_fusion_create_cycle.<locals>.<genexpr>  s!     WqJt66q9:Wr(  zwill create cyclerQ   rL   r   r   )r   r  rN  _dictr  r   r  r3  )r[   r:  r;  cycler*  r+  r'  r,  s   `   @@@@r]   r  z"Scheduler.will_fusion_create_cycle  s     /02	 	2 %%'--224'')//4467 	
 OO!!&&(5??+@+@+E+E+GG WDVWW#IeU#$78r_   c                    ddl m 	 	 	 	 d fd} ||      } ||      }t        fd|D              }t        fd|D              }|j                  |      }d}	|D ]  }
	 |	t	        |
d         z  }	  j                  ||      }t        j                  j                  j                  |	d	|z        ry
y# t
        $ r Y  yw xY w)a  
        Return true if fusing the two nodes can potentially increasing peak memory.

        The implementation is more like a heuristic since we don't really know if we are at peak
        or not when trying to fuse these two nodes. The order of nodes may change later which makes the
        peak memory estimation hard.

        Here is how we decide the LOWER BOUND of extra memory allocation if we fuse these 2 nodes:
        1. find all buffers read by each node with a single user. These buffers are supposed to
           be reused if we don't fuses these 2 nodes
        2. find the intersection of these buffers for the two node and sum the total buffer size.
           If we don't fuse these two nodes, we can at lease avoid this much memory allocation.
           Note that the extra memory allocation is not necessarily causing peak memory increase.
           This is just a heuristic.

        We return true only if the saving for fusion can not trade off the extra memory allocation.
        r   )buffer_reuse_keyc                0   g }| j                   j                  D ]y  }j                  j                  |j                        }|s+t        |j                        dk(  sD|j                  j                         s_|j                  |j                         { |S r@  )
r   r   r   r  rb   rp   rV   rQ   has_tensor_outputr  )rQ   r   rD  r   r[   s       r]   _find_single_user_inputszKScheduler.can_fusion_increase_peak_memory.<locals>._find_single_user_inputs  sw     F&&,, ,&&**27733syy>Q.3883M3M3OMM#((+, Mr_   c              3  .   K   | ]  } |        y wrY   r   r  r   r2  s     r]   r  z<Scheduler.can_fusion_increase_peak_memory.<locals>.<genexpr>       #Sc$4S$9#SrO  c              3  .   K   | ]  } |        y wrY   r   r7  s     r]   r  z<Scheduler.can_fusion_increase_peak_memory.<locals>.<genexpr>  r8  rO  r   r  F    T)rQ   rL   r   zlist[ir.Buffer])r  r2  r   intersectionr   r  score_fusion_memoryrI   r   r  statically_known_gt)r[   r:  r;  r5  lhs_dep_nodesrhs_dep_nodeslhs_reuse_keysrhs_reuse_keyscommon_reuse_keysmemory_overheadrF  	bw_savingr2  s   `           @r]   can_fusion_increase_peak_memoryz)Scheduler.can_fusion_increase_peak_memory  s    * 	6	#		 1707##S]#SS##S]#SS*77G$ 	C3s1v;.	 ,,UE:	 77//iP  s   $B88	CCc                    t        t        |j                  |j                  z
        t        |j                  |j                  z
              }|dkD  S )aB  
        This function prevents fusion for nodes that can increase memory
        footprint. This problem is more common in horizontal fusion, where nodes
        that are far apart in the original order get fused, lengthening the live
        intervals of tensors. This is very evident in models with activation
        checkpointing, where the recomputed nodes from different checkpointed
        regions get fused and significantly increase the memory footprint.

        The current attempt is a quick, possibly hacky, heuristic to prevent the
        fusion of nodes that are far away in the original order.

        A better but difficult to implement heurisitic would be to use live
        intervals of the buffers, find region of peak pressure in the original
        program and prevent fusion that crosses that peak region. We might need
        special care or good approximation in this implementation, as fusion of
        node changes live intervals, and re-computing live intervals and peak
        memory after each fusion can introduce large compilation overhead.
        @   )r  r  r   r   )r[   r:  r;  proximity_scores       r]   are_long_distant_nodesz Scheduler.are_long_distant_nodes2  sE    * %//12%//12
 ##r_   c                   i }|j                   j                         D ci c]  }|j                  | }}|j                   j                         D ci c]  }|j                  | }}|D ]}  }t        j                  j                  |      }	||   }
||   }t        |
t              rt        |t              sdt        |
       dt        |       ||<   k|
j                         |j                         k7  r(d|
j                          d|j                          ||<   t        |
j                        t        |j                        k7  rd||<   |
j                         }|j                         }||k7  rd| d| ||<   |
j                         |j                         k(  rd|
 d| ||<   Ed}t        |	t        j                        sd|	j                    }d	|
 d| d
| ||<    t#        |      S c c}w c c}w )z}
        Try to decide reasons why fusion fail due to no shared memory even though
        there are common buffers.
        znot MemoryDep: z v.s. zdifferent numel: 	broadcastzdifferent offset: zMismatch loop orders: r   zLayout: zUnknown reason: z. )r   r  rb   rI   r   r  r~   r(   rj   	get_numelrH   r  
get_offsetnormalize_with_stride_orderr    r  rl   r   )r[   r:  r;  common_buf_namesreasonsr  node1_name2depnode2_name2deprb  r   lhs_deprhs_deplhs_offrhs_off
layout_strs                  r]   decide_fusion_fail_reasonz#Scheduler.decide_fusion_fail_reasonM  s    383D3D3U3U3WXC#((C-XX383D3D3U3U3WXC#((C-XX( ,	H''$$X.C$X.G$X.Ggy1GY9W%d7m_F4=/J !   "g&7&7&99'(9(9(;'<F7CTCTCVBWX !  W\\*mGLL.II$/!((*G((*G'! '9	y$Q! 3356689 '=WIVG9$U! Jc2#5#56'

|4
"7)6'"ZLI HU,	\ 7|c YXs   G5G:c                   t         j                  rt        d ||fD              ry|j                  j	                         }|j                  j	                         }||z  }|sy|j                  j                         D ci c]  }|j                  | }}|j                  j                         D ci c]  }|j                  | }}g }	|D ]y  }
||
   }||
   }|j                         |j                         k(  s/|	j                  t        j                  j                  j                  |j                         d      ||f       { t        |	      dk(  ryt        |	t!        j"                  d            \  }}}t%        |t&              rt%        |t&              sy|j(                  |j(                  k7  r3|j+                         |j+                         k(  r| j-                  |      S y|j/                         s|j1                  ||       nV|j/                         s|j1                  ||       n3t2        j5                  d|j7                         |j7                                | j9                  ||      S c c}w c c}w )z
        Right now just greedily reorder the loop of node1 to be compatible with node2,
        but ideally we should have some heuristics to reorder the loop for node2
        to be compatible with node1 if that's more efficient.
        c              3  <   K   | ]  }|j                           y wrY   )rj  rX  s     r]   r  z>Scheduler.shared_data_after_reordering_loop.<locals>.<genexpr>  s      8
AHHJ8
rL  r   r  rE  z?Don't reorder loops since both nodes are reductions: %s v.s. %s)r   r}  r  r   buffer_namesr  rb   rN  r  rI   r   r  r  rL  rp   r  r  r  r~   r(   r  rv  dep_size_hintrm  r   r  r>  rZ   r<  )r[   r:  r;  node1_buffer_namesnode2_buffer_namescommon_buffer_namesr  rQ  rR  
candidatesbuffer_namerS  rT  _numels                 r]   !shared_data_after_reordering_loopz+Scheduler.shared_data_after_reordering_loop  sL    00C 8
!&8
 5
 "..;;="..;;=03EE"383D3D3U3U3WXC#((C-XX383D3D3U3U3WXC#((C-XX 
. 	K$[1G$[1G3356689 !!((2273D3D3FQR2S	 z?a $'zx7J7J17M#N '9-Z5Sw///
   "g&7&7&99))'22 !!#++GW=##%++GW=##Q   ''u55e YXs   >I10I6c                    t        |t        t        f      xr) |j                          xr t	        |j
                         S )z>
        Is this node unfusable under any conditions.
        )r~   r  r  rr  rF   rQ   r&  s     r]   r  zScheduler.unfusable_node  sD    
 t79OPQ C$$&&C7		BB	
r_   c                   |j                         t        j                  j                  k  ry|j	                         }|j                         }d}|||z  kD  r	 |d       yt        d |j                         D              }|t        j                  j                  j                  j                  fk(  r	 |d       yd	d} ||j                         j                        r|j                         s	 |d       yy)
zT
        Heuristics to avoid benchmarking predictably slow prologue fusions
        T皙?z@prologue fusion will not increase amount of bytes read in kernelFc              3     K   | ]J  }|j                   <|j                   j                         D ]  }|j                  dk(  r|j                   ! L y w)Ncall_function)rQ   r  r\   r  )r  rY  r  s      r]   r  zEScheduler.check_prologue_fusion_heuristics_fusable.<locals>.<genexpr>   sT      
vv!VV'')	
 tt&	 HH

s   AAz\prologue fusion will not increase attempt to fuse in padding bc it increases unaligned readsc                <    | j                   dk  xr | j                  S )Nr  )itemsizeis_floating_point)r  s    r]   low_prec_fpzGScheduler.check_prologue_fusion_heuristics_fusable.<locals>.low_prec_fp  s    >>Q&B5+B+BBr_   zVprologue fusion that must be upcast to fp32 not profitable for low precision templates)r  ztorch.dtyper   r   )rN  rI   r   invoke_quant_opsr  r  r  rM  r   opsatenconstant_pad_nddefaultr  r  r\  )	r[   prologue_noder  rN  
read_byteswrite_bytesBYTES_THRESHOLD_MULTIPLIERr  rl  s	            r]   (check_prologue_fusion_heuristics_fusablez2Scheduler.check_prologue_fusion_heuristics_fusable  s     ,,.!''2J2JJ"88:
#::< &)"'AABRS  
",,.
 
 uyy~~55==??n 	C @@BHHI!>>@h r_   c                   ||u ryt        ||      }|j                         r0| j                  |j                               j	                  ||      ryt        |t              st        |t              r	 |d       yt        |t        t        f      r|j                         s	 |d       yt        |t        t        f      r|j                         s	 |d       y|j                         |j                  z  r	 |d       y|j                         r!t        j                  s	 |d       y|j                         s|j                         r	 |d       y|j                         }t        |t        j                         s	 |d	       y|j#                         }t%        d
 |j&                  D              |z
  }|j)                         |z  r	 |d       y|j+                         s|j+                         r	 |d       y|j-                         dd D ]B  }|j/                         }|D ]+  }	t1        fd|	j2                  D              r" |d         y D t        |t4              s|gn*|j6                  D 
cg c]  }
|
j                         s|
 c}
}t9        |      dk(  sJ |d   }t9        d   j:                        dk(  rSt9        d   j:                  d   j2                        dk(  r+d   j:                  d   j2                  d   j<                  |u s	 |d       y| j?                  |||      sy|j                         r9|j+                         s |j                         st        j@                  s	 |d       y|j)                         tB        jD                  jF                  z  s+|j)                         tB        jD                  jF                  z  r	 |d       y|j                         }|j                         }||k7  r |d||       y~| jI                  ||      }|t        jJ                  k  r"t        jL                  r| jO                  ||      }tP        jS                  tT        jV                        r4tP        jY                  d|j[                         |j[                         |       tB        j\                  j_                  | |||      sy|j                         |j                  z  rY| ja                  ||      xrE tB        j\                  ja                  | |||      xr! | j                  |      ja                  ||      S tB        j\                  jc                  | |||      xr! | j                  |      jc                  ||      S c c}
w )zj
        Determine if it is possible to combine node1 and node2 into a
        single fused node.
        FTz/grouped node must not be fused with other nodesznode1 is extern or nopznode2 is extern or nopznode1 must go before node2zprologue fusion turned offz2prologue fusion only supported for pointwise nodesz2prologue fusion only supported for TritonTemplatesc              3  <   K   | ]  }|j                           y wrY   rG  )r  inps     r]   r  z%Scheduler.can_fuse.<locals>.<genexpr>R  s     Ec3<<>ErL  z;prologue fusion not implemented for kernel for these inputsz:template prologue can only fuse functional pointwise nodesNr  c              3  :   K   | ]  }|j                   v   y wrY   r  )r  rt   prologue_nodess     r]   r  z%Scheduler.can_fuse.<locals>.<genexpr>b  s     QttyyN:Qr  z7template prologue can only fuse nodes with a single user   r   zEtemplate prologue can only fuse nodes with a single use into templateztemplate epilogue not satisfiedz#fusion for buffer explicit disabledzdevice mismatch (%s vs %s)z%s and %s has %s shared data)2r3  rr  r{  r   can_fuse_multi_outputs_templater~   r  r  r  rN  r   r   prologue_fusionrm  r  r    r  get_allowed_prologue_inpsr   inputsrR  r  rM  r   r[  rV   r  r  rp   r   rQ   rv  r  rI   r   no_fuse_buffer_namesr<  score_fusion_memory_thresholdr}  rc  r  r  r  r  r>  rZ   choicesrI  can_fuse_verticalcan_fuse_horizontal)r[   r:  r;  rN  r  r  unsupported_prologue_argsrQ   	node_outsr   rY  template_snodestemplate_snoderi  device2shared_data_scorer{  s                   @r]   rI  zScheduler.can_fuse  s
    E>u%4#3#3$

)
)%
7$8 e12j'7
 ABu8:PQR%%'()u8:PQR%%'()$$&8,-))01!!#u'8'8':HI779Hh(?(?@HI$,$F$F$H! EX__EE'( &
 %%'*CCQR--/53Q3Q3SPQ"__.N&s+ % ,,.	$ %CQsyyQQUV$%% "%);< !&AAaA 
 '1,,,,Q/N N2&../14r*2215;;<A"2&..q177:??>Q[ @@sS**,!!#))12""$qww'C'CC""$qww'C'CC56!!#""$W,fg> 44UEB D DD11 $ F Fue T))'--8##.  !	 yy!!$u6GH$$&8 &&ue4 MII//eUDUVM$$V,>>ueL 9900eU$5 M""6*>>ueLMC Bs   5VVc                   |j                         }t        ||      }t        t              }|j                  D ]j  }| j
                  j                  |j                  |j                        }t        |t              r| j                  |||      rW||   j                  |       l |j                  j                  D ]  }t        |t              s|j                  | j
                  j                  |j                  |j                              }	|	sV|	D ]&  }
| j                  |
|      s|	j!                  |
       (  t#        d t$        j&                  j)                  |j+                               D              }||z  r	 |d       y|j-                         }|D ]E  }| j.                  |   j1                         }|| j2                  |   j4                  z  s= |d        y y)a  
        Check if it is legal to fuse a consumer (node2) into a producer (node1).

        We can fuse them if all the reads of node2 either match
        corresponding writes in node1, or are written by nodes that can
        be scheduled before the fusion of node1 and node2.
        c              3  4   K   | ]  }|j                     y wrY   r  r   s     r]   r  z.Scheduler.can_fuse_vertical.<locals>.<genexpr>  s      $
 HH$
r!  zmemory deps did not matchFz(intermediate nodes between node1 & node2T)rR  r3  r   r   r   r  r  rb   r~   r*   fusable_weak_depr  r   r   r(   fusable_read_and_writer  r   r"  r#  r  r   rN  r   r^   rB  r   )r[   r:  r;  node1_buf_namesrN  remaining_deps_by_namer  rb   cd	remainingrD  remaining_depsnode1_op_namesr7  s                 r]   r  zScheduler.can_fuse_vertical  s     002u%7B47H++ 	5C((,,SXXsxx@D#w'D,A,A#ue,T"4(//4		5 ##** 		-Bb),.22%%))"''277;I # -B222r:!((,-		- $ $
 445K5R5R5TU$
 

 O+
 +,224" 	D&&t,==?G 7 7 @ J JJ>?		 r_   c                    |j                   |j                         vry|j                  j                  D cg c]  }|j                   |j                  k(  r| }}t        |      dk7  ry|d   t        t              sJ t        j                  t        j                        ry| j                  |j                     }|j                  j                  D cg c]  }|j                   |k(  s| }}t        fd|D              S c c}w c c}w )NFr   r   c              3     K   | ]q  }t        |t              xr[ t        |j                  t        j
                         xr4 |j                  j                  k(  xr |j                  j                  k(   s y wrY   )r~   r(   r   r  r   TMPr  )r  r  r  s     r]   r  z-Scheduler.fusable_weak_dep.<locals>.<genexpr>  sm      

 	 tY' ('

DHH==(

ekk)( 		UZZ'(
s   A7A:)rb   rR  r   r   r2  rp   r~   r(   r   r  r   r  r  r   r[  )	r[   weak_depr:  r;  r  mutating_writesr  r  relevant_readss	       `    r]   r  zScheduler.fusable_weak_dep  s
    == 6 6 88 **11
zzX222 
 

 1$"%+++u{{DHH5++H,A,AB	"..44
		Y8ND
 
  

 '
 
 	
#

s   "DD,Dc                   t        |t              rH| j                  j                  |j                  |j                        }||j                  k7  sHt        |j                  t        j                        s$t        |j                  t        j                        ryt        j                  r9|j                  |j                  k7  r |j                         }|j                         }|j                  |j                  k(  xr\ t        |j                        t        |j                        k\  xr/ |j                  d t        |j                         |j                  k(  S t        |t              r| j                  j                  |j                  |j                        }| j                  j                  |j                  |j                        }|j                   |j                   k(  r|j                   ||k(  ryyr   )r~   r(   r  r  rb   r   r  r   r  r   r}  r  rv  rp   r  r)   r  )r[   r  r  	read_name
write_names        r]   r  z Scheduler.fusable_read_and_write  s`   dI&--11$))TYYGI UZZ'&tzz488<&u{{DHH=00T]]enn5T ~~') 

ekk) ?		Nc%**o5?II/EJJ0EJJ>
 g&--11$))TYYGI..225::uzzJJ		UZZ'JJ*+r_   c                    d}|| j                   vr2	 |j                         s|j                         }|| j                   |<   |S | j                   |   }|S # t        $ r Y -w xY wr
  )r  has_unbacked_symbolsnumbytes_hintKeyError)r[   r  ress      r]   r\  zScheduler.dep_size_hint*  sz    d000//1++-C /2D&&s+ 
 ,,S1C
   	s    A 	A A c                2    t        |j                  j                        t        |j                  j                        z   }t        |j                  j                        t        |j                  j                        z   }t	        ||      dz  t        ||      k  r||kD  r|}|}|}|j                  j                  |j                  j                  z  D cg c]4  }||j                  j                  v s||j                  j                  v r|6 }}t         fd|D              S |j                  j                  |j                  j                  z  |j                  j                  |j                  j                  z  z  }t         fd|D              S c c}w )zn
        The first term in our fusion score that estimates number of saved
        memory operations.
        rG  c              3  @   K   | ]  }j                  |        y wrY   r\  r1  s     r]   r  z0Scheduler.score_fusion_memory.<locals>.<genexpr>Q  s     ?3t))#.?r  c              3  @   K   | ]  }j                  |        y wrY   r  r1  s     r]   r  z0Scheduler.score_fusion_memory.<locals>.<genexpr>V  s     Is4%%c*Ir  )rp   r   r   r   r  r  r  )	r[   r:  r;  node1_dep_lennode2_dep_lentmpr  r-  common_memory_depss	   `        r]   r<  zScheduler.score_fusion_memory:  sh    E--334s5;L;L;S;S7TTE--334s5;L;L;S;S7TT }m,q03}m3TT}, !,,22U5F5F5M5MM%++111SE<M<M<T<T5T D  ?$???#//558I8I8P8PP##e&7&7&>&>>
 I6HIIIs   9Fc                   t        |      dk(  r|S i }|D ]  \  }}|j                         |j                         k(  sJ |j                         }t        | j                  |      j	                  ||            }||vr	||fg||<   p||   j                  ||f        t        |j                         t        j                  d            d   }t        |      dkD  sJ |S )Nr   rE  r   )
rp   r   r   r{  get_fusion_pair_priorityr  r  r\  r  r  )r[   r  "possible_fusions_group_by_priorityr:  r;  ri  fusion_pair_priority&possible_fusions_with_highest_prioritys           r]   r  z4Scheduler.get_possible_fusions_with_highest_priorityX  s   
  A%##  	+ - 	LE5##%)9)9);;;;%%'F#&  (AA%O$  $+MMENL23GH 33GHOOEN	 25.446H<O<OPQ<R2

2. 9:Q>>>55r_   c                B    t        j                  j                  | g| S )z-
        Shim for list.sort(key=...)
        )rI   r  score_fusionr  s     r]   r  zScheduler.score_fusion_keyx  s     yy%%d3U33r_   c                    t        t        j                  j                               }t	        | j
                        D ]9  }|j                  || j                         |j                  |j                         ; y)zg
        Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode)
        N)
r   rI   r   r  r  r  r  r  r  r   )r[   r  rQ   s      r]   r  zScheduler.compute_last_usage  s]    
 ))A)A)CDTZZ( 	8D 3T5L5LM&&t7	8r_   c                   t        | j                  t        j                  j                  z
  t        j                  j
                  j                  z
        D ]i  }|| j                  v rT| j                  |   }|j                         s2t        j                  j
                  j                  |j                         f|t        j                  j                  v st        j                  j                  |   }t        |t        j                        r*t        j                  j
                  j                  |       t        |t        j                        r|j                   }t        |t        j"                        r|j%                         sJ t        j                  j
                  j                  |j                          l | j                  j'                          y)z*Free any buffers that are no longer neededN)rJ  r  rI   r   r  r   freedr   r   codegen_freerQ   r  r~   r    r  GeneratorStater   r  is_input_bufferclear)r[   rb   r   ry  storages        r]   free_bufferszScheduler.free_buffers  sK   %%gg%%&gg""(()
 	DD
 t'''&&t,<<>GG((55chh?---gg**40c2#5#56GG((55c:R%6%67!hhG"7BMM:w?V?V?XXGG((55gllC)	D, 	!!'')r_   c                    | j                   j                         D ]  }|j                           | j                          y rY   )r  r   flushr  )r[   r{  s     r]   r  zScheduler.flush  s3    }}++- 	GMMO	r_   c                   t        |t              sJ t        d   dxx   dz  cc<   t        j                  t        d            5  |j                          |j                          d d d        |j                  }t        |t        j                        sJ dt        |             |j                  t        j                  j                         | j                          y # 1 sw Y   |xY w)Nr  extern_callsr   F)increase_kernel_countztype(node)=)r~   r  r   rI   set_kernel_handlerr%   r  r  rQ   r    r  rj   r  r   r   r  )r[   scheduler_noderQ   s      r]   codegen_extern_callzScheduler.codegen_extern_call  s    .*CDDD
 	^,1,!!&u"EF 	&002##%	& ""$0B[T$ZM2BB0QWW))*	& 	&s   !C""C+c                P   t        |j                        r|j                  
J | d       t        j                  j                  |       t        |j                        }|t        d|j                         t               s|j                  dk(  rLt        j                  j                  |      x}j                  dk  rt        |t        j                               t        |j                        r,|j                  dk(  st!        t        j                                ||       S )Nz( should have been normalized in loweringzUnsupported device type: cuda   mps)rD   rj   r  rI   r   add_device_infor$   r^  r   r   r  get_device_propertiesmajorr+   inspectcurrentframer,   )r[   ri  device_schedulingdevice_propss       r]   create_backendzScheduler.create_backend  s    &++&&,,*B 	
h>?	
B 	
'5fkkB$!:6;;-HII|v%%*ZZ%E%Ef%MM\TTWXX(w7K7K7MNN$V[[E-A#G$8$8$:;; &&r_   c                    |J || j                   vr| j                  |      | j                   |<   | j                   |   S rY   )r  r  rh  s     r]   r{  zScheduler.get_backend  sB    !!!&$($7$7$?DMM&!}}V$$r_   c                    d fd}|j                         D ci c]8  }|j                  *|j                  j                         D ]  } ||      |fd  : }}}t        |j	                               }|rMt        |t        j                  d            \  }}t        j                  j                  j                  |       y y c c}}w )Nc                    | j                   vrLj                   j                  t        | j                  j                        D  ci c]  \  }} | |
 c} }       j                       S c c} }w rY   )r  r  r  r   r  )rY  r  r[   s     r]   	get_orderz*Scheduler.enter_context.<locals>.get_order  s\    ,,,$$++i>V,WdaQT,WX''** -Xs   A+
r   rE  )rY  ztorch.fx.Noder   r   )rM  rQ   r  r   r  r  r  r  rI   r   r   enter_context)r[   rQ   r  rY  r  r  r  lasts   `       r]   r  zScheduler.enter_context  s    	+ ^^%
vv!VV'')	
  q\1t#

 
 w||~&'x':':1'=>GAtGG  ..t4 
s   =Cc                    	 | j                   |   j                  }t        fd|D              xr || j                  vxr || j
                  vS # t        $ r Y yw xY w)NFc              3  ^   K   | ]$  }|j                   xs |j                         v  & y wrY   )r  rZ   )r  rt   fused_node_namess     r]   r  zAScheduler.can_buffer_be_removed_through_fusion.<locals>.<genexpr>  s)     VC3C CCVs   *-)r   rV   r  r[  r  r  )r[   rb   r  rV   s     ` r]   $can_buffer_be_removed_through_fusionz.Scheduler.can_buffer_be_removed_through_fusion  sn    	$$T*00E VPUVV 4D1114D333	
  		s   A 	AAc                    t        |t              rt         fd|j                  D              S |j	                         sy|j
                  yt        |j
                  t        j                        ryt        |j
                  t        j                        ryt        |j
                  dd      ryt        |j
                        ryy)zBReturn True if we should partition the inductor graph on this nodec              3  @   K   | ]  }j                  |        y wrY   )should_partition)r  r  r[   s     r]   r  z-Scheduler.should_partition.<locals>.<genexpr>  s     Mt,,U3Mr  TNunbacked_bindingsF)r~   r  r  r  rD   rQ   r    
DeviceCopyConditionalr   rC   r&  s   ` r]   r  zScheduler.should_partition  s    d./MMMM{{}99dii/dii0499148!$)),r_   c                    i }|j                  t        j                  j                         | j                  D ]3  }|j
                  j                         D ]  \  }}|j                  ||<    5 |S )z~
        Return a mapping from name strings to the corresponding graph inputs or
        base scheduler node outputs.
        )r  rI   r   r  r  r   r\  rQ   )r[   rA  rQ   rb   scheduler_buffers        r]   get_name_to_nodeszScheduler.get_name_to_nodes  sr     UWAGG001JJ 	;D*.*>*>*D*D*F ;&&%5%:%:T";	; r_   c           	        t        t        j                  j                        D ci c]  \  }}||
 }}}t        t        j                  j	                               D ci c]  \  }}||
 }}}g t        j                  _        t        |      D ]  \  }}|j                  rg }|j                  D ]"  }|j                  |j                  |             $ g }	|j                  D ]0  }
|	j                  |j                  |
j                                      2 t        j                  j
                  j                  t        |||	|j                                yc c}}w c c}}w )z
        computes a mapping from partition input/output indices to graph input/output
        indices for each partition.
        N)r  rI   r   r  r  partition_mapsskip_cudagraphinput_nodesr  r  output_nodesrZ   r@   constant_names)r[   
signaturesidxrb   name_to_graph_input_indexname_to_graph_output_indexpartition_id	signatureinput_mappingoutput_mappingrQ   s              r]   compute_graph_partition_mapsz&Scheduler.compute_graph_partition_maps  sT    (11E1E'F%
##tD#I%
! %
 (11I1I1K'L&
##tD#I&
" &
 "$'0'< 	#L)''
 M!-- J$$%>%B%B4%HIJ  N!.. W%%&@&D&DT]]_&UVW GG""))! !",,	!	%
&
s   E!E c                  	
 d		fd		 	 	 	 d
	
fd
	 	 	 	 d	fd	 	 	 	 dd} t               j                  
fd|D         } |j                  fd|j                         D           ||      }t               }|D ]F  }t        j                  j
                  j                  |      }|j                  |j                         H t        t        |t        j                  d                  S )ai  
        Returns all symbol inputs which are required to be in scope to successfully
        perform codegen for this graph partition, including:
        - free symbols used in partition nodes
        - free symbols in partition input/node shapes, strides, and offsets. This is needed
          for recording cudagraphs for tensors with dynamic shapes.
        c                   t               }| j                         }t        |t        j                        r|j                  t        |j                        t        |j                        z  t        |j                        z         t        |t        j                        r!|j                   |j                               |S |
J d|        |S )Nz*Expect layout to be None but found layout=)r   maybe_get_layoutr~   r    Layoutr  r   r  strideoffsetr  r  )rQ   free_symbol_usesrl   get_layout_symintss      r]   r  zGScheduler.get_graph_partition_symbol_inputs.<locals>.get_layout_symintsR  s    9C**,F&")), '' -"6==12"6==12
 fb&C&CD$++,>v}},MN
 $# ~ @I~ $#r_   c                ,   t        | t              r* t               j                  fd| j                  D         S | j
                  J | j
                  j                         } |j                  fd| j
                  j                         D          |S )z4
            Gets symbols used in node.
            c              3  .   K   | ]  } |        y wrY   r   )r  r  get_scheduler_node_symbol_usess     r]   r  zfScheduler.get_graph_partition_symbol_inputs.<locals>.get_scheduler_node_symbol_uses.<locals>.<genexpr>l  s     U4U;UrO  c              3  .   K   | ]  } |        y wrY   r   )r  ir_noder  s     r]   r  zfScheduler.get_graph_partition_symbol_inputs.<locals>.get_scheduler_node_symbol_uses.<locals>.<genexpr>q  s     U'$W-UrO  )	r~   r  r   r  r  rQ   r6  r  r   )rQ   r  r  r  s     r]   r  zSScheduler.get_graph_partition_symbol_inputs.<locals>.get_scheduler_node_symbol_usesd  s     $ 23)z|))UU  99(((#yy==?###UTYY=R=R=TU $#r_   c                    t        | t        j                        r
t               S t        | t        j                        r |       S t        dt        |              )zW
            Gets symbols used in input node shapes, strides, and offsets.
            zUnsupported input node type: )r~   r    r  r   r  r,  rj   )rQ   r  s    r]   get_input_node_symbolszKScheduler.get_graph_partition_symbol_inputs.<locals>.get_input_node_symbolsu  sO     $ 2 23!|#D")),)$// *,I$t**VWWr_   c                &    t        d | D              S )z
            Filters a set of symbols that are required for codegen. Skip symbols
            that are always internal to kernels, such as SymT.TMP, SymT.INDEX,
            and SymT.R0_INDEX.
            c              3     K   | ]N  }t        |t        j                  t        j                  t        j                  t        j
                  f      r| P y wrY   )r   r   SIZEFLOATUNBACKED_INTUNBACKED_FLOAT)r  r  s     r]   r  zVScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbols.<locals>.<genexpr>  sH      !		

))++	 s   AAr   )symbolss    r]   filter_symbolszCScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbols  s         r_   c              3  .   K   | ]  } |        y wrY   r   )r  rQ   r  s     r]   r  z>Scheduler.get_graph_partition_symbol_inputs.<locals>.<genexpr>  s     It,T2IrO  c              3  4   K   | ]  \  }} |        y wrY   r   )r  r  rQ   r  s      r]   r  z>Scheduler.get_graph_partition_symbol_inputs.<locals>.<genexpr>  s     Nwq$$T*Ns   rb   rE  )rQ   z	ir.IRNoder   OrderedSet[sympy.Symbol])rQ   rL   r   r
  )rQ   z0Union[ir.IRNode, sympy.Expr, ir.TorchBindObject]r   r
  )r  r
  r   r
  )r   r  r\  rI   r   r  simplifyr  r   rJ  r  
attrgetter)r[   	partitionr  r  candidate_symbolsr  r  symplified_sr  r  r  s           @@@r]   !get_graph_partition_symbol_inputsz+Scheduler.get_graph_partition_symbol_inputsE  s    	$$	$#	$%	$"	XB	X%	X 	-	%	, 7Ijl6H6HIyI7
 	 N+:K:K:MN	
 ++<=(2" 	2A77++44Q7LJJ|001	2
 &(*=*=f*EFGGr_   c           	     B    g }t        t        j                  j                               } j	                         }d fdt        t        |      t        |            D ]  \  }}t               }|D ]+  }	|j                  |	j                  j                                - |j                  |      }
t        j                  j                  |D 	cg c]  }	|	j                   c}	      }t        |j                  |j                   z  D cg c]   } |j"                        s|j"                  " c}      |z
  }t         fd|D              }t               }|D ]  }	|j                  |	j$                          |D ci c]  }||v r|||    }}|D ci c]  }||v r	|||v rdnd }}|D cg c]  }||v r||vr| }}|
j                  |       t         fd|
D              }
|
D cg c]  } |      s||    }}|D cg c]!  }|t        j                  j&                  v s |# }} j)                  ||      }t+        ||||||      }|j-                  |       |j/                  ||
z
        } |ddd   S c c}	w c c}w c c}w c c}w c c}w c c}w c c}w )	z
        Gets signature for each graph partition, including input nodes, output nodes, and
        whether deallocating an input within graph partition.
        c                "   j                   j                  | d      }|yt        |j                  j                  t
              rKt        |j                  t        j                        r&j                  j                  | d      x}r |      S yy)z
            Checks if buf_name is NoneLayout. Buffers with NoneLayout is not allocated
            so graph partition should not take it as inputs or outputs.
            NFT)	r   r  r~   rQ   rl   r3   r    MutationOutputr  )rb  r   r  is_none_layoutr[   s      r]   r  z?Scheduler.get_graph_partition_signature.<locals>.is_none_layout  sz    
 ""&&x6C{#((//:6chh(9(9:!%!8!8!<!<Xt!LLIL))44r_   c              3  V   K   | ]   }j                   j                  ||       " y wrY   r  r  r  rb   r[   s     r]   r  z:Scheduler.get_graph_partition_signature.<locals>.<genexpr>  ,      / ''++D$7/   &)TFc              3  V   K   | ]   }j                   j                  ||       " y wrY   r  r  s     r]   r  z:Scheduler.get_graph_partition_signature.<locals>.<genexpr>	  r  r  Nr  )rb  r   r   r   )r   rI   r   r  r  r  r  r  r   r  r;  r   r  r  r   r   r   rb   r   r  r  r0   r  r  )r[   
partitionsskip_cudagraphsr  unmet_output_namesrA  r  r  output_namesrQ   returned_output_namesr   r  partition_input_namesr  rb   r  input_deallocationextra_output_namesr  r  symbol_inputspartition_signaturer  s   `                      @r]   get_graph_partition_signaturez'Scheduler.get_graph_partition_signature  s!    
'(@(@(BC--/	( *-Z (?";*
 ]	%I~ -7LL! A##D$8$8$=$=$?@A %1$=$=>P$Q! '11<<.78d!!8K  "-!2!2[5G5G!G-aff5   " %/ /1/ %!
 5?L ! =$++DOO<=
 2<' l4((K  2"<' d&::dE" " 2"<'D8L,L " " "(();<$. /1/ %! 2%d+ T"L  "7$!''BSBS:SN  !BB;M #:"# 12!6!<!<"%::"w]	~ $B$e 9$
""s0   I>

%J
=JJ.J*J!J&Jc                   |j                   j                         D ci c]$  \  }}|t        j                  j                  vr||& }}}|j
                  j                         D ci c]$  \  }}|t        j                  j                  vr||& }}}|j                  D cg c].  }|j                         t        j                  j                  vr|0 }}|j                  D cg c]   }|t        j                  j                  vr|" }	}t        |j                  ||||j                  |	      S c c}}w c c}}w c c}w c c}w )z
        Updates the partition signature by removing buffers specified in
        V.graph.removed_buffers. See [Note: Removed Graph Partition Arguments]
        )r  r\  rI   r   r  r!  r  maybe_get_namer  r0   r#  r  )
r[   r  rb   r  r  rA  r!  rQ   r  r  s
             r]   .clean_removed_buffer_from_partition_signaturesz8Scheduler.clean_removed_buffer_from_partition_signatures-  sK    !* 5 5 ; ; =
f177222 &L
 
 '99??A
c177222 #I
 
 "..
""$AGG,C,CC 
 
 "00
177222 
 

 '##$$
 	
)






s   )D/')D5!3D;$%E c                p   	
 ddl 	t               g g t        |      D ci c]  \  }}||
 c}}d	 fd
d
fd}|D ]5  }t        |j                  j
                        |<   |   dk(  s. 
|       7 g }d}|t        |      k  rsr}r0	j                        \  }}|j                  |        ||       r0r0	j                        \  }}|j                  |        ||       r0|dz  }|t        |      k  rrzr}|t        |      kD  rt        d      |S c c}}w )a  
        Reorder nodes to minimize the number of partitions via a bfs
        topological sort. This is the optimal reordering such that the
        number of partitions cannot be reduced further. This may be
        sub-optimal for other metrics such as peak memory. This does not
        change relative orders of two cudagraphable nodes, nor the
        relative order of two non_cudagraphable nodes.
        r   Nc                    |    | f}j                  |       rj                  |       y j                  |       y rY   )r  heappush)rQ   node_with_indexcudagraphable_nodesheapqnode_to_indexnon_cudagraphable_nodesr[   s     r]   insert_pending_nodeszHScheduler.reorder_for_minimizing_partition.<locals>.insert_pending_nodesd  s>    ,T2D9O$$T*6H2ODr_   c                    | j                   j                  D ]*  }|   dkD  sJ |xx   dz  cc<   |   dk(  s# |       , y )Nr   r   )r   
succ_nodes)rQ   	succ_noder1  node_to_indegrees     r]   update_indegreezCScheduler.reorder_for_minimizing_partition.<locals>.update_indegreek  sT    !]]55 4	'	2Q666 +q0+#I.!3(3	4r_   r   z
                Failed to schedule, while loop ran too long when
                reordering for minimizing the num of partitions
                rQ   rL   r   r   )	r.  r  r  rp   r   
pred_nodesheappopr  r^  )r[   r  r  rQ   r6  schedule	num_itersr  r-  r.  r1  r5  r/  r0  s   `       @@@@@@r]    reorder_for_minimizing_partitionz*Scheduler.reorder_for_minimizing_partitionQ  sU    	9=CEGI4=e4DEysDsE	E 	E	4  	+D%()A)A%BT"%*$T*	+
 -/	#e*$#':)--(?@4%% *
 &--(;<4%% &
 NI #e*$#': s5z!  ] Fs   D2c           	     X   ddl m}m} t        t        j
                  j                               } ||| j                  | j                  t        t        j
                  j                  j                               |      \  }}| j                  |      } ||||      \  }}	||dz  k  r|S |S )zx
        Reorder nodes to minimize the number of partitions if this only slightly
        increase peak memory.
        r   )estimate_peak_memoryprepare_planning_inforf  )r  r>  r?  r   rI   r   r  r   rB  r  r  r<  )
r[   r  r>  r?  r7  default_peak_memoryname_to_freeable_input_bufreordered_nodesreorder_peak_memoryr  s
             r]   r  z0Scheduler.maybe_reorder_for_minimizing_partition  s     	H"177#;#;#=>:O##qww++0023;
77 ??F!57"
Q
 !4s!::""r_   c                   g }g }g }dd}|D ]l  }| j                  |      }|r*t        |j                        dk(  r|j                  |       @|r ||      r|j                  |       \|j                  |       n ||z   |z   S )a  
        Reorder a node if it should be partitioned and has simple dependency:
        1. move a partitioned node to the front if it has no dependency
        2. move a partitioned node to the back if it is only used by OutputNode
        3. otherwise do not reorder
        c                    | j                         D ]0  }|j                  D ]  }t        |j                  t              r  y 2 yr   )r   rV   r~   rQ   r   )rQ   r   r   s      r]   only_output_userzPScheduler.reorder_for_partition_with_simple_dependency.<locals>.only_output_user  sC    '') %99 %C%chh
;$%% r_   r   r.  )r  rp   r   r  )r[   r  frontmiddlebackrF  rQ   r  s           r]   r  z6Scheduler.reorder_for_partition_with_simple_dependency  s     *,*,(*	  	$D#44T:C(?(?$@A$ET"!&6t&<D!d#	$ v~$$r_   c                j   g }d}g }g }| j                   D ]Q  }| j                  |      }|r)||k7  r$|j                  |       |j                  |       g }|}|j                  |       S |r"|j                  |       |j                  |       | j                  ||      }| j	                  |       ||fS )z
        Given a list of BaseSchedulerNodes, split into a list of
        graph partitions and compute partition input/output signatures.
        T)r  r  )r  r  r  r%  r  )r[   r  r  cur_partitionr  rQ   r  r  s           r]   r  zScheduler.graph_partition  s     +-
')JJ 	'D#44T:3C!C!!-0&&~6 "-N  &	' m,"">277!? 8 

 	))*5:%%r_   c                    t        d      5  t        j                  j                  j                  r| j                         n| j                  | j                        	 cd d d        S # 1 sw Y   y xY w)NzScheduler.codegen)r   r   r   r   r  _codegen_partitions_codegenr  rc   s    r]   r  zScheduler.codegen  sX    -. 	 ??))99 ((*]]4::.	 	 	s   AA&&A/c                <   ddl m} t        j                  j                  }t        | j                        }t        j                  j                         5  t        j                  j                  dd| ||       | j                  |       t        t        j                  j                  |      sJ | j                  |      }|t        j                  j                  _        t        j                  j                  j                          t        j                  j                  j                  t        j                  j                        \  }}ddd       t        j                  j                  j!                  j"                         t        j                  j                  j%                  ||       t        j                  j                  j&                  j)                  |j*                  D cg c]  }|j-                          c}       y# 1 sw Y   xY wc c}w )z,Codegen a partition given its inputs/outputsr   )SubgraphPythonWrapperCodegenT
partition_)is_subgraphsubgraph_nameparent_wrapper_codepartition_signaturesN)r  rP  rI   r   r   r  r  set_current_wrapper_codeinit_wrapper_coderN  r~   r(  rU  write_prefixgenerateis_inferencedefine_subgraph_launcher_fnvaluecodegen_partition_call	allocatedr  r  rZ   )	r[   r  r  rP  rT  graph_partition_idpartition_coder  rQ   s	            r]   _codegen_partition_wrapperz$Scheduler._codegen_partition_wrapper  s    	Bgg22!$"?"?@WW--/ 	TGG%%  *+=*>?$7%.	 &  MM)$ agg224PQQQKKIVI8AAGG  5GG  --/ ! 4 4 = =agg>R>R SNA-	T0 	
889M9MN	334F	R	&&--)2)?)?@T]]_@	
7	T 	T8 As   C:H.HHc                \   | j                         \  }}t        ||      D ]V  \  }}t        |      dk\  sJ dt        |              |j                  r| j	                  |       E| j                  ||       X t        | j                        }t        j                  j                  j                  |       |dkD  rqt        j                  j                  J |t        t        j                  j                        k(  s.J d| dt        t        j                  j                                yy)z
        Split nodes into partitions and codegen each partition into separate functions.
        This allows further applying different optimizations (e.g., cudagraph) to
        each function.
        r   z5Each partition must have at least one node but found r   NzExpect z partition maps but got )r  r  rp   r  rN  ra  r  r  rI   r   r   set_all_partition_namesr  )r[   r  r  r  r  num_partitionss         r]   rM  zScheduler._codegen_partitions#  s    "&!5!5!7
J$'
J$? 	F Iyy>Q& GIGWX& ''i(//	9E	F d;;<	44^D A77))555!S)?)?%@@ .))A#aggF\F\B]A^_@ r_   c                   t         j                  rdd l}t        j                         }t               }t        |      D ]  }|j                  dk(  r/|j                  |j                  j                  j                  k(  r nQ|j                  |j                  f}||vs"J d|j                   d|j                   d       |j                  |        d | _        |D ]  }t        j!                  t"        j$                        r4	 t        j'                  d|j)                         |j+                                | j/                  |       |j1                         x}r|| j                  k7  s |j3                         s|j5                         r| j7                          || j                  k7  r| j                  rGt9        | j                  j:                        r(t<        j>                  j@                  jC                          || _        t9        |j:                        rF|jD                  J d       t<        j>                  j@                  jG                  |jD                         | jH                  jK                  |jL                         |j5                         rP|jO                  tQ        |jS                                     \  }	}
}| jU                  |      jW                  |
||	       n|j3                         r,tY        jZ                  t\        |      }| j_                  |       n|ja                         rqtY        jZ                  tb        |      }| jU                  |      }d	d
l2m3} d	dl4m5} tm        |||f      r|}nto        dt;        |             |jq                  |       nYtm        |tr        tt        f      r!| jU                  |      jw                  |       n"tm        |tx              sJ |j{                          t         j|                  j~                  r| jU                  |      j                          | j                  jK                  |j                                | j                  jK                  |j                                tm        |tx              r|j1                         }||j:                  dk7  s| jU                  |      j                         s | j7                           | j                  rGt9        | j                  j:                        r(t<        j>                  j@                  jC                          | j7                          y # t,        $ r( t        j'                  d|j)                                Y Lw xY w)Nr   _compile_innerzDuplicate stack frame :zs; did you add a decorator to one of the functions in this stack trace?  If so, try using a context manager instead.z5Generating code for node %s with estimated runtime %fz6Generating code for node %s with estimated runtime 0.0zdevice should have an indexr   )CUDACombinedSchedulingr  ztype(self)=r  )Fr   "check_stack_no_cycles_TESTING_ONLYtorch._dynamo.convert_frame	tracebackextract_stackr   r  rb   filename_dynamoconvert_frame__file__linenor*  r  r   r  r  r  r>  rZ   r  r   r  r   rt  rr  r  r<   rj   rI   r   r   codegen_device_guard_exitr  codegen_device_guard_enterr  r  r   r   r   rM  r{  codegen_templaterL  rM  r  r  rv  r:   codegen.cuda_combined_schedulingrh  r  r  r~   r  codegen_combo_kernelr  rW  codegen_noder  r  r  debug_sync_kernelcodegen_syncr0  rR  r  rN  ready_to_flush)r[   r  r   stackrW  framerF  rQ   ri  r  r  r  backend_rh  r  r{  s                   r]   rN  zScheduler._codegen?  sA   44.++-E7A|D!%  JJ"22%--*E*E*N*NN~~u||4$ ,U^^,<Aell^ LJ J
  # K	!D.
IIO224 t$**v*d111~~''')JJLT000**/@++000 ,,FFH*0D'(5%||7V9VV7,,GGU%%,,T__=!484W4W)*51-   (99!8X !{{#<dC((."{{#=tD++F3T8h9O(PQ&G(KDJ=)9::,,T2D#5}"EF  (55d;!$(>???}}..  (557''..t/D/D/FG%%,,T-E-E-GHd$:;*&v-((0??AJJLWK	!Z #4T5H5H5M5M#N GG  ::<

U ! IIPs   3U-V Vc                    |d   j                         }| t        j                  _        || _        |J | j                  |      }|j                  |      S )rw  r   )r   rI   r   rP   r  r{  benchmark_combo_kernel)r[   r  ri  r{  s       r]   r  z Scheduler.benchmark_combo_kernel  sW     1((* $!!!""6*--i88r_   c                   t         j                  sy|}|d   j                         }||j                  dk(  ryddlm} dg }}t        |      D ]  \  }}|j                         }	| j                  |	      rt        j                  d       	 | j                  |	      \  }
}t        j                  |
      rt        j                  d|        y		 ||
z  }|j                  |        	 | j                  |      \  }}}||z
  dk  xs |dk  }t        j!                  t"        j$                        rP||kD  s|r%t        j                  dt'        ||z  d             n$t        j                  dt)        ||z  d             ||z
  |k  xs |S # |$ r.}d
t        |      v rt        j                  d       Y d}~ y d}~ww xY w# |$ r-}d
t        |      v rt        j                  d       Y d}~y d}~ww xY w)r  Tr   Nrg  r  g        z<ComboKernel: benchmarking may not accurate due to atomic_addz;ComboKernel benchmark: register spilling of %d-th subkernelFr  zCComboKernel benchmark: return True because of loop-carried variableg333333?z/can fuse (benchmark): fusing causes %sx speedupr  z3cannot fuse (benchmark): fusing causes %sx slowdown)r   r  r   rj   r  r  r  rM  r  r=  r>  rx  r  r  r   r  r  r  r  r7   r8   )r[   r  subkernel_nodesri  r  r  
path1_listr  r  r  msr  r  r  	ms2_clone_path2_listsmall_kernels                    r]   r  z!Scheduler.speedup_by_combo_kernel  s   
 ,, #..0 >V[[E1;rZ!/2 	$HAu)I ##I.  R55i@D::b>$$U ! " 2ICd#7	$:
	*.*E*Eo*V'CK Y,9c	""7==1SyL  E#)C2
   Ic	#0
 Y$44M $ *c!f4$$]     	&#a&0  Y 	s<   ?F	&F? 	F<"F76F77F<?G1"G,+G,,G1c                p    | j                   |   }|j                  J |j                  j                         S rY   )r   rQ   
get_layout)r[   rb  r   s      r]   get_buffer_layoutzScheduler.get_buffer_layout  s5    x(xx###xx""$$r_   c                   | j                   D ]  }|j                         s|j                  j                  D ]  }t        j
                  j                  j                  |j                        }|s9t        |      dk(  sHt        |j                  t        t        f      ri|j                         g k(  s}t        j
                  j                  j!                  |j                           y rf  )r  rD   r   r   rI   r   r+  r  rb   r/   r~   rl   r3   r2   r   zero_dim_cpu_tensor_listr*  )r[   rQ   r  r  s       r]   r  z$Scheduler.update_zero_dim_cpu_tensor  s    JJ 	HD{{} ,,22 
HDWW3377		BF+F3u< *"MMJ8I+J! #OO-388<<TYYG
H	Hr_   )r  zlist[ir.Operation]r   r   )r   z!dict[str, SchedulerDonatedBuffer]r   )ri  r   r   r   r   )r  r   r   r   )rQ   r!  r   rL   r  )r  rL   r   r0  )r   r  r  r  r   tuple[float, str]r  r  r~  r   r   r   )r  r   ri  r6  r   r  )r  r  r   r   )r:  rL   r;  rL   r   zUnion[bool, Callable[[], bool]])rQ   rL   r   rL   rY   )r  zOptional[int]r   r   r  )r  r0  r   1list[tuple[BaseSchedulerNode, BaseSchedulerNode]]r:  rL   r;  rL   r   r   )r:  rL   r;  rL   rO  z"Union[tuple[str], OrderedSet[str]]r   r   r:  rL   r;  rL   r   r   r.  )rr  rL   r  rL   rN  r3  r   r   )r  r*   r:  rL   r;  rL   r   r   )r  r'   r  r(   r   r   )r  r'   r   r   )r  r  r   r  )r  z+tuple[BaseSchedulerNode, BaseSchedulerNode]r   r   )r  r  r   r   )ri  r6  r   BaseScheduling)ri  r   r   r  r7  )rb   r   r  r&  r   r   )r   ;dict[str, Union[ir.IRNode, ir.TorchBindObject, sympy.Expr]])r  list[GraphPartitionSignature]r   r   )r  PartitionTyper  r  r   r
  )r  zlist[PartitionType]r  z
list[bool]r   r  )r  r0   r   r0   )r   z9tuple[list[PartitionType], list[GraphPartitionSignature]])r  r  r  r0   r   r   r  r  r   z(tuple[float, float, list[Optional[str]]])r  r0  r   r   )rb  r   r   z	ir.Layout)Mrk   r   r   r7  r   r   r  r  propertyr  setterr  r  r  r  r  r  r  ra  rj  r  r  r  r  rx  r}  r  r  r  r  r  rq  r  rC  r	  r  rE  rI  rX  rc  r  rv  rI  r  r  r  r\  r<  r  r  r  r  r  r  r  r{  r  r  r  r  r  r  r%  r(  r<  r  r  r  r  ra  rM  rN  r  r  r  r  ri  rj  s   @r]   rO   rO     s\   
 *)z
x	# & & ( (7#,"HVSp(#T,	 6S(4#&$6:	808	8$T0TDHT	T
> 
>*6
>	
>g@R
z(&z(/@z(	(z(x>h,h	 hT..`?4 ,4 	:4 l,&,/@,	,\7&7/@7	7r$&$/@$	$6< < !< =	<
 
<|I6&I6/@I6	I6V
9(9 )9 	9
 
9vQMf3&3/@3	3j

(9
BS
	
J D J&J/@J	J<6 Q6	:6@4@4	4	8*4
'*%5$

+:
	
2	D '1' 
'ReH eH QeH 
"	eHN - @J 	& B"
0"
	 "
H?&? 
!?B& 
!>%,%	 %@&	B&@(
 (
 +(
 
	(
T8hT949	19I5V%
Hr_   c                      e Zd Zd fdZddZddZ	 	 	 	 	 	 ddZ	 	 	 	 	 	 ddZ	 	 	 	 	 	 ddZ	 	 	 	 	 	 ddZ		 	 	 	 ddZ
	 	 	 	 	 	 	 	 dd	Z	 	 	 	 	 	 dd
ZddZddZddZddZ	 	 	 	 ddZddZ	 	 	 	 	 	 ddZ	 	 	 	 d dZ xZS )!r  c                0    t         |           || _        y rY   )r^  r   rP   )r[   rP   ra  s     r]   r   zBaseScheduling.__init__  s    "r_   c                R    | j                   r| j                   j                          y y rY   )rP   r  rc   s    r]   free_buffers_in_schedulerz(BaseScheduling.free_buffers_in_scheduler  s    >>NN'') r_   c                    t               S )z0Return a set of .codegen.common.BackendFeature()r   rh  s     r]   get_backend_featuresz#BaseScheduling.get_backend_features   s
    |r_   c                    t         )zO
        Check whether node1 and node2 can be vertically fused or not.
        r+  r9  s      r]   r  z BaseScheduling.can_fuse_vertical$  
     "!r_   c                    t         )zQ
        Check whether node1 and node2 can be horizontally fused or not.
        r+  r9  s      r]   r  z"BaseScheduling.can_fuse_horizontal,  r  r_   c                     y)au  
        A Multi-Output Template (referenced in #144012) is a template node
        with MultiOutputLayout, and its output buffers are instances of MultiOutput.
        In this context, we verify whether node1 represents the Multi-Output Template
        and node2 corresponds to one of its outputs. If so, we further check if
        backend supports this fusion.
        Fr   r9  s      r]   r|  z.BaseScheduling.can_fuse_multi_outputs_template4  s     r_   c                    |j                         s|j                         rt        j                  ||      S t        j                  ||      S )z 
        Fuse two nodes
        )rv  r:  r  r  r9  s      r]   r  zBaseScheduling.fuse@  sA     !1!1!3-225%@@%**5%88r_   c                    t         )z[
        Process the iteration sizes in case a transformation needs to be applied.
        r+  )r[   r  s     r]   r|  zBaseScheduling.group_fnK  r  r_   c                    t         )z
        Given a template node, generate a kernel.

        This function is only available for triton now. If the third-party backend behaves as a sub-class
        of TritonScheduling, it can override it or reuse it.
        r+  )r[   r  epilogue_nodesr{  s       r]   rt  zBaseScheduling.codegen_templateS  s
     "!r_   c                    t         zD
        Generate a kernel given a list of pre-fused nodes.
        r+  )r[   r  r~  s      r]   r}  z.BaseScheduling.generate_kernel_code_from_nodesa  r  r_   c                    t         r  r+  r&  s     r]   rw  zBaseScheduling.codegen_nodei  
     "!r_   c                    t         )zt
        Generate synchronization code for the kernel. This method depends on the hardware characteristics.
        r+  rc   s    r]   ry  zBaseScheduling.codegen_synco  r  r_   c                     y)z
        Check whether the backend is requesting the scheduler to flush the generated kernel.
        If not supported, please return False.
        Fr   rc   s    r]   rz  zBaseScheduling.ready_to_flushu  s    
 r_   c                    t         )z]
        Flush the generated kernel and python wrapper code to the source code file.
        r+  rc   s    r]   r  zBaseScheduling.flush|  r  r_   c                    t         )rw  r+  r  s     r]   rx  z$BaseScheduling.benchmark_fused_nodes  
     "!r_   c                    t         )z
        Benchmark a compiled module and return the execution time
        in milliseconds on randomly generated inputs.
        r+  )r[   r  s     r]   r  z)BaseScheduling.benchmark_codegened_module  s
    
 "!r_   c                     y)z
        Return an unsigned integer which represents the priority of this fusion pair.
        The smaller is with higher priority.
        r   r   r9  s      r]   r  z'BaseScheduling.get_fusion_pair_priority  s     r_   c                    t         )z
        Benchmark the list of nodes to combine and return the execution time
        and memory copy time in milliseconds on randomly generated inputs.
        r+  r  s     r]   r  z%BaseScheduling.benchmark_combo_kernel  r  r_   )rP   zOptional[Scheduler]r   )ri  r6  r   zOrderedSet[BackendFeature]r  r3  )r  r  r   z"tuple[tuple[sympy.Expr, ...], ...])r  rL   r  r  r{  r  r   zOptional[str]r  )rQ   z(Union[FusedSchedulerNode, SchedulerNode]r   r   r   r  )r  r   r   r  r  r  )rk   r   r   r   r  r  r  r  r|  r  r|  rt  r}  rw  ry  rz  r  rx  r  r  r  ri  rj  s   @r]   r  r    sC   #*"&"/@"	""&"/@"	"
&
/@
	
	9&	9/@	9		9"3"	+""(" 4" 4	"
 
""0"DH"	"""""0"	""&/@	"4"	1"r_   r  )rM  r   r   r   )rQ   rL   rB  r)  r   zdict[str, SchedulerBuffer]r   r   )r  /Union[FusedSchedulerNode, GroupedSchedulerNode]r   r   )r  r  rP   rO   r  r0  r   r   )r   )r  zlist[list[int]]r  r  r  ztuple[int, ...]r   z	list[int])
__future__r   r  r   r  r  r"  r  r  r  r  rK  rL  rk  rL  r   r   r   r   r   r	   r
   r   r   collections.abcr   typesr   r  r   torch._inductor.async_compiletorch._dynamo.utilsr   r   torch._inductor.codecacher   r   torch._inductor.metricsr   r   %torch.fx.experimental.symbolic_shapesr   torch.utils._ordered_setr   torch.utils._sympy.symbolr   r   r   torch.utils._tritonr   r   r   r   r   r    r!   analyze_preserves_zero_maskr"   codegen.commonr#   r$   r%   comm_analysisr&   r'   r(   r)   r*   excr+   r,   fx_utilsr-   r.   r/   r0   r1   r2   r3   	loop_bodyr4   r  r5   r6   runtime.runtime_utilsr7   r8   r  r9   utilsr:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   virtualizedrI   	getLoggerrk   r   _logginggetArtifactLoggerr=  r  r   r  	dataclassrN   r   rL   r3  rn   r   rA  r  r  rW  r  r  r  r:  r  r  r  r  r  rO   r  r   r_   r]   <module>r     s   "         	     , R R R (    $ 6 ? M > / O O * 6 6 D M M ; : : 2 2    J 7 &    "  g!^^--hA
NN44XO () h8 h8 h8V 4_ 4 4u
1 u
1p
 
,  &K
&K4&K ,&K 
	&KRW 1 W"5. 5~+% ~+B@	$@ $ 
	,b** b*J~:!3 ~:B
Q, Qn %'+#++ "+ 	+\ 
 
 
> +9??, u,H u,HpYK" K"r_   