
    rh                   >   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlmZmZ d dlmZmZmZmZmZ d dlZd dlmZ d dlZd dlZd dlmc mZ d dlmZ d dl m!Z!m"Z" d dl#m$Z$ d d	l%m&Z& d d
l'm(Z( d dl)m*Z*m+Z+m,Z,m-Z-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6m7Z7 ddl8m9Z9m:Z:m;Z; ddl<m=Z= ddl;m>Z>m?Z? ddl@mAZA ddlBmCZC ddlmDZDmEZEmFZFmGZGmHZHmIZImJZJmKZKmLZLmMZMmNZN ddlOmPZP ddlQmRZRmSZSmTZTmUZUmVZVmWZW ddlXmYZY ddlZm[Z[m\Z\m]Z] erd dl^m_Z_m`Z` d dlaZaddlbmcZc ddldmeZe  ej                  eg      Zh eU       j                  Zjekej                  ej<                  emenf   Zoee;j                  eVf   Zqedgdf   ZrdXd ZsdYd!Zteuemevf   Zweekeevej.                  f   d"f   eewgekevd"f   f   f   Zx	 	 dZ	 	 	 	 	 	 	 	 	 	 	 d[d#Zyd\d$Zzej                   G d% d&             Z| G d' d(      Z} G d) d      Z~ej                   G d* d+e~             Zej                   G d, d-e~             Zej                   G d. d/e~             Zej                   G d0 d1e~             Z G d2 d3e~      Zej                   G d4 d5e~             Zej                   G d6 d7e~             Zej                   G d8 d9e~             Zej                   G d: d;e~             Zej                   G d< d=e~             Zej                   G d> d?e~             Zej                   G d@ dAe             Zej                   G dB dCe             Zej                   G dD dEe             Zej                   G dF dGe             Z G dH dIe      Zej                   G dJ dKe~             Zej                   G dL dMe             Zej                   G dN dOe             Zej                   G dP dQe~             Zej                   G dR dSe~             ZemZeeeIf   Z G dT dUeS      Z G dV dWe      Zy)]    )annotationsN)chaincount)AnyCallableOptionalTYPE_CHECKINGUnion)Expr)dtype)countersdynamo_timed)DebugPrinterManager)MultiKernelState)	cache_dir)CallMethodKeyConvertIntKeyDivideByKeyresolve_unbacked_bindingsSymTypes)_get_qualified_name)
OrderedSet)SingletonInt)symbol_is_typeSymT   )async_compileconfigir)output_code_log)IRNodeReinterpretView)triton_heuristics)DeviceProperties)cache_on_selfDelayReplaceLineget_benchmark_nameIndentedBuffer#is_codegen_graph_partition_subgraphLineContext'set_kernel_post_grad_provenance_tracingsympy_product	sympy_str
sympy_substriton_version_uses_attrs_dict)V   )ArgNameCodeGenDeferredLinePythonPrinterWorkspaceArgWorkspaceZeroMode)cexpr)	config_ofshould_unwrap_unspec_argsignature_to_meta)IteratorSequence)GraphLowering)FxConverterWrapperLinec                8   t         j                  j                  |       }| j                         t         j                  j                  v}| j                         | j                         t        t         j                  j                  j                  |            |fS N)
r0   graphget_allocation_storage_sizeget_nameunaligned_buffersget_device_or_error	get_dtyper-   sizevarssimplify)nodestorage_size	alignments      r/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/torch/_inductor/codegen/wrapper.pybuffer_reuse_keyrO   [   sr    7766t<Lqww'@'@@I  " 	!''""++L9:     c                   | j                         |j                         k7  ry| j                         |j                         k7  ryt        j                  j                  j                  t        j                  j                  |             }t        j                  j                  j                  t        j                  j                  |            }t        |      t        |      k(  sWt        j                  j                  j                  |d|z        r+t        j                  j                  j                  ||      ryy)NFgffffff?T)
rG   rH   r0   rC   rI   rJ   rD   r-   statically_known_geqstatically_known_leq)	input_buf
output_buf
input_sizeoutput_sizes       rN   can_match_buffer_sizerX   i   s     $$&**H*H*JJ
 4 4 66!!**	++I6J ''""++	++J7K 	*;!77 	
--k4*;LMGG11+zJrP   .c                    t               dd	 d	 	 	 dfd}dd fd}d  } |d| d       r4t        j                  j                  rj                  j                         nt        j                         }j                         5  |5  t        j                  j                  rV|rTt        j                  j                  r:|t        j                  j                  v rt        j                  j                  |   }	nd gt        |      z  }	t        |      dk(  r" ||d   |	d         \  }
} |d	|
 d	|        nt        |      dkD  sJ t        |      t        |      k(  sJ t               }t        t        |||	      d
 d      D ]  \  }
}}|j                  rD|j                  j!                         D  cg c]  \  } }d|  d|  }} }dj#                  |      }nd} ||
|      \  }
}d| d|
 }||v rr|j%                  |        ||d| d|         d d d        d d d        |j'                         fS c c}} w # 1 sw Y   )xY w# 1 sw Y   -xY w)Nc                d    t        | t        j                        r| S t        j                  |       S rB   )
isinstancesympyr   Integer)items    rN   _convert_to_sympy_exprz@user_defined_kernel_grid_fn_code.<locals>._convert_to_sympy_expr   s#    !$

3tLt9LLrP   c                    t        |       r| | fS t        fd| D              }|s|}j                  |      t        j                  j
                  r$j                  t        fd|D                    fS dfS )a'  
        This function return a tuple of two values: the first one is for the real grid
        which is used in the generated code; the second one is an example grid with
        concreate values which is used in the autotune block to run the generated
        kernels at compile time.
        Nc              3  .   K   | ]  } |        y wrB    ).0gr_   s     rN   	<genexpr>zKuser_defined_kernel_grid_fn_code.<locals>.determine_grid.<locals>.<genexpr>   s     C1!4Cs   c              3  T   K   | ]  }j                  |t        |             ! y wrB   generate_example_arg_valuetype)rc   rd   wrappers     rN   re   zKuser_defined_kernel_grid_fn_code.<locals>.determine_grid.<locals>.<genexpr>   s*        ::1d1gF   %()callabletuplecodegen_python_shape_tupler   tritonautotune_at_compile_time)gridexample_grid
sympy_gridr_   rj   s      rN   determine_gridz8user_defined_kernel_grid_fn_code.<locals>.determine_grid   s     ?htn:CdCC
%L..z: ==99 22 !- 
 	
 
 	
rP   c                    j                  |        rJt        j                  j                  r/j                  vr j
                  j                  |xs |        y y y y rB   )	writeliner   ro   rp   kernel_autotune_nameskernel_autotune_calls)linerr   nameoutputrj   s     rN   rv   z3user_defined_kernel_grid_fn_code.<locals>.writeline   sW    66G999))33L4HDI : 7 rP   grid_wrapper_for_def z(meta):r1   r   zreturn c                2    t        | d   j                        S Nr1   lenkwargsxs    rN   <lambda>z2user_defined_kernel_grid_fn_code.<locals>.<lambda>   s    c!A$++. rP   Tkeyreversezmeta['z'] == z and Trueif z	: return )r^   Union[int, sympy.Expr]return
sympy.ExprrB   )rq   
TritonGridrr   zOptional[TritonGrid])ry   strrr   Optional[str])r(   r   ro   rp   rx   indent
contextlibnullcontextr0   rC   autotuning_gridsr   r   sortedzipr   itemsjoinaddgetvalue)rz   configsgridsrj   original_fxnode_namert   rv   fn_namekernel_autotune_calls_indentexample_gridsrq   rr   seencvalguards	statementr_   r{   s   `  `             @@rN    user_defined_kernel_grid_fn_coder      s|    FM
 .2

*
>J J "$(GWIW%& v}}== 	%%,,.##% !
 
 'L6 'LMM22$(($(@(@@GG445IJM!FSZ/Mu:?!/a-:J!KD,v&',(@Au:>!>u:W---$.LD *0E7M2.* L%a
 88DEHHNNDT7@tS&fSE2F  %\\&1F#F%3D,%G"l!&4&9	$#)s6()L>%JK#L-'L 'LR FOO%%%9'L 'L 'L 'Ls8   ID-II	AI'I	II	II$c                    t               j                  | j                  d       ddlm ddlm t        | j                  g      fd |        j                         S )zg
    Given a triton kernel function pointer collect the transitive closure of
    its dependencies
    Tstripr   )JITFunction)	constexprc           	        t        d t        j                  | j                        D              }| j                  j                  j                  di       }| j                  j                  j                  D ]  }|v r	|| j                  j                  v s"| j                  j                  |   }t        |      rX	j                          	j                  d       	j                  |j                  d       j                  |        |       t        |t        t        t         
f      r	j                          t        |
      rd|j"                  d}n|}|j                  |      x}rKt        |t$              rd|j&                   d	|j(                   }nd|}	j                  | | d
|        n	j                  | d
|        j                  |       r||v sx|dk7  st+        |d      s|j&                  j-                  d      s	j                  d|j&                   d|j(                   d|        j                  |        y )Nc              3  R   K   | ]  }|j                   d k(  r|j                   ! yw)LOAD_GLOBALN)opnameargval)rc   insts     rN   re   z^user_defined_triton_kernel_transitive_closure_source_code.<locals>.traverse.<locals>.<genexpr>
  s(      '
{{m+ KK'
s   %'__annotations__z@triton.jitTr   ztl.constexpr(): . = tl
__module__ro   zfrom z import z as )r   disBytecodefn__globals__get__code__co_namesr[   newlinerv   splicesrcr   intr   boolvalueri   r   __name__hasattr
startswith)
cur_kernelunqualified_loadsglobal_annotationssymbol_namesymbol
symbol_str
annotationannotation_coder   compile_wrapperr   symbols_includedtraverses           rN   r   zKuser_defined_triton_kernel_transitive_closure_source_code.<locals>.traverse  s?   
 ' '
Z]]3'
 

 (]]66::;LbQ%==11:: -	6K..jmm777#22;?fk2#++-#--m<#**6::T*B$((5V$c4(CD#++-!&)4'4V\\4DA%F
(.z
%7%;%;K%HHzH%j$7"$Z%:%:$;1Z=P=P<Q R , 13:..AO'11*mO+<C
|L (11[MZL2QR$((5#44#t+5 ))44X>
 $-- 1 12(6??:K4P[}] %((5[-	6rP   )
r(   r   r   ro   r   triton.languager   r   r   r   )kernelr   r   r   r   r   s    @@@@@rN   9user_defined_triton_kernel_transitive_closure_source_coder      sd    
 %&O6::T2 #) "6??"3486 86t V##%%rP   c                  (    e Zd ZU ded<   ded<   d Zy)SymbolicCallArgr   innerr   
inner_exprc                ,    t        | j                        S rB   )r   r   selfs    rN   __str__zSymbolicCallArg.__str__I  s    4::rP   N)r   r   __qualname__r   r   rb   rP   rN   r   r   C  s    JrP   r   c                  6     e Zd Z fdZddZddZddZ xZS )MemoryPlanningStatec                l    t         |           t        j                  t              | _        d| _        y Nr   )super__init__collectionsdefaultdictlist
reuse_pooltotal_allocated_buffer_size)r   	__class__s    rN   r   zMemoryPlanningState.__init__N  s-    ##D) 	 12(rP   c                L    t        | j                  j                  |d             S rB   )r   r   r   )r   r   s     rN   __contains__z MemoryPlanningState.__contains__U  s    DOO''T233rP   c                \    | j                   |   j                         }|j                  rJ |S rB   )r   pop	is_reusedr   r   r^   s      rN   r   zMemoryPlanningState.popX  s+    s#'')>>!!rP   c                \    |j                   rJ | j                  |   j                  |       y rB   )r   r   appendr   s      rN   pushzMemoryPlanningState.push]  s&    >>!!##D)rP   )r   ReuseKeyr   r   )r   r   r   FreeIfNotReusedLine)r   r   r^   r   r   None)r   r   r   r   r   r   r   __classcell__r   s   @rN   r   r   M  s    24
*rP   r   c                      e Zd ZddZy)r@   c                    t        d      )Nz2FX codegen not yet supported for type {type(self)})NotImplementedErrorr   	converters     rN   
codegen_fxzWrapperLine.codegen_fxc  s    !"VWWrP   Nr   r?   r   FxConversionFuncr   r   r   r   rb   rP   rN   r@   r@   b  s    XrP   c                  :    e Zd ZU ded<   ded<   d	dZd
dZddZy)EnterSubgraphLinePythonWrapperCodegenrj   r>   rC   c                b    | j                   j                  | j                   j                         y rB   )rj   push_computed_sizescomputed_sizesr   s    rN   __post_init__zEnterSubgraphLine.__post_init__l  s    (()D)DErP   c                n    | j                   j                  | j                         |j                          y rB   )rj   push_codegened_graphrC   	do_indentr   codes     rN   codegenzEnterSubgraphLine.codegeno  s"    ))$**5rP   c                    |j                   S rB   )_generate_enter_subgraphr   s     rN   r   zEnterSubgraphLine.codegen_fxs  s    111rP   Nr   r   r  r(   r   r   r   r   r   r   r   r	  r  r   rb   rP   rN   r  r  g  s    !!F2rP   r  c                  2    e Zd ZU ded<   ddZedd       Zy)CommentLiner*   ry   c                :    |j                  | j                         y rB   )rv   ry   r  s     rN   r  zCommentLine.codegen{  s    tyy!rP   c                    | j                   S rB   )_generate_comment)r   s    rN   r   zCommentLine.codegen_fx~  s    ***rP   Nr  r   )r   r   r   r   r  staticmethodr   rb   rP   rN   r  r  w  s!    
" + +rP   r  c                  0    e Zd ZU ded<   ddZddZd	dZy)
ExitSubgraphLiner  rj   c                V    | j                   j                         | j                   _        y rB   )rj   pop_computed_sizesr  r   s    rN   r	  zExitSubgraphLine.__post_init__  s    &*ll&E&E&G#rP   c                X    | j                   j                          |j                          y rB   )rj   pop_codegened_graphdo_unindentr  s     rN   r  zExitSubgraphLine.codegen  s    ((*rP   c                    |j                   S rB   )_generate_exit_subgraphr   s     rN   r   zExitSubgraphLine.codegen_fx  s    000rP   Nr  r  r   r  rb   rP   rN   r  r    s    !!H1rP   r  c                  2    e Zd ZU ded<   ded<   ddZd	dZy)
EnterDeviceContextManagerLiner   
device_idxzOptional[int]last_seen_device_guard_indexc                x   t         j                  j                  r|j                  d       t         j                  j                  rg| j
                  ;|j                  t         j                  j                  j                          d       y | j
                  | j                  k(  s{J d       | j
                  H|j                  t         j                  j                  j                          d| j                   d       y |j                  d| j                   d       y y |j                  dt         j                  j                  j                  | j                         d       |j                          |j                  t         j                  j                  j                  | j                               y )	N
z) stream_guard(stream, this->device_idx_);z4AOTInductor only supports running on one CUDA devicez device_guard(z);zdevice_guard.set_index(with :)r0   rC   cpp_wrapperrv   aot_moder'  
device_opscpp_aoti_stream_guardr&  cpp_aoti_device_guarddevice_guardr  
set_devicer  s     rN   r  z%EnterDeviceContextManagerLine.codegen  sP   77NN4 ww 44<NN77--CCEFFop  <<O NO 44<NN77--CCEFnUYUdUdTeegh NN%<T__<MR#PQ P NNU177#5#5#B#B4??#S"TTUVWNNNN177--88IJrP   c                    |j                   S rB   )&_generate_enter_device_context_managerr   s     rN   r   z(EnterDeviceContextManagerLine.codegen_fx  s    ???rP   Nr  r   r   r   r   r   r  r   rb   rP   rN   r%  r%    s    O"//K:@rP   r%  c                      e Zd ZddZddZy)ExitDeviceContextManagerLinec                Z    t         j                  j                  s|j                          y y rB   )r0   rC   r,  r!  r  s     rN   r  z$ExitDeviceContextManagerLine.codegen  s     ww"" #rP   c                    |j                   S rB   )%_generate_exit_device_context_managerr   s     rN   r   z'ExitDeviceContextManagerLine.codegen_fx  s    >>>rP   Nr  r   r   r   r   r  r   rb   rP   rN   r7  r7    s    ?rP   r7  c                  2    e Zd ZU ded<   ded<   ddZd	dZy)
ExternKernelAllocLiner  rj   ir.ExternKernelAllocrK   c                    | j                   }g |j                         |j                         }| j                  j	                  | j                   |       y rB   )rK   codegen_argscodegen_kwargsrj   $_generate_extern_kernel_alloc_helper)r   r  rK   argss       rN   r  zExternKernelAllocLine.codegen  sD    yy=""$=t':':'<=99$))TJrP   c                    |j                   S rB   )_generate_extern_kernel_allocr   s     rN   r   z ExternKernelAllocLine.codegen_fx  s    666rP   Nr  r   r5  rb   rP   rN   r=  r=    s    !!
K
7rP   r=  c                  2    e Zd ZU ded<   ded<   ddZd	dZy)
ExternKernelOutLiner  rj   ir.ExternKernelOutrK   c                b   | j                   }g |j                         |j                  d      }|j                         }t        j
                  j                  r|j                  dk(  rd}n|j                         }|j                         x}r|j                  nt        j
                  j                  }t        j                  j                  rt        ||d       | j                  j!                  ||j#                         |j$                  r|j$                  j#                         nd ||       y )NT)skip_outztorch::inductor::_mm_plus_mmaoti_torch__mm_plus_mm_out)	is_extern)rK   r@  rA  get_kernel_namer0   rC   r,  cpp_kernel_name
get_deviceri   device_typer   traceenabledr+   rj   "_generate_extern_kernel_out_helpercodegen_referenceoutput_view)r   r  rK   rC  kernel_nameddevices          rN   r  zExternKernelOutLine.codegen  s    yyJ""$Jt':':D':'IJ**,GG$$(FF 7K..0K!%!22A29L9L<<3D+QUV77""$484D4DD..0$	
rP   c                    |j                   S rB   )_generate_extern_kernel_outr   s     rN   r   zExternKernelOutLine.codegen_fx      444rP   Nr  r   r5  rb   rP   rN   rG  rG    s    !!

05rP   rG  c                  2    e Zd ZU ded<   ded<   ddZd	dZy)
FreeLiner  rj   %Union[BufferLike, ir.TorchBindObject]rK   c                    | j                   j                         t        j                  j                  vsJ |j                  | j                  j                  | j                                y rB   )rK   rE   r0   rC   removed_buffersrv   rj   make_buffer_freer  s     rN   r  zFreeLine.codegen  sF    yy!!#177+B+BBBBt||44TYY?@rP   c                    |j                   S rB   )_generate_freer   s     rN   r   zFreeLine.codegen_fx      '''rP   Nr  r   r5  rb   rP   rN   r]  r]    s    !!
//A(rP   r]  c                      e Zd ZU ded<   ded<   ded<   ded<   ded<   d	ed
<   ded<   ded<   ded<   ded<   ded<   ddZddZy)KernelCallLiner  rj   r   rV  ztuple[Any, ...]	call_argsraw_keysraw_args	list[str]	arg_typesr   ro   zdict[str, Any]triton_metaztorch.devicerX  
graph_namer   c                   | j                   j                  | j                  | j                  | j                  | j
                  | j                  | j                  | j                  | j                  | j                  | j                  
       y )N)ro   rk  rh  ri  rl  rX  rm  r   )rj   _generate_kernel_call_helperrV  rg  ro   rk  rh  ri  rl  rX  rm  r   r  s     rN   r  zKernelCallLine.codegen  se    11NN;;nn]]]]((;;!%!:!: 	2 	
rP   c                    |j                   S rB   )_generate_kernel_callr   s     rN   r   zKernelCallLine.codegen_fx      ...rP   Nr  r   r5  rb   rP   rN   rf  rf    sL    !!LO
/rP   rf  c                  f    e Zd ZU ded<   ded<   ded<   dZded<   d	Zd
ed<   dZded<   ddZddZy)KernelDefinitionLiner  rj   r   rV  kernel_bodyNr   metadataTr   gpucpp_definitionc                    | j                   j                  | j                  | j                  | j                  | j
                  | j                         y N)rv  rw  rx  )rj   _define_kernel_helperrV  ru  rv  rw  rx  r  s     rN   r  zKernelDefinitionLine.codegen&  sB    **]].. 	+ 	
rP   c                    |j                   S rB   )_generate_kernel_definitionr   s     rN   r   zKernelDefinitionLine.codegen_fx/  r[  rP   r  r   )	r   r   r   r   rv  rw  rx  r  r   rb   rP   rN   rt  rt    s<    !!"Hm"C$(NM(
5rP   rt  c                  0    e Zd ZU ded<   ddZddZd	dZy)
MemoryPlanningLiner  rj   c                    | S )zFirst pass to find reuserb   r   states     rN   planzMemoryPlanningLine.plan7  s    rP   c                     y)zSecond pass to output codeNrb   r  s     rN   r  zMemoryPlanningLine.codegen;  s    rP   c                r   g }t        j                  |       D ]t  }|j                  dk(  rt        | |j                        }|j	                  |j                   d|j
                  t        j                  u r|j                         n|        v t        |       j                   ddj                  |       dS )zF
        Emits a string representation that fits on one line.
        rj   =(, r   )dataclassesfieldsrz   getattrr   ri   r   BufferrE   r   r   )r   rC  fieldr   s       rN   r   zMemoryPlanningLine.__str__>  s      ''- 	EzzY&$

+CKK::,a%**		2IsST		 t*%%&a		$'8::rP   Nr  r   r   r  r  r   r   )r   r   r   r   r  r  r   rb   rP   rN   r  r  3  s    !!);rP   r  c                  0    e Zd ZU ded<   ddZddZd	dZy)
AllocateLine
BufferLikerK   c           	        | j                   j                         t        j                  j                  v rt        | j                        S t        | j                         }t        j                  rG||v rC|j                  |      }d|_        t        | j                  |j                   | j                         S | j                   j                         j                  dk(  rh| j                  j                  | j                         }|A|xj                   t#        t%        j&                  t(        j*                  |d            z  c_        | S )NTcpur1   )rK   rE   r0   rC   r`  NullLinerj   rO   r   allow_buffer_reuser   r   	ReuseLinerG   ri   static_shape_for_buffer_or_noner   r   	functoolsreduceoperatormul)r   r  r   	free_linestatic_shapes        rN   r  zAllocateLine.planQ  s    99177#:#::DLL)) tyy)$$		#I"&IT\\9>>499EE99((*//58<<GG		RL'11S$$X\\<C6 1 rP   c                    | j                   j                         t        j                  j                  vsJ | j
                  j                  | j                         }|j                  |       y rB   )rK   rE   r0   rC   r`  rj   make_buffer_allocationrv   r   r  ry   s      rN   r  zAllocateLine.codegene  sK    yy!!#177+B+BBBB||22499=trP   c                    |j                   S rB   )_generate_allocater   s     rN   r   zAllocateLine.codegen_fxj  s    +++rP   Nr  r  r   r   r   r   r   r  r  r   rb   rP   rN   r  r  M  s    
(
,rP   r  c                  >    e Zd ZU ded<   dZded<   d
dZddZddZy	)r   r  rK   Fr   r   c                   t        | j                  j                               dkD  r| S t        | j                  j                  t
        j                        r| S | j                  rJ | j                  j                         t        j                  j                  v rt        | j                        S t        j                  r%|j!                  t#        | j                        |        | S r   )r   rK   get_inputs_that_alias_outputr[   layoutr   MultiOutputLayoutr   rE   r0   rC   r`  r  rj   r   r  r   rO   r  s     rN   r  zFreeIfNotReusedLine.plans  s    tyy55781<Kdii&&(<(<=K>>!!99177#:#::DLL))$$JJ'		2D9rP   c                    | j                   j                         t        j                  j                  vsJ | j
                  s5|j                  | j                  j                  | j                                y y rB   )	rK   rE   r0   rC   r`  r   rv   rj   ra  r  s     rN   r  zFreeIfNotReusedLine.codegen  sR    yy!!#177+B+BBBB~~NN4<<88CD rP   c                    |j                   S rB   )_generate_free_if_not_reusedr   s     rN   r   zFreeIfNotReusedLine.codegen_fx  s    555rP   Nr  r  r   )r   r   r   r   r   r  r  r   rb   rP   rN   r   r   n  s"    
It
E
6rP   r   c                  D    e Zd ZU ded<   ded<   ded<   d
dZddZddZy	)ReinterpretLiner  rK   	reused_asz	ir.Layoutr  c                    | S rB   rb   r  s     rN   r  zReinterpretLine.plan  s    rP   c                @   t        | j                  t        j                        sJ t        | j                  j                  t        j
                        sJ | j                  j                  | j                  j                         | j                  j                         y rB   )
r[   r  r   NonOwningLayoutviewr"   rj   codegen_deferred_allocationr  rE   r  s     rN   r  zReinterpretLine.codegen  sj    $++r'9'9:::$++**B,>,>???00NN##%t{{'7'7	
rP   c                    |j                   S rB   )_generate_reinterpretr   s     rN   r   zReinterpretLine.codegen_fx  rr  rP   Nr  r  r   r  rb   rP   rN   r  r    s#    

/rP   r  c                  H    e Zd ZU ded<   ded<   dZded<   ddZddZdd	Zy
)r  r  rK   r  Tr   
delete_oldc                p   | j                   j                         t        j                  j                  v rK| j
                  j                         t        j                  j                  v sJ t        | j                        S | j
                  j                         t        j                  j                  vsJ | S rB   )rK   rE   r0   rC   r`  r  r  rj   r  s     rN   r  zReuseLine.plan  s    99177#:#::>>**,0G0GGGGDLL))~~&&(0G0GGGGrP   c                p   | j                   j                         t        j                  j                  vsJ | j
                  j                         t        j                  j                  vsJ |j                  | j                  j                  | j                   | j
                  | j                               y rB   )
rK   rE   r0   rC   r`  r  rv   rj   make_buffer_reuser  r  s     rN   r  zReuseLine.codegen  sz    yy!!#177+B+BBBB~~&&(0G0GGGGLL**499dnndooV	
rP   c                    |j                   S rB   )_generate_reuser   s     rN   r   zReuseLine.codegen_fx  s    (((rP   Nr  r  r   )r   r   r   r   r  r  r  r   rb   rP   rN   r  r    s'    
J
)rP   r  c                      e Zd ZddZy)r  c                    |j                   S rB   )_generate_nullr   s     rN   r   zNullLine.codegen_fx  rd  rP   Nr   r  rb   rP   rN   r  r    s    (rP   r  c                  X    e Zd ZU ded<   ded<   ed	d       Zed
d       Zedd       Zy)CommBufferLiner  rj   	ir.BufferrK   c                    ddl m} | j                  j                         }| j                  j	                         } ||      rt        d| j                         t        |      |j                  z  S )Nr   )is_symbolicz-The size of a comm buffer can't be symbolic: )torch._inductor.utilsr  rK   	get_numelrH   AssertionErrorr   itemsize)r   r  numelr   s       rN   sizezCommBufferLine.size  sa    5		##%		##%u ?		{K  5zENN**rP   c                    | j                   j                         }t        |t        j                        sJ |j
                  S rB   )rK   get_output_specr[   r   CommBufferLayoutcomm_buffer_typer   r  s     rN   r  zCommBufferLine.comm_buffer_type  s6    **,&""5"5666&&&rP   c                    | j                   j                         }t        |t        j                        sJ |j
                  S rB   )rK   r  r[   r   r  
group_namer  s     rN   r  zCommBufferLine.group_name  s6    **,&""5"5666   rP   Nr   r   )r   zir.CommBufferTyper  )r   r   r   r   propertyr  r  r  rb   rP   rN   r  r    sG    !!
O	+ 	+ ' '
 ! !rP   r  c                  ,    e Zd ZddZed        ZddZy)CommBufferAllocateLinec                "   | j                   j                         t        j                  j                  vsJ | j                   j                         }| j                   j                         }| j                   j                         }t        | j                   j                               }t        | j                   j                               }|j                  | j                  | j                  | j                  | j                  |||||             y rB   )rK   rE   r0   rC   r`  rO  rH   rm   get_size
get_striderv   make_allocation_liner  r  rj   )r   r  rz   rX  r   shapestrides          rN   r  zCommBufferAllocateLine.codegen  s    yy!!#177+B+BBBByy!!#%%'		##%dii((*+tyy++-.%%%%		
rP   c                    | t         j                  j                  k(  rS| d|j                  |       d|j                  |       d| d|j                   d| dt        j                  dd       dS t        d	|        )
Nz = empty_strided_p2p(r  z, torch.device("cuda:z"), group_name="z", alloc_id=r   l    r   zUnsupported comm buffer type: )r   CommBufferTypeSYMM_MEMcodegen_shape_tupleindexrandomrandintr   )r  r  rj   rz   rX  r   r  r  s           rN   r  z+CommBufferAllocateLine.make_allocation_line  s     r00999&-..u56b..v67r' &&,ll^ 4)l +"NN1i89< &01A0BC rP   c                    |j                   S rB   )_generate_comm_buffer_allocater   s     rN   r   z!CommBufferAllocateLine.codegen_fx   s    777rP   Nr  r   )r   r   r   r  r  r  r   rb   rP   rN   r  r    s     
(  $8rP   r  c                      e Zd ZddZddZy)CommBufferFreeLinec                    | j                   j                  | j                        }|j                  | d| j                  j
                   d       y )Nz # z buffer free)rj   ra  rK   rv   r  r   r  s      rN   r  zCommBufferFreeLine.codegen  s@    ||,,TYY7$s4#8#8#>#>"?|LMrP   c                    |j                   S rB   )_generate_comm_buffer_freer   s     rN   r   zCommBufferFreeLine.codegen_fx
  s    333rP   Nr  r   r;  rb   rP   rN   r  r    s    N4rP   r  c                  J    e Zd ZU dZded<   ded<   ded<   ded<   dd	Zdd
Zy)MultiOutputLinezU
    Given a MultiOutputLayout buffer, indexes actual buffer(s) from the result.
    r  rj   r   result_namearg_nameSequence[Any]indicesc                      fd  j                    j                        }|j                   j                  j                    j
                   d|  j                  j                          y )Nc                l   t        |      dkD  r|d   \  }}t        |t              r |  d| d|dd        S t        |t              r<j                  j                  | j                  t        |            } ||dd        S t        |t              r |  d| d|dd        S t        d|      | S )Nr   []r1   z['z']znon supported index type: )
r   
issubclassr   rm   rj   codegen_tuple_accessr  r   dictr  )basenamer  itypeituple_accesscodegen_list_tuple_accessr   s        rN   r  z:MultiOutputLine.codegen.<locals>.codegen_list_tuple_access  s    7|a"1:qeT*4z1#Q5GQRQSUUu-#'<<#D#D $"2"2CF$L 5\712;OOt,4zA3b5I7STSU;WW()EuMMrP   r   )r  r  rv   rj   declarer  ending)r   r  r   r  s   `  @rN   r  zMultiOutputLine.codegen  s]    	 $ *$--F||##$T%5%5$6c%ATAT@UV	
rP   c                    |j                   S rB   )_generate_multi_outputr   s     rN   r   zMultiOutputLine.codegen_fx1  s    ///rP   Nr  r   )r   r   r   __doc__r   r  r   rb   rP   rN   r  r    s*     "!M
00rP   r  c                  <    e Zd ZU ded<   ded<   ded<   d
dZddZy	)SymbolicCallArgLiner  rj   r   argr>   rC   c                d    | j                   j                  | j                  | j                         y rB   )rj   "_generate_symbolic_call_arg_helperr  rC   r  s     rN   r  zSymbolicCallArgLine.codegen;  s    77$**MrP   c                    |j                   S rB   )_generate_symbolic_call_argr   s     rN   r   zSymbolicCallArgLine.codegen_fx>  r[  rP   Nr  r   r5  rb   rP   rN   r  r  5  s    !!	N5rP   r  c            	      b    e Zd ZdZdZ fdZe	 d	 	 	 	 	 	 	 dd       ZddZddZ	ddZ
dd	Zdd
Zedd       ZddZedd       ZddZedd       ZddZ	 	 ddZddZddZddZddZddZddZddZddZddZddZd Zd Z d Z!d Z"d  Z#dd!Z$dd"Z%dd#Z&dd$Z'dd%Z(dd&Z)dd'Z*dd(Z+dd)Z,d* Z-	 	 	 	 dd+Z.	 	 	 	 	 	 	 	 	 	 	 	 dd,Z/dd-Z0dd.Z1dd/Z2d0 Z3d1 Z4d2 Z5	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd3Z6d4 Z7dd5Z8e9jt                  dd6       Z;dd7Z<d8 Z=d9 Z>d: Z?d; Z@dd<ZA	 	 	 	 	 	 dd=ZBd> ZCdd?ZDd@ ZEddAddBZFddAddCZGddDZHddEZIddFZJddGZKddHZL	 d	 	 	 ddIZMddJZNddKZOdL ZPdM ZQdN ZR	 	 	 d	 	 	 	 	 	 	 	 	 ddOZSe	 d	 	 	 	 	 ddP       ZT	 	 	 d	 	 	 	 	 	 	 	 	 ddQZUddRZV	 	 ddSZWdddTZX	 	 	 	 	 	 ddUZYddVZZddWZ[dX Z\dY Z]dZ Z^d[ Z_d\ Z`d] Zad^ Zbd_ Zcdd`Zdda Zeddddddddb	 ddcZfdddddddddde	 ddfZgdg Zhdh Zidi ZjddjZkddkZl	 ddlZmdm ZnddnZoddoZpddpZqddqZrddrZsddsZtddtZudu ZvddvZwdw ZxddxZydy Zz	 	 	 	 	 	 	 	 ddzZ{d{ Z|	 	 	 	 dd|Z}dd}Z~d~ Zd Zd Zd Zd Zd Zd Zd Zed        Zed        Zed        Zed        Zed        Z xZS )r  zB
    Generate outer wrapper in Python that calls the kernels.
    Tc                    t                    t                _        i  _        t                _        t                _        t                _        t                _	        t                _
        t                _        t                _        t                _        t                _        t                _        i  _        d _        i  _        t                _        g  _        d _        d _        d _        d _        d _        t6        j8                  j:                  rdnd _        t6        j8                  j:                  rdnd _        d  _         d _!        i  _"        t                _#        t                _$        d  _%         jM                          g  _'        g  _(         jS                          tU               s jW                           jY                          t6        j8                  jZ                  sBt6        j8                  j\                  j_                         D ]  \  }} ja                  ||        t        tb                   _2        t        tb                   _3        i  _4         tk        jl                  d        jn                         _7        tj        jp                  d
 fd       }| _9        i  _:        t                _;        ty                _=        t                _>        i  _?        t        t        j                  j                  t        j                  j                  	       _E        g  _F        y )Nr    #r   z
std::move(r   Tc                    j                   j                  |        t        j                  j                  rj
                  j                  |        y y rB   )importsrv   r   ro   rp   rx   )ry   r   s    rN   add_import_oncez6PythonWrapperCodegen.__init__.<locals>.add_import_once  s;    LL""4(}}55**44T: 6rP   )debug_printer_leveluse_array_ref)ry   r   r   r   )Gr   r   r   _names_iterargs_to_buffersr(   r  headerprefixsuffixkernel_declarationswrapper_callkernel_autotune_defsrx   subgraph_definitionsr   rw   kernel_autotune_example_argskernel_autotune_tmp_arg_idxsrc_to_kernelkernel_numel_exprlinesr  declare_maybe_referencer  commentnone_strr0   rC   r,  
move_beginmove_endr'  supports_intermediate_hooksuser_defined_kernel_cacheunbacked_symbol_declsr  launcher_fn_nameset_launcher_fn_namecodegened_graph_stackcomputed_sizes_stackwrite_headerr)   write_prefix!write_kernel_autotune_defs_headerr-  constant_reprsr   write_constant
BufferName	allocatedfreedreusesr  	lru_cachewrite_get_raw_streamcacher  _metas
_meta_varsr   multi_kernel_statealready_codegened_subgraphsallocated_workspacesr   r   aot_inductor debug_intermediate_value_printerallow_stack_allocationdebug_printeradditional_files)r   rz   hashedr  r   s   `   rN   r   zPythonWrapperCodegen.__init__M  s   */'  	 &'$&$&$&#1#3 *,$2$4!%3%5"$2$4!6@l" IK)01( .0HR!#
')$*+''*=*=,2 ww22;?)+/(QS&L 	" 9C $!!# &("$&!248..0ww ! 6 6 < < > 2f##D&12 $J/1
+-
 57$=I$7$7$=%%%
! 
	; 
	;
  /&(+5<"2"4<FL(46! 1 & 3 3 T T --DD
 !#rP   Nc                D    | r|J |J t        |||      S t               S rB   )SubgraphPythonWrapperCodegenr  )is_subgraphsubgraph_nameparent_wrapperpartition_signaturess       rN   createzPythonWrapperCodegen.create  s?      ,,,!---/~/C  $%%rP   c                    d| _         y )Ncall)r(  r   s    rN   r)  z)PythonWrapperCodegen.set_launcher_fn_name  s
     &rP   c                D    | j                   j                  | d|        y )Nz = None  # )r  rv   )r   rz   rB  s      rN   r0  z#PythonWrapperCodegen.write_constant  s    k&:;rP   c           	     T   t         j                  j                  j                         }d}||j                  d|j                   }d}t        t        j                  j                        dkD  rd}| j                  j                  d| dt        j                   d| dd	
       | j                  j                  dd	
       	 ddlm} | j                  j                  dd	
       t        j$                  r| j                  j'                  d       y y # t         t"        f$ r Y >w xY w)Nr  z
# AOT ID: r   zRfrom torch._inductor.codegen.debug_utils import _print_debugging_tensor_value_infoz
                aH  
                from ctypes import c_void_p, c_long, c_int
                import torch
                import math
                import random
                import os
                import tempfile
                from math import inf, nan
                from cmath import nanj
                from torch._inductor.hooks import run_intermediate_hooks
                from torch._inductor.utils import maybe_profile
                from torch._inductor.codegen.memory_planning import _align as align
                from torch import device, empty_strided
                from zq import AsyncCompile
                from torch._inductor.select_algorithm import extern_kernels
                z
            Tr   a  
                aten = torch.ops.aten
                inductor_ops = torch.ops.inductor
                _quantized = torch.ops._quantized
                assert_size_stride = torch._C._dynamo.guards.assert_size_stride
                assert_alignment = torch._C._dynamo.guards.assert_alignment
                empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
                empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
                empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
                reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
                alloc_from_pool = torch.ops.inductor._alloc_from_pool
                async_compile = AsyncCompile()
            )_SymmetricMemoryzs
                empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
                zfrom torch.cuda import nvtx)torch_guardsTracingContexttry_getaot_graph_namer   r   r=  r>  r  r   r   r   r  torch._C._distributed_c10drN  AttributeErrorImportErrorannotate_trainingrv   )r   contextaot_config_commentaot_inductor_debug_utilsrN  s        rN   r,  z!PythonWrapperCodegen.write_header  sD   --..6687#9#9#E#-g.D.D-E!F#% v""CCDqH'{$#$ % $,,- .)* +!$ ' 	 	
* 	  	 	
 	 DKK 	   ##KK!!"?@ $ , 		s   #D D'&D'c                     y rB   rb   )r   r  s     rN   include_extra_headerz)PythonWrapperCodegen.include_extra_header      rP   c                ^    | j                   j                  dt        j                   d       y )Na	  
                import torch
                from torch._dynamo.testing import rand_strided
                from torch._dynamo.utils import preserve_rng_state
                from torch._inductor.select_algorithm import AlgorithmSelectorCache
                from aH   import AsyncCompile

                async_compile = AsyncCompile()
                generate_example_value = AlgorithmSelectorCache.generate_example_value
                empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
                empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
            )r  r   r   r   r   s    rN   r.  z6PythonWrapperCodegen.write_kernel_autotune_defs_header  s3    !!((
 $,,- .	
rP   c                   dt         j                   d}t        j                  j                  r]| j
                  j                  |       | j
                  j                  t        j                  j                  j                  d             t        j                  j                  s`| j                  j                  |d       | j                  j                  t        j                  j                  j                  d             y y )NzU
            import triton
            import triton.language as tl
            from z+ import start_graph, end_graph
            get_raw_streamTr   )r#   r   r   ro   rp   rx   r   rv   r0   rC   r.  import_get_raw_stream_asr,  r  )r   
import_strs     rN   write_triton_header_oncez-PythonWrapperCodegen.write_triton_header_once  s     $,,- .

 ==11&&--j9&&00"";;<LM ww""LL
$7LL"""";;<LM #rP   c                v   t         j                  j                  rB| j                  j	                  t
        j                  j                  j                  d             t
        j                  j                  sC| j                  j	                  t
        j                  j                  j                  d             y y )Nr`  )r   ro   rp   rx   rv   r0   rC   r.  ra  r,  r  r   s    rN   write_get_raw_stream_headerz0PythonWrapperCodegen.write_get_raw_stream_header#  s{    ==11&&00"";;<LM ww""LL"""";;<LM #rP   c                $    | j                          y rB   )re  r   s    rN    write_get_raw_stream_header_oncez5PythonWrapperCodegen.write_get_raw_stream_header_once-  s    ((*rP   c                   t        |      }|| j                  vrdt        | j                         }|| j                  |<   | j                  j	                  | d|        t
        j                  j                  r;| j                  j	                  | d|        | j                  j                  |       | j                  |   S )Nmetar   )reprr8  r   r  rv   r   ro   rp   rx   r9  r   )r   ri  vars      rN   add_meta_oncez"PythonWrapperCodegen.add_meta_once1  s    Dzt{{"T[[)*+C #DKKKK!!SETF"34}}55**44uCv5FG##C({{4  rP   c                z    | j                         D cg c]  }|j                  | j                         c}S c c}w rB   )get_graph_outputsrT  r  r   r   s     rN   get_output_refsz$PythonWrapperCodegen.get_output_refs<  s<     =A<R<R<T
78A 1 12
 	
 
s   "8c                     y rB   rb   r   s    rN   mark_output_typez%PythonWrapperCodegen.mark_output_typeB      rP   c                6    t         j                  j                  S rB   )r0   rC   graph_inputsr   s    rN   get_graph_inputsz%PythonWrapperCodegen.get_graph_inputsE  s     ww###rP   c                6    t         j                  j                  S rB   )r0   rC   graph_outputsr   s    rN   rn  z&PythonWrapperCodegen.get_graph_outputsJ  s    ww$$$rP   c           
        | j                         j                         D ]  \  }}t        |t        j                  t
        j                  f      r1|t        j                  j                  vst        |t
        j                        rht        |j                               dk(  r| j                  |j                               }| j                  |j                               }| j                  j!                  d| d| d| d        y )Nr   zassert_size_stride(r  r   )rv  r   r[   r\   r   r   TorchBindObjectr0   rC   graph_input_namesGeneratorStater,   r  rn   r  r  rv   )r   rz   bufr  r  s        rN   codegen_input_size_assertsz/PythonWrapperCodegen.codegen_input_size_assertsM  s    ..0668 	SID##

B,>,>?@ 177444
R&&9  S\\^,1223<<>BD44S^^5EFFKK!!$7vRvRxq"QR	SrP   c                `   | j                   j                  d       | j                         j                         D ]r  \  }}t	        |t
        j                  t        j                  f      r1d| d}| j                   j                  |       d| d}| j                   j                  |       t y )Nz(# make sure graph inputs are not nan/infzassert not z.isnan().any().item()z.isinf().any().item())	r  rv   rv  r   r[   r\   r   r   rz  )r   rz   r}  ry   s       rN   codegen_input_nan_assertsz.PythonWrapperCodegen.codegen_input_nan_asserts_  s    HI..0668 	(ID##

B,>,>?@ &;<DKK!!$' &;<DKK!!$'	(rP   c                :    | j                   j                  d       y )NzV

            async_compile.wait(globals())
            del async_compile
            )r  r   r   s    rN   write_async_compile_waitz-PythonWrapperCodegen.write_async_compile_waitj  s    	
rP   c                    dj                  |      }t        |      dk(  r|dz  }| j                  j                  | d       | j                  j                  d       y )Nr  r1   ,z = argszargs.clear())r   r   r  rv   )r   input_nameslhss      rN   
write_argszPythonWrapperCodegen.write_argss  sP    ii${q 3JCWo.n-rP   c                    t         j                  r| j                  j                  d       d}|S | j                  j                  d| j                   d       d}|S )Na  
                class Runner:
                    def __init__(self, partitions):
                        self.partitions = partitions

                    def recursively_apply_fns(self, fns):
                        new_callables = []
                        for fn, c in zip(fns, self.partitions):
                            new_callables.append(fn(c))
                        self.partitions = new_callables

                    def call(self, args):
                r   z
                def z(args):
                r1   )r   graph_partitionr  r   r(  r   prefix_indents     rN   !write_launcher_fn_call_get_indentz6PythonWrapperCodegen.write_launcher_fn_call_get_indentz  sm    !!KK M  KK**+ ,
 MrP   c                6    t         j                  j                  S rB   )r0   rC   r{  r   s    rN   get_graph_input_namesz*PythonWrapperCodegen.get_graph_input_names  s    ww(((rP   c                   | j                   J | j                          | j                         }| j                  j	                  |      5  t
        j                  j                  rA| j                  j                  t        j                  j                  j                                t        j                  j                         }t
        j                  r| j                  j                  d| d       | j                         x}r| j!                  |       | j#                          | j%                          d d d        y # 1 sw Y   y xY w)Nz0training_annotation = nvtx._device_range_start(''))r(  r  r  r  r   r   ro   debug_sync_graphrv   r0   rC   r.  synchronizeget_training_phaserW  r  r  codegen_inputs"codegen_input_size_and_nan_asserts)r   r  phaser{  s       rN   r-  z!PythonWrapperCodegen.write_prefix  s    $$000%%'>>@[[. 	6}}--%%agg&8&8&D&D&FGGG..0E''%%FugRP %)$>$>$@@ @ 12!335	6 	6 	6s   
C,D??Ec                    t         j                  r| j                          t         j                  r| j	                          y y rB   )r   size_assertsr~  nan_assertsr  r   s    rN   r  z7PythonWrapperCodegen.codegen_input_size_and_nan_asserts  s1    ++-**, rP   c                   | j                          d| }t        j                  j                  r=| j                  j                  | d| d       t        j                  j                  r|S | j                  | d| d       |S )Nstream = get_raw_stream(r   )	rg  r   ro   rp   rx   rv   r0   rC   r,  )r   r&  rm  rz   s       rN   r6  z)PythonWrapperCodegen.write_get_raw_stream  s    --/
|$==11&&00&*:,a8 ww""$1*Q?@rP   c                     | j                   d   S )N)r*  r   s    rN   get_codegened_graphz(PythonWrapperCodegen.get_codegened_graph  s    ))"--rP   c                :    | j                   j                  |       y rB   )r*  r   )r   rC   s     rN   r  z)PythonWrapperCodegen.push_codegened_graph  s    ""))%0rP   c                6    | j                   j                         S rB   )r*  r   r   s    rN   r   z(PythonWrapperCodegen.pop_codegened_graph  s    ))--//rP   c                P    ddl m} | j                  j                   ||            S )Nr   )deepcopy)copyr  r+  r   )r   r  r  s      rN   r  z(PythonWrapperCodegen.push_computed_sizes  s!    !((//0HIIrP   c                6    | j                   j                         S rB   )r+  r   r   s    rN   r  z'PythonWrapperCodegen.pop_computed_sizes  s    ((,,..rP   c                .    t        | j                         S rB   )nextr  r   s    rN   next_kernel_suffixz'PythonWrapperCodegen.next_kernel_suffix  s    t''()*rP   c                n   | j                  t        || j                               t        j                  j
                  r| j                          | j                  j                  dt        j                  j                  j                  |       d       | j                  j                          | j                  j                  t        j                  j                  j                  |             t        |       r| j                          | j                  j                  d| d| d       || _        y )Nr*  r+  r  r  r   )rv   r%  r'  r   ro   rp   rc  rx   r0   rC   r.  r1  r  r2  r)   re  )r   r&  s     rN   codegen_device_guard_enterz/PythonWrapperCodegen.codegen_device_guard_enter  s    )*d6W6WX	
 ==11))+&&00**77
CDAF &&002&&00""--j9 348002&&00$6zl!D -7)rP   c                    | j                  t                      t        j                  j                  r| j
                  j                          y y rB   )rv   r7  r   ro   rp   rx   r!  r   s    rN   codegen_device_guard_exitz.PythonWrapperCodegen.codegen_device_guard_exit  s6    356==11&&224 2rP   c                   |r,t         j                  r| j                  j                  ddj	                  |      z   dz          | j                  j                  d       | j                  j                          | j                  j                  d       | j                  j                          | j                  j                  d       | j                  j                  d       | j                  j                  d       | j                  j                  d	dj	                  |      z   dz          y | j                  j                  d
       y )Nzreturn_vars = (r  , )zfor var in return_vars:z!if isinstance(var, torch.Tensor):z#assert not var.isnan().any().item()z#assert not var.isinf().any().item()r   zreturn (z	return ())r   r  r  rv   r   r  r!  )r   output_refss     rN   generate_returnz$PythonWrapperCodegen.generate_return  s   !!!!++%		+(>>F !!++,EF!!++-!!++,OP!!++-!!++,QR!!++,QR!!--a0''
TYY{5K(Ke(ST''4rP   c                     y rB   rb   r   results     rN   generate_before_suffixz+PythonWrapperCodegen.generate_before_suffix  rs  rP   c                    t         j                  rNdj                  | j                        t	        | j                        dk(  rdndz   }|j                  d| d       y y )Nr  r1   r  r  z-
                runner = Runner(partitions=[z{])
                call = runner.call
                recursively_apply_fns = runner.recursively_apply_fns
                )r   r  r   all_partition_namesr   r   )r   r  all_partition_name_lists      rN   generate_after_suffixz*PythonWrapperCodegen.generate_after_suffix  se    !!&*ii0H0H&I43349r'# MM--D,E F "rP   c                     y rB   rb   r  s     rN   generate_endz!PythonWrapperCodegen.generate_end  rs  rP   c                :    | j                  t        | |             y rB   )rv   r=  r   rK   s     rN   generate_fallback_kernelz-PythonWrapperCodegen.generate_fallback_kernel  s    ,T489rP   c                    |j                  |        | j                  t        | |             t        |j                  t
        j                        r|j                  |        y y rB   )codegen_commentrv   r=  r[   r  r   Layoutcodegen_size_assertsr  s     rN   generate_extern_kernel_allocz1PythonWrapperCodegen.generate_extern_kernel_alloc  sI    T",T489dkk299-%%d+ .rP   c           
        t        |j                  t        j                        }|j	                         }|j                         }|j                         }| j                  }t        j                  r	d|v rd| }|r5| j                  | j                   | ddj                  |       d|        y | j                  | j                   | d| ddj                  |       d|        | j                  rKt        j                  r:|7t        d   dxx   d	z  cc<   | j                  d
|j                   d| d       y y y y )Nview_as_complexz.clone()r  r  r   r   inductorintermediate_hooksr1   zrun_intermediate_hooks()r[   r  r   
NoneLayoutrE   get_origin_noderM  r  r   memory_planningrv   r  r   r%  generate_intermediate_hooksr   rz   )r   extern_kernelrC  	no_returnoutput_nameorigin_noderV  r  s           rN   rB  z9PythonWrapperCodegen._generate_extern_kernel_alloc_helper!  s;    }33R]]C	#,,.#335#335!!&7;&F  x(FNNdll^K=$))D/9J!F8TUNN<<.SQtyy>OqQWPXY 0066+$%9:a?:-k.>.>-AK=PQR , 7 1rP   c                \    |j                  |        | j                  t        | |             y rB   )r  rv   rG  r  s     rN   generate_extern_kernel_outz/PythonWrapperCodegen.generate_extern_kernel_out>  s&     	T"*467rP   c                    t         j                  j                  j                  }|j	                  ||d d d       |j                  d|r|n|        |5  | j                  | ddj                  |       d       d d d        y # 1 sw Y   y xY w)Nexternzout=r  r  r   )r0   rC   wrapper_coder@  set_printer_argsr   rv   r   )r   r   outout_viewrC  rX  debug_printer_managers          rN   rS  z7PythonWrapperCodegen._generate_extern_kernel_out_helperE  s     !" 4 4 B B..tVT4Rdx8S9:;" 	;NNfXQtyy&7q9:	; 	; 	;s   'BBc                    |j                   }|j                  }|r$t        d |D              }t        d |D              }|j                  j	                          d}dj                   fd|D              }dj                   fd|D              }t        j                   |j                        }d}| d|j                   d	}| d| d| d| }	| d
|	 d}
|
S )Nc              3  n   K   | ]-  }t         j                  j                  j                  |       / y wrB   r0   rC   rI   atomically_apply_size_hintrc   rW  s     rN   re   zRPythonWrapperCodegen._generate_tma_descriptor_call_experimental.<locals>.<genexpr>X  s%     VA))DDQGV   35c              3  n   K   | ]-  }t         j                  j                  j                  |       / y wrB   r  r  s     rN   re   zRPythonWrapperCodegen._generate_tma_descriptor_call_experimental.<locals>.<genexpr>Y  s*      CD  ;;A>r  z.data_ptr()r  c              3  J   K   | ]  }t         j                  |        y wrB   r  val_to_arg_strrc   dimr   s     rN   re   zRPythonWrapperCodegen._generate_tma_descriptor_call_experimental.<locals>.<genexpr>_  s     XC-<<T3GX    #c              3  J   K   | ]  }t         j                  |        y wrB   r  r  s     rN   re   zRPythonWrapperCodegen._generate_tma_descriptor_call_experimental.<locals>.<genexpr>`  s$      
?B //c:
r  z$triton.tools.experimental_descriptorz.create_d_tma_descriptorr  r   )
dims
block_dimsrm   tensorrT  r   r  r  element_sizerank)r   descapply_size_hintsr  r  ptrr  r  r   rC  rK  s   `          rN   *_generate_tma_descriptor_call_experimentalz?PythonWrapperCodegen._generate_tma_descriptor_call_experimentalT  s    yy__
VQUVVD HR J ..01=yyXSWXXYY 
FP
 

 ,::4ARARS7xx		{*:;bbB|n=QtfArP   c                    |j                   }|rt        d |D              }d}| d}|j                  j                          d| }| d| d}|S )Nc              3  n   K   | ]-  }t         j                  j                  j                  |       / y wrB   r  r  s     rN   re   zLPythonWrapperCodegen._generate_tma_descriptor_call_stable.<locals>.<genexpr>m  s*       CD  ;;A> r  z/triton.tools.tensor_descriptor.TensorDescriptorz.from_tensorr  r  r   )block_shaperm   r  rT  )r   r  r  r  r  r   rC  rK  s           rN   $_generate_tma_descriptor_call_stablez9PythonWrapperCodegen._generate_tma_descriptor_call_stablej  so    &&  HS  K Cx|$++//12"[MBQtfArP   c                    t        |t        j                        r| j                  ||      S t        |t        j                        sJ | j                  ||      S rB   )r[   r   TMADescriptorExperimentalr  TMADescriptorStabler  )r   r  r  s      rN   _generate_tma_descriptor_callz2PythonWrapperCodegen._generate_tma_descriptor_callw  sW    dB889BB&  dB$:$:;;;<<TCSTTrP   c                    | j                  |      }|j                   d| | j                   }| j                  |       y Nr   )r  rz   r  rv   )r   r  rK  ry   s       rN   generate_tma_descriptorz,PythonWrapperCodegen.generate_tma_descriptor  s:    11$7))Cvdkk]3trP   c                    | ddj                  t        t        |             }|j                  d      r|dj                  dg|z         z  }n|r|dt	        |       z  }|dz  }| j                  |       y )Nr  r  zaten.scatter_reducer  r  z	, reduce=r   )r   mapr   r   rj  rv   )	r   r{   inputsrN  python_kernel_namesrc_is_tensorr  r   ry   s	            rN   generate_scatter_fallbackz.PythonWrapperCodegen.generate_scatter_fallback  s{     %%QsxxC0@'A&BC(()>?DIIrdVm,,D)DL>22trP   c                |    ddj                  |       d}||||g}| j                  | j                  ||             y )Nr  r  r  )r   rv   wrap_kernel_call)r   r   r   r  values
accumulateindices_strrC  s           rN   generate_index_put_fallbackz0PythonWrapperCodegen.generate_index_put_fallback  sA    $))G,-Q/;
3t,,VT:;rP   c           
     `    | j                  | d| ddj                   |              d       y )Nr   r  r  r   )rv   r   )r   buf_namer  get_argsop_overloadri  outputss          rN   ,generate_fallback_kernel_with_runtime_lookupzAPythonWrapperCodegen.generate_fallback_kernel_with_runtime_lookup  s2     	(3'9&:!DIIhj<Q;RRSTUrP   c                f    t        d      5  | j                  |      cd d d        S # 1 sw Y   y xY w)NzPythonWrapperCodegen.generate)r   	_generater   is_inferences     rN   generatezPythonWrapperCodegen.generate  s,    9: 	0>>,/	0 	0 	0s   '0c                &    t         j                  ryy)Nr   r1   )r   r  r   s    rN   get_wrapper_call_indentz,PythonWrapperCodegen.get_wrapper_call_indent  s    !!rP   c              #  b   K   | j                   }	 || _         | || _         y # || _         w xY wwrB   rv   )r   newolds      rN   set_writelinez"PythonWrapperCodegen.set_writeline  s.     nn	! DNI DNSDNs   /# /	,/c                    | j                   j                  }t        j                  j                  r| j
                  j                  |       y | j                  j                  |       y rB   )r:  kernel_defsr   ro   rp   r  r   r  )r   r  s     rN   _write_multi_kernel_defsz-PythonWrapperCodegen._write_multi_kernel_defs  sF    --99==11%%,,[9KK{+rP   c                	   t         j                  r| j                          t        j                         5 }|j                  | j                  j                                t         j                  r| j                  |       t         j                  r| j                          | j                  |       t         j                  j                  r*t         j                  j                  s| j                          | j!                  | j                  j"                        5  | j$                  D ]I  }t'        |t(              r|j+                  | j                         /| j                  j#                  |       K 	 d d d        | j-                          | j/                         }| j1                          t         j                  j2                  rA| j                  j#                  t4        j6                  j8                  j;                                t         j                  r| j=                          t         j                  j                  r*t         j                  j                  s| j?                          t         j                  j                  r| jA                          t         jB                  r+t         jD                  s| j                  j#                  d       | jG                  |       d d d        tI               }|jK                  | jL                         |j#                  d       |jK                  | jN                         t4        j6                  jP                  r>t4        j6                  jD                  r$t4        j6                  jR                  r
tI               }|jK                  | jT                         | jW                          |jK                  | jX                         | j[                         }|j                  |      5  |jK                  | j                         d d d        | j]                  |       |jK                  | j^                         | ja                  |       | jc                  |       | je                  |       |jg                         | jh                  jg                         fS # 1 sw Y   4xY w# 1 sw Y   xY w# 1 sw Y   xY w)Nz+nvtx._device_range_end(training_annotation)r  )5r   profile_bandwidthrc  r   	ExitStackenter_contextr  r   profiler_mark_wrapper_call#generate_profiler_mark_wrapper_callgenerate_start_graphrun_wrapper_ir_passesro   store_cubinrp   !generate_reset_kernel_saved_flagsr  rv   r  r[   r@   r  r  rp  rr  r  r0   rC   r.  r  generate_end_graph generate_save_uncompiled_kernelsgenerate_and_run_autotune_blockrW  r,  r  r(   r   r  r  r-  is_const_graphr  finalize_prefixr  r  r  r  r  r  add_benchmark_harnessgetvaluewithlinemapr  )r   r  stackry   r  r  wrapper_call_indents          rN   r  zPythonWrapperCodegen._generate  se   ##))+!!# *	.u 1 1 8 8 :;0088?''))+&&|4}}((1W1W668 ##D$5$5$?$?@ : JJ :D!$4T%6%67))33D9	:: ))+..0K!!#}}--!!++AGG,>,>,J,J,LM'''')}}((1W1W557}}55446 ''0B0B!!++A   -U*	.Z  !dll#dkk" 77 3 38N8N#%F 	d//0dkk""::<]]./ 	-MM$++,	- 	##F+dkk"""6*&!""6* &&($$88:
 	
u: :*	. *	.z	- 	-s2   C%R3AR&4E-R39S &R0	+R33R= S	c                6   | j                   j                  d       i }t        j                  j                  r_t
        j                  j                  rEt        t
        j                  j                        D ci c]  \  }}| j                  |      | }}}| j                   j                         dz   | j                  j                         z   }t        j                  t        j                  k(  rkt!        j"                  t%               dd      5 }|j'                  |j)                  d             |j*                  }ddd       t        j,                  d       	 t/        ||       yc c}}w # 1 sw Y   3xY w# t0        $ r}t3        d	|       |d}~ww xY w)
z
        Compose self.kernel_autotune_defs and self.kernel_autotune_calls into a single block of
        code and execute it to trigger Triton kernel compilation and auto-tuning
        zQ
            async_compile.wait(globals())
            del async_compile
        r)  z.pyF)dirr  deletezutf-8NzAuto-tuning code written to %sz%Failed to run autotuning code block: )r  r   r   ro   rp   r0   rC   autotuning_inputs	enumerateget_autotuning_input_namer   rx   r    levelloggingDEBUGtempfileNamedTemporaryFiler   writeencoderz   debugexec	ExceptionRuntimeError)r   scopeidxvtuning_codef	file_pathes           rN   r(  z4PythonWrapperCodegen.generate_and_run_autotune_block  sp   
 	!!((	
 ==11agg6O6O ((A(ABC ..s3Q6E 
 %%..0((1134 	
   GMM1 ,,Ke #**734FF		#
 !!0
	Se$/# #  	S!FqcJKQRR	Ss*   8E)-E/E; /E8;	FFFc                \    ddl m}  ||       j                  | j                        | _        y )Nr1   )MemoryPlanner)r  rH  r  r  )r   rH  s     rN   memory_planz PythonWrapperCodegen.memory_plan;  s     2"4(--djj9
rP   c                   t         j                  j                         }| j                  rt	        | j                  d   t
              r| j                  d   j                  j                  |vri| j                  j                          | j                  rCt	        | j                  d   t
              r&| j                  d   j                  j                  |vrit               g}g }t        t        | j                              D ]  }| j                  |   }t	        |t
              r"|j                  |d         | j                  |<   Dt	        |t              r|j                  t                      nt	        |t              s|j                  |j                                 |j                  |j                                t        |      dk(  sJ t!        d |D              }y )Nr  r   c              3  4   K   | ]  }|j                     y wrB   )r   )rc   ss     rN   re   z9PythonWrapperCodegen.memory_plan_reuse.<locals>.<genexpr>]  s      +
./A))+
s   )r0   rC   get_output_namesr  r[   r  rK   rz   r   r   ranger   r  r  r   r  sum)r   	out_namesplanning_statespast_planning_statesr  ry   _total_allocated_buffer_sizes          rN   memory_plan_reusez&PythonWrapperCodegen.memory_plan_reuse@  s}   GG,,.	 JJ4::b>+=>

2##((	9 JJNN JJ4::b>+=>

2##((	9 /01!s4::' 	CA::a=D$ 23 $		/"*= >

1D"34&&':'<=D"23$++O,?,?,AB	C 	##O$7$7$9:?#q(((
 (+ +
3G+
 (
$rP   c                j    |r!t         j                  r| j                          y | j                          y rB   )r   r  rI  rT  r  s     rN   r#  z*PythonWrapperCodegen.run_wrapper_ir_passesa  s%    F22""$rP   c           	        	 | j                   	t        j                  	fd       }t        j                  	fd       }t        |t        j
                        rGt        |t        j                        r||v ry 	j                  | d|        |j                  |       y t        |t        j                        rt        |j                               D ]V  \  }}t        |t        j                        s!||vs&	j                  | d ||       d| d       |j                  |       X t        |j                               D ]V  \  }}t        |t        j                        s!||vs&	j                  | d ||       d| d       |j                  |       X y t        |t        j                        ry t        |t        j                        ry t         j"                  j$                  j&                  ry t)        dt+        |             )Nc                <    j                  |  d|  d       |  dS )Nz_size = z.size()_sizer  rz   r  s    rN   sizeofzDPythonWrapperCodegen.codegen_input_symbol_assignment.<locals>.sizeofp  s(    NNdV8D69:V5>!rP   c                <    j                  |  d|  d       |  dS )Nz
_stride = z	.stride()_strider  rY  s    rN   strideofzFPythonWrapperCodegen.codegen_input_symbol_assignment.<locals>.strideofu  s)    NNdV:dV9=>V7##rP   r   r  r  zUnknown value type: )r  r  r7  r[   r\   r   Symbolrv   r   r   	TensorBoxr3  r  r  rz  r|  rO  	_inductorr   r  r  ri   )
r   rz   r   
bound_varsrZ  r]  r  r  r  r  s
            @rN   codegen_input_symbol_assignmentz4PythonWrapperCodegen.codegen_input_symbol_assignmenth  s    {{		" 
	" 
	$ 
	$ eUZZ(eU\\2ez6INNeWCv./NN5!r||,&u~~'78 )	TdELL1d*6LNNdV3vd|nAcU!#DENN4()  ))9)9);< +Vfell3j8PNNfXS$0@#a#HINN6*+ r112r001%%55$';DK=%IJJrP   c           	        t        t        j                            }| j                         }|j	                         D cg c]$  \  }}t        |t        j                        s!||f& c}}|j	                         D cg c]$  \  }}t        |t        j                        r!||f& c}}z   }|D ]  \  }}| j                  |||        	 	 	 	 dd}|D ])  \  }	}t        |t        j                        s! |||       + yc c}}w c c}}w )z$Assign all symbolic shapes to localsc                P   t        j                  | j                         | j                         g      D ]k  }t	        |t
              rt	        |t        j                        r.|j                  D cg c]	  }||vs| }}t        |      dkD  s[t        d| d| d       y c c}w )Nr   zFor z, expected z to have been codegen-ed.)r   from_iterabler  r  r[   r   r\   r^  free_symbolsr   r  )r   ra  exprsymundefined_symbolss        rN   _verify_input_symbol_assignmentzLPythonWrapperCodegen.codegen_inputs.<locals>._verify_input_symbol_assignment  s     ++U^^-=u?O?O?Q,RS 
!$-D%,,1O $(#4#4%:8MC%! % ()A-(tfK0A/BB[\ 
%s   0	B#:B#N)r   ir.TensorBoxra  OrderedSet[sympy.Symbol])	r   r\   r^  rv  r   r[   rb  r   r_  )
r   ra  ru  krB  r  rz   r   rj  _s
             rN   r  z#PythonWrapperCodegen.codegen_inputs  s    -/
 ,,.+113
q!z!U\\7RQF
 , 2 2 4X1Jq%,,<WaVXY " 	JKD%00ujI	J		0	&  	?HAueR\\2+E:>	?3
Xs   "D"D>"D!Dc                P   t        |t        j                        rt        |t        j
                        rq|| j                  v ry | j                  j                  |       t        j                  j                  j                  |   }| j                  | dt        |              y y y r  )r[   r\   r^  r   r   PRECOMPUTED_SIZEr  r   r0   rC   rI   inv_precomputed_replacementsrv   pexpr)r   rh  rg  s      rN   ensure_size_computedz)PythonWrapperCodegen.ensure_size_computed  s    c5<<(^CAVAV-Wd)))##C(77##@@EDNNcU#eDk]34 .X(rP   c                     y rB   rb   r   s    rN   r*  z$PythonWrapperCodegen.finalize_prefix  r]  rP   rJ   c                   t        d      )Nz8codegen_cpp_sizevar is only implemented for cpp_wrapper!)r?  r   r   rJ   s      rN   codegen_cpp_sizevarz(PythonWrapperCodegen.codegen_cpp_sizevar  s    UVVrP   c                   t        ||      S )Nru  )rr  rw  s      rN   codegen_python_sizevarz+PythonWrapperCodegen.codegen_python_sizevar  s    Q**rP   c                $    | j                  |      S rB   )rz  ro  s     rN   codegen_sizevarz$PythonWrapperCodegen.codegen_sizevar  s    **1--rP   c                    | d| dS )Nr  r  rb   )r   r  rz   r  s       rN   r  z)PythonWrapperCodegen.codegen_tuple_access  s    1UG1%%rP   c                    g t        | j                  |      }t        |      dk(  ryt        |      dk(  r	d|d    dS ddj                  |       dS )Nr   ()r1   r  r  r  r   )r  rz  r   r   )r   r  partss      rN   rn   z/PythonWrapperCodegen.codegen_python_shape_tuple  s^    :#d1159:u:?u:?uQxj$$499U#$A&&rP   c                $    | j                  |      S rB   )rn   )r   r  s     rN   r  z(PythonWrapperCodegen.codegen_shape_tuple  s    ..u55rP   c                    dj                  dj                  |t        |      t        |      | j	                  |      | j	                  |      g            S )Nzalloc_from_pool({})r  )formatr   rr  r   rn   )r   rz   offsetr   r  r  s         rN   codegen_alloc_from_poolz,PythonWrapperCodegen.codegen_alloc_from_pool  sS    $++II&MJ33E:33F;

 
	
rP   c                   ||j                   j                  k(  rk||j                   j                  k(  rR||j                   j                  k(  r9|&||j                  k7  rd|j                          d| dS |j                          S | j                  |      }| j                  |      }| j                  |      }|/||j                  k7  r d|j                          d| d| d| d| dS d|j                          d| d| d| d	S )Nzaten.view.dtype(r  r   z#aten.view.dtype(reinterpret_tensor(z), zreinterpret_tensor()r  r  r  r  r   rE   rn   r|  )r   datar  r  r  rv   r   s          rN   codegen_reinterpret_viewz-PythonWrapperCodegen.codegen_reinterpret_view  s    DKK$$$$++,,,$++,,, Udjj%8)$--/):"UG1EE--/*+2248D44V<F))&1F Udjj%8<T]]_<MRPTvUWX^W__abhaiilmrlsstuu *$--/):"TF"VHBvhVWXrP   c                8    | j                  | d| d| d       y )Nz.copy_(r  r   r  )r   r   dstnon_blockings       rN   codegen_device_copyz(PythonWrapperCodegen.codegen_device_copy  s!    #gcU"\N!<=rP   c                    |j                         }|j                  d   j                         }| j                  t        | |||j                               y r   )rE   r  rv   r  r  )r   rK   r  r  s       rN   codegen_multi_outputz)PythonWrapperCodegen.codegen_multi_output
  s>    mmo;;q>**,t[(DLLQRrP   c                   d |j                   D        \  }t        |j                        dk(  r#| j                  |j                   d| d       nkt        |j                        dk(  r@t        |j                  d   t              r#| j                  |j                   d| d       nt        |j                        dk(  rt        |j                  d   t              r| j                  |j                   d| d       | j                  d	|j                   d
|j                  d   j                   d|j                   d|j                  d   j                   d	       | j                  |j                   d|j                   d|j                  d   j                          nt        d|j                         | j                  |j                          d       y )Nc              3  <   K   | ]  }|j                           y wrB   )rT  )rc   ts     rN   re   z>PythonWrapperCodegen.codegen_dynamic_scalar.<locals>.<genexpr>  s     >Q1&&(>s   r   r   .item()r1   z = 1 if z.item() else 0z_undivided = zassert z_undivided % z
 == 0, f'{z_undivided} not divisible by 'z_undivided // unrecognized keypath z = None)r  r   keypathrv   rh  r[   r   r   divisorr  rE   )r   rK   r  s      rN   codegen_dynamic_scalarz+PythonWrapperCodegen.codegen_dynamic_scalar  s   >$++>t||!NNdhhZs4&89!#
4<<?M(RNNdhhZxv^DE!#
4<<?K(PNNdhhZ}TF'BCNN$((=a1H1H0I Jxxj >t||A?V?V>WWXZ NN88*CzQ8O8O7PQ !#8!GHH 	$--/*'23rP   c           
     0     fd}fd}fd}j                  g d       j                         5  j                  dd       t        j                  j
                  j                         D ]U  \  }}j                  d|         |||j                         |j                         |j                  |j                         W t        t        j                  j                        d	kD  r^j                  d
       t        j                  j                  j                         D ]"  \  }}j                  d|         |||       $ t        j                  j                  j                         D ]  \  }}t        |t         j"                        rCt        t        j                  j$                  j&                  j)                  |d       t*              rdt        |t,        j.                        rct        t        j                  j                        d	k(  rj                  d
       j                  d|         |||j1                                t        |t         j2                        r4 ||t        j                  j$                  j5                  |d             /t        |t,        j6                        r# ||d|j                  j8                   d       l|j;                         D cg c]-  }t        j                  j$                  j5                  |d      / }	}|j=                         D cg c]-  }t        j                  j$                  j5                  |d      / }
} |||	|
|j?                         |jA                                " ddjC                  t        j                  j                  jE                                d}j                  d|        j                  d       d d d        y c c}w c c}w # 1 sw Y   y xY w)Nc                    j                  |  dj                  |       dj                  |       d| d| d
       y )Nz = rand_strided(r  
, device='	', dtype=r   )rv   rn   )rz   r  r  rX  r   r{   r   s        rN   add_fake_inputzFPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_fake_input%  sT    &(2259:"226:; <!()E7!5rP   c                2    j                  |  d|        y r  r  )rz   r   r{   s     rN   add_expr_inputzFPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_expr_input-  s    vS./rP   c                    dd l }t        |t        j                        sJ j	                  |  d|j                  |      d       y )Nr   z = pickle.loads(r   )pickler[   rO  ScriptObjectrv   dumps)rz   r   r  r{   s      rN   add_torchbind_inputzKPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_torchbind_input0  sB    eU%7%7888v%5fll56I5LANOrP   )r  r  z3def benchmark_compiled_module(times=10, repeat=10):z
                from torch._dynamo.testing import rand_strided
                from torch._inductor.utils import print_performance
                Tr   zglobal r   zimport pickle*   fallbackztorch.cuda.default_generators[z].graphsafe_get_state()zcall([r  z])zfn = lambda: z8return print_performance(fn, times=times, repeat=repeat))#
writelinesr   r   r0   rC   	constantsr   rv   r  r  rX  r   r   torchbind_constantsru  r[   r\   r^  rI   
var_to_valr   r   r   rz  get_real_objr   	size_hintr|  r  r  r  rO  rH   r   keys)r   r{   r  r  r  rz   r   torchbind_objr   r  r  call_strs   ``          rN   benchmark_compiled_modulez.PythonWrapperCodegen.benchmark_compiled_module$  sy   		0	P 	K	
 ]]_ E	YMM     !ww00668 e   74&!12%**,ekk	 177../!3  1+,77+F+F+L+L+N ='D- $$wtf%56'm<	=  !ww3399; (eeU\\2zGG$$//33E4@,8 eR%7%781776671<((9$$wtf%56'e.@.@.BCuzz2
 #4)9)9)C)CETV)C)WXr'8'89"89K9K8LLcd "'!1 ((221r2BE  "'!1!1!3 ((221r2BF  #((*)E(T  		!''*>*>*C*C*E FGrJH}XJ78WXKE	Y E	YfoE	Y E	Ys+   J>P42P&P:2P,BP
PPc                    t         j                  sy| j                  |       |j                  g d       |j	                         5  |j                  ddt                dg       ddd       y# 1 sw Y   yxY w)zL
        Append a benchmark harness to generated code for debugging
        N)r  r  zif __name__ == "__main__":zBfrom torch._inductor.wrapper_benchmark import compiled_module_mainzcompiled_module_main('z', benchmark_compiled_module))r   benchmark_harnessr  r  r   r'   r   r{   s     rN   r+  z*PythonWrapperCodegen.add_benchmark_harness  ss     ''&&v.@A]]_ 	X,-?-A,BB_`	 	 	s    A//A8c           
     D    | j                  t        | |||||             y rz  )rv   rt  )r   rV  ru  rv  rw  rx  s         rN   define_kernelz"PythonWrapperCodegen.define_kernel  s*     	 !-		
rP   c                ,    |r| dnd}d| |  d| }|S )Nr)  r  z

r   rb   )rV  ru  rv  metadata_commentbodys        rN   _format_kernel_definitionz.PythonWrapperCodegen._format_kernel_definition  s1     /7hZr?B&'}C}ErP   c                *   t         j                  j                  rJ| j                  ||d       }| j                  j                  |       t        j                  j                  ry | j                  |||      }| j                  j                  |       y )N)rv  )
r   ro   rp   r  r  r   r0   rC   r,  r  )r   rV  ru  rv  rw  rx  r  s          rN   r{  z*PythonWrapperCodegen._define_kernel_helper  s     ==1111[4 2 D %%,,T2ww""--x . 
 	4 rP   c                :    | j                   j                  |       y rB   )r  r   )r   fn_codes     rN   define_subgraph_launcher_fnz0PythonWrapperCodegen.define_subgraph_launcher_fn  s    !!((1rP   c                  ,-./01 ddl m}m}m}	 ddlm,m}
m}m}m	} ddl
m}m} |j                  }g 1i /g .g }.1fd-d-,-/fd	}t        |j                        D ]  \  }}||j                   v r || ,|      d	
       (|vr-|   }|    || ,|      d	       Jt#        |t$        j&                        r[t#        |t$        j(                        r'd|j*                  |j,                  j/                         fnd\  }}} || |||||             t#        |t$        j0                        r0 || |||j3                         |j/                                      	t#        |t$        j4                        rO || |||j6                  j3                         |j/                         |j8                  j:                               rt#        |t<        t>        j@                  f      xr* tB        jD                  jF                  jI                  |d      } || |||      |        tK        1d .|j                  D cg c]  }tM        |       c}      }|tO        jP                  tB        jD                  jS                               i /tT        jW                  |d      tY        1.      gd}|rt[        |      |d<   |rt[        |      |d<   t]        |      dk(  r0|j_                         }g ta        t>        jb                  |d         }nd.0fd}i 0|D  cg c]  } g ta        ||        }} |rt]        |      t]        |      k(  sJ g }!te        tg        ||      d d	      D ]@  \  } }"|!ji                   ||"      g ta        tj        |       g ta        tl        |       d       B |	j                  |!g ta        tn        0jq                               d}g 0js                         }tu        |jv                        g}#t]        |      dkD  rQjq                         D ]>  }t#        |t$        j0                  t$        j4                  f      r.|#ji                  |       @ |#ji                  to        |             |#jy                  to        |             t[        |#      }#|#| jz                  v rg | jz                  |#   |S | dt]        | jz                         }$t}               }%t~        j                  j                  r|%j                  d|$d       n|%j                  d|d       |$|d <   |j                  |j                                |%j                   |              |%j                  d!g ta        ||      d"|d#|d$       t        |      }&t~        j                  j                  r|&j                  d%| d&d%|$ d&      }&|&j                  d'd(      }&|%j                  |&       tB        jD                  jS                         }'|%j                  d)|'j                   d*       t        j                  |jv                        \  }(})t        j                  |jv                        }*d+|* d,|) }+| j                  |$|%j                         |+       |$|f| jz                  |#<   |$||fS c c}w c c} w )/Nr   )config_to_dict	FixedGridPrecomputedGridr1   )ConstexprArgKernelArgTypeSizeArg	TensorArgTMADescriptorArg)gen_common_triton_importsTritonKernelc                J    j                  |       j                  |        y rB   )r   )rA  r  arg_indices	signatures     rN   add_to_signaturezPPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.add_to_signature  s    S!s#rP   c                   |r?t               r	 | |       |j                  v r|j                     |j                  <   y y |j                  v sJ |r>t               r |  |j                               n	 | |       d|j                  <   y |r4t               r |  |j                               d |j                  <   y  | |       y )Nrz   r1   )r/   rz   )	rA  r  is_constexprequals_1equals_noner  r  r  r   s	        rN   add_argzGPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.add_arg  s    13 %S#.88v% +1*:Ichh' & xx6)))57
 )l.IJ(c2*+Ichh' 57 )l.IJ*.Ichh'$S#.rP   r  T)r  )r  stable)experimentalNN)rz   api_typer  r   )rz   bufferr   )rz   r  r   r  )r  )
size_dtyper  argdefs)r  )r  rX  r  r   restore_valuereset_to_zeror   c                N   t        | t        j                        rdg | j                  }|s| S |j	                  t
               |D ]+  }|v rt        j                  dt                     |<   - t        |       S t        | t              sJ t        j                  |       S )N)r   _launcher_s)r[   r\   r   rf  sortr   r^  r   r.   r   r]   )rg  symbolsrh  extra_launcher_argss      rN   rename_sizes_for_launcherzYPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.rename_sizes_for_launcherp  s    dEJJ/2 1 12G"#LLSL)& "55$38<<)#.A*B)CD4+C0 &d,?@@!$,,,}}T**rP   c                2    t        | d   j                        S r   r   r   s    rN   r   zHPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.<lambda>  s    3qt{{3C rP   r   )r   pythoncpp)	grid_typeprecomputed_gridsr  rn  zasync_compile.triton(z, '''rV  zG
            @triton_heuristics.user_autotune(
                configs=z ,
                inductor_meta=z,
                triton_meta=z{,
                filename=__file__,
                custom_kernel=True,
            )
            @triton.jit
            r}   r  z'''z\'\'\'z''', device_str='r  z# Original path: r+  )FFF)rg  r   r   r   )Mruntime.triton_heuristicsr  r  r  commonr  r  r  r  r  ro   r  r  r   r3  	arg_names
constexprsr[   r   TMADescriptorr  r  r  rH   r  rE   r"   r  r  r  r   r\   r]   r0   rC   rI   statically_known_equalsr;   r2   r$   rI  get_current_device_or_throwr  fromkeysr9   rm   r   setup_grid_as_argsr  sympifyr   r   r   rr  r8   r   r  r  idr   extendr&  r(   r   unique_user_kernel_namesrv   updateinductor_meta_commonr   r   replaceri   inspectgetsourcelinesgetsourcefiler  r   )2r   r   r   r   restore_value_argsreset_to_zero_argsr   r  r  r  r  r  r  r  r  r  original_nameequal_to_1_argsr  rA  r   r  r  r  r   r  r   triton_signaturerl  inductor_metaextra_launcher_call_argsr  rq   r  cfg	cache_keyrz   r   
kernel_srccurrent_devicern  linenosrcfilerv  r  r  r  r  r  r  s2      `                                        @@@@@@rN   !define_user_defined_triton_kernelz6PythonWrapperCodegen.define_user_defined_triton_kernel  s   	
 	

	
 	
 	D)+	$&	!#%'	$"	/ "	/H "&"2"23 9	GHCf'''\s3$G& +Cc{"\s3Fc2#3#34 &c2+A+AB "3??CJJ4H4H4JK9 1Hk5
 (!$%-(3"'	  RYY/!!$#&<<>"%--/  R%7%78 !!$#&88#4#4#6"%--/#&::#4#4	  *c5==1   ''**BB  Cc!2XFs9	Gv -)/)9)9:AWQZ:	
 *&--agg.Q.Q.ST--3
 ''
, +01C+DK(+01C+DK(u:?,5,H,H,JM'FU]]E!H)E'F$+  EGINO<s4d;<OEOSZ3w<777 "#E7#)CT 		c "(("0"5"5Ct$4"52UD!12	 -55%6'PS2E2L2L2N)O'PM
 (E)<)A)A)C'D$ VYY-	w<!}} *!#		23E3E'FG$$S)* 	[)*]+,)$	666//	:( 
  #d&D&D"E!FG(*==11%%(=dXU&KL%%(=m=Ne&TU'+m$\>>@A8:;83~w78; <,/ 0(O ,			
 OvV
==11#++d=/,CtD6QR^TJ''{;
z*<<>!!$5n6I6I5J""MN**6995	6''		2&wiq9$$&	
 6:;4G&&y1[":::K ;j Ps   	Z6'Z;c                    | d|j                    d}||d| z  }t        ||j                        }| j                  t	        | |t
        j                               |S )Nrn  r  )r  r   r  rv   r  r0   rC   )r   rV  treer  rg  r  s         rN   generate_numel_exprz(PythonWrapperCodegen.generate_numel_expr  s_    a}E2axL D dDJJ/*4agg>?
rP   c                j    | j                  |j                   dt        |j                                y r  )rv   r   rr  r   )r   r  rC   s      rN   r  z7PythonWrapperCodegen._generate_symbolic_call_arg_helper  s)     	#))Ccnn(='>?@rP   c                   |j                         }t        | |      }|j                  t        j                  k(  r| j                  |       n1|j                  t        j                  k(  r2| j                  |       | j                  | j                  |             n|j                  t        j                  k(  r| j                  j                  |      }|rRt        |t              rt        |j                  t              sJ t        j                  |j                  |      |_        nV| j                  |       | j                  | j                  |             || j                  |<   nt        |j                        t         j"                  j$                  r| j&                  j                  t(        j+                  | ||j,                  |j.                  t0        j2                  j4                  j7                  |j8                        fd             |j                  t        j                  k7  r0| j&                  j                  t(        j                  | |             y y y )N)r1   )r  r  )rE   r  	zero_moder7   UNINITIALIZEDrv   ZERO_ON_CALLmake_zero_bufferZERO_PER_GRAPHr<  r   r[   rK   r6   maximumr  r   ro   rp   rx   r  make_allocationrX  r   r0   rC   rI   r  r   )r   wsrz   ry   priors        rN   generate_workspace_allocationz2PythonWrapperCodegen.generate_workspace_allocation  s   {{}D"%<<,:::NN4 \\.;;;NN4 NN40067\\.===--11$7E!%6:JJ<   *11%**bA
t$t44T:;26))$/ ..==11&&00$44IIHH77++55bhh?A 5 	 ||0>>>**44(99$E ? 2rP   c                v    |j                   t        j                  k7  r| j                  t	        | |             y y rB   )r  r7   r  rv   r   )r   r  s     rN   generate_workspace_deallocationz4PythonWrapperCodegen.generate_workspace_deallocation	  s.    <<,;;;NN.tR89 <rP   c                $    | d| j                    S )Nz.zero_())r  )r   rz   s     rN   r  z%PythonWrapperCodegen.make_zero_buffer	  s    x}--rP   c                H    | ddj                  |       d| j                   S )Nr  r  r   )r   r  )r   rz   rg  s      rN   r  z%PythonWrapperCodegen.wrap_kernel_call	  s'    q9-.a}==rP   c                    | j                   j                  d       | j                   j                  dt        j                  j                   d       |j                  | j                   j                                y )Nz*from torch.profiler import record_functionzwith record_function('graph_z_inductor_wrapper_call'):)r  rv   r0   rC   graph_idr  r   )r   r-  s     rN   r!  z8PythonWrapperCodegen.generate_profiler_mark_wrapper_call	  sb    ##$PQ##*177+;+;*<<UV	
 	D--4467rP   c                :    | j                   j                  d       y )Nzstart_graph())r  rv   r   s    rN   r"  z)PythonWrapperCodegen.generate_start_graph!	  s    ##O4rP   c                ^    | j                   j                  dt        j                  d       y )Nz
end_graph(r   )r  rv   r   profile_bandwidth_outputr   s    rN   r&  z'PythonWrapperCodegen.generate_end_graph$	  s'    ##j1P1P0SST$UVrP   c                ^    | j                   j                  dt        j                   d       y )NU
            for kernel in globals().values():
                if isinstance(kernel, zU.CachingAutotuner):
                    kernel.cuda_kernel_saved = False
            r  r   r#   r   r   s    rN   r%  z6PythonWrapperCodegen.generate_reset_kernel_saved_flags'	  s2      ''8'A'A&B C	
rP   c                ^    | j                   j                  dt        j                   d       y)a[  
        Precompile and save the CUBINs of the Triton kernels that haven't
        been precompiled and saved as a side effect of running the generated
        JIT model (Python wrapper). This can happen when the model contains
        control flow: only one pass through the control flow operators covers
        the kernels that are saved, the remaining kernels are not launched,
        hence not saved. The main purpose of this codegen is to compile and
        save the Triton kernels outside the active control flow path for
        subsequent AOTInductor code generation and compilation.
        r   a  .CachingAutotuner):
                    if not kernel.cuda_kernel_saved:
                        if len(kernel.launchers) == 0:
                            kernel.precompile()
                        kernel.save_gpu_kernel(
                            grid=(0, 0, 0),   # use dummy grid
                            stream="stream",  # use dummy stream
                            launcher=kernel.launchers[0],
                        )
            Nr!  r   s    rN   r'  z5PythonWrapperCodegen.generate_save_uncompiled_kernels0	  s4     	  ''8'A'A&B 	C	
rP   c                >    d }|D cg c]
  } ||       c}S c c}w )Nc                    t        | t              rt        |       r| dz   S | S t        | t        t        t
        t        f      rt        |       S t        t        j                  j                  j                  |             S )Nr  )r[   r   r:   r   floatr   r   rr  r0   rC   rI   rJ   )r  s    rN   wrap_argzAPythonWrapperCodegen.prepare_triton_kernel_call.<locals>.wrap_argK	  s^    #s#*B3*GsYPSPC#udO!DE3xQWW--66s;<<rP   rb   )r   rg  r&  r  s       rN   prepare_triton_kernel_callz/PythonWrapperCodegen.prepare_triton_kernel_callJ	  s!    	= *33#333s   c                &    t        |t              rt        |t        j                        r.|j	                         j                         } j                  |   }n\ j                  j                  |      r|} j                  |   }n/|J d       d j                   }|} xj                  dz  c_        |
J d|        t        d |j                         D              }t        d t        j                  j                  |      D              }t        d |j                         D              }|j                         }	|j!                         }
t        j                  j"                  j%                  |j'                         j(                  t*        j,                        }d	| d
| d|	 d|
 d
| d
| d} j.                  j1                  | d|        t        |t        j                        r5 j3                  |d      }|} j.                  j1                  | d|        |S t5        |t6        j8                        st        |t:              rt        |t<              r| j>                  v r|S |y|}t        |t:              r|j@                  }|t        j                  j"                  jB                  v r't        j                  j"                  jB                  |   }t=        t        j                  j"                  jE                  |t*        j,                              S t        |t<        tF        tH        tJ        f      rt=        |      S t        |tL              rdd
jO                   fd|D               dS tQ        dtS        |             )NzBV.graph.get_buffer(arg) and raw_arg can't be None at the same timetmp_arg_r1   z Failed to find a buffer for arg c              3     K   | ]=  }t         j                  j                  j                  |t        j
                          ? ywr  Nr0   rC   rI   r  r   unbacked_symint_fallbackrc   rF  s     rN   re   zBPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>h	  s@      
 	   ;;#<< <    AAc              3     K   | ]=  }t         j                  j                  j                  |t        j
                          ? ywr+  r,  r.  s     rN   re   zBPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>o	  s@      $
 	   ;;#<< < $r/  c              3     K   | ]=  }t         j                  j                  j                  |t        j
                          ? ywr+  r,  r.  s     rN   re   zBPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>v	  s@      
 	   ;;#<< < r/  r  zgenerate_example_value(r  z, 'z', r   r   T)r  r  r   r  c              3  T   K   | ]  }j                  |t        |             ! y wrB   rg   )rc   ar   s     rN   re   zBPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>	  s#      ZQR!@!@DG!L Zrk   r  zUnsupported type )*r[   torch_dtyper   r  
get_tensorrE   r  r   r  rm   r  r0   rC   get_allocation_sizer  rO  rH   rI   r  
get_layoutr  r   r-  rx   rv   r  r  r\   Basicr   r   r9  r   rq  r  r   r%  r   r   r   r   ri   )r   r  arg_typeraw_argr  r}  r  allocation_sizer  rX  r   r  r   s   `            rN   rh   z/PythonWrapperCodegen.generate_example_arg_valueV	  sF   h,'2#3#34"--/88:**3/%%))#.**3/* X* &d&F&F%GH00A50?L&Fse$LL? 
  D $ $
 44S9$ O  
 ) F ^^%FMMOEWW%%// ''88 0 F .dV2fXSE7RTU[T\\^_n^oopqE&&00H:S1HI'2#3#34 :: %) ;  **44zUG5LMO%++.*S/2R#s#$//)J?!#/nnagg&&CCCgg&&CCCH  ;;&"A"A <   c3t45s8OT"tyy ZVY ZZ[[\]]%(9$s)&EFFrP   c                z     t        |t              r ddj                   fd|D              z   dz   S t        |      S )Nr  r  c              3  @   K   | ]  }j                  |        y wrB   )_grid_dim_str)rc   r^   r   s     rN   re   z5PythonWrapperCodegen._grid_dim_str.<locals>.<genexpr>	  s     RT 2 24 8Rs   r  )r[   r   r   rr  )r   grid_per_dims   ` rN   r>  z"PythonWrapperCodegen._grid_dim_str	  s<    lD)diiR\RRRUXX &&rP   )rX  ro   rk  rh  ri  rl  r   c               z   | j                   j                  |D 
ci c]2  }
t        |
t              r |
t        j
                  j                  |
      4 c}
       |xs t        j
                  j                         }| j                  t        | ||||||||t        j
                  j                  |	             yc c}
w )z
        Generates kernel call code.

        triton: Defines whether the backend uses Triton for codegen. Otherwise it uses the CUDA language when gpu=True,
                and C++ when gpu=False.
        )
rV  rg  rh  ri  rk  ro   rl  rX  rm  r   N)r  r  r[   r   r0   rC   try_get_bufferr  rv   rf  rz   )r   rV  rg  rX  ro   rk  rh  ri  rl  r   r  s              rN   generate_kernel_callz)PythonWrapperCodegen.generate_kernel_call	  s    , 	## %c3' QWW++C00	
 @177>>@'#!!#'77<<%9	
s   7B8r  )rX  ro   rk  rh  ri  rl  rm  r   c          
     b    |xs t         j                  j                         }|s1|j                  dk7  s" j	                   j                  |             y  j                  |      }dj                  |      }t        j                   |j                  |	      }|s$d| d} j	                   d d| d| d       y  j                          t        j                  j                  rʉ j                  vr|t!        |      t!        |      k(  sJ d       d |
rDt         j                  j"                  r*t         j                  j"                  j%                  |
d       d fd} fd	}g }|(|J d
       d gt!        |      z  }d gt!        |      z  }nt!        |      t!        |      k(  sJ d       i }t'        t)        ||||            D ]P  \  }\  }}}}d }t+        |t,              r!dt-        |      v r|j/                  d      \  }}d }r|v r j1                  |         }|rB|}t+        |t2              st5        |t6        j8                        st+        |t:              r|||<   n|dk(  r |||||      r||   }nt+        |t2              r_t=        j>                  d|      r|}n4| j@                  vr jC                  |||      }n j@                  |   d   }|f j@                  |<   n jC                  |||      }|jE                  ||n| d|        S  jF                  j	                   ddj                  |       d| d        jF                  j	                  tI        d|d              j                  jK                         t         j                  jL                  ry t         j                  jN                  jP                  }|jS                  ||d        |5   j	                   d| d| d       d d d         j                          y # 1 sw Y   xY w)Nr  r  z	c_void_p(r   r   r  z$call_args and arg_types do not matchc                     j                   j                         D  cg c]  \  } }|k(  r|  }} }|rddj                  |       dS yc c}} w )a  After all the autotune kernel calls have been written (i.e.
                self.kernel_autotune_example_args is complete), returns a deletion call
                for all autotune example tensors that are unnecessary after kernel_name
                is called.del r  r)  r  )r  r  r   )r  kntensors_to_deleterV  r   s      rN   get_autotune_deletion_callzUPythonWrapperCodegen._generate_kernel_call_helper.<locals>.get_autotune_deletion_call
  se     '+&G&G&N&N&P%"[( %! %
 %!$)),=">!?rBB%s   Ac                j   ||   }||v ryt        t        | |            D ]  \  }\  }}||k(  st        |t              sd}r|v rj	                  |         }|dk(  rA	 |j                         }	t        |	j                        D ]  \  }
}||k(  s| d|
 d||<     y  y# t        $ r Y w xY w)zWe try to infer raw_arg (i.e. raw_args[idx]) from remaining raw_args.
                This is particularly useful for jagged cases, where the dimension is often
                being passed in as an input.Tr  z.shape[r  F)r3  r   r[   r!   r4  r7  r  r   )rh  ri  rA  reused_args
target_argr  raw_keyr:  triton_inputr  r  rL  autotune_argsr   s               rN   infer_arg_by_inputszNPythonWrapperCodegen._generate_kernel_call_helper.<locals>.infer_arg_by_inputs 
  s    
 &c]
,-6s8X7N-O !)A)Cxz'6'B #%L$M)A'+'E'E)'2( $r) 	!!(!3!3!5&/&< ,FC J=IN'RUQVVW:XJ 7'+,!, 	 / ! !!s   #0B&B&"B&&	B21B2zkeys are not None but args arez#call_args and raw_args do not matchr  r  z^(workspace|semaphore)r   z.run(z	, stream=z
<del_call>r  )*r0   rC   r  ri   rv   r  r'  r   r  r6  r  rc  r   ro   rp   rw   r   autotuning_mappingr   r3  r   r[   r   splitr4  r4  r  r\   r8  r   rematchr  rh   r   rx   r&   r   r,  r  r@  r  )r   rV  rg  rX  ro   rk  rh  ri  rl  rm  r   call_args_strstream_name
stream_ptrrH  rO  all_argsrJ  r  r  r9  rL  r:  r   rM  arg_strr  rN  s   ``                         @rN   ro  z1PythonWrapperCodegen._generate_kernel_call_helper	  s,    @177>>@&++.NN400iHI77	B		-0*??&,,

 $[M3JNN-qQ}oR
|1M %%' MM224#=#== (S^s9~-M 6M !M#(B(B ! : : > >($!B H'I)II' 6C	N2 6C	N28}I6 96 K8AIy(H=9 )P44C7G c3'C3s8O"yy~HC.2 W%=#'#A#A%g.$L  *G%h<"8U[[9%c?;/6G,]':h;(
 *'2G+6 xx 93?"%D$E$EE"&"A"A7# #'"C"CC"H"K>E{=SD55c:"==c8WUG3;se1WI<NOS)PV &&00-uTYYx%8$9;-qQ &&00 /I<X &&**;7ww"" !" 4 4 B B..y+yRVW" 	XNNk]%i}TUVW	X%%'	X 	Xs   1P%%P.c                :    | j                   j                  |       y rB   )r  r   r   ry   s     rN   rv   zPythonWrapperCodegen.writeline
  s    

$rP   c                4    |D ]  }| j                  |        y rB   r  )r   r  ry   s      rN   r  zPythonWrapperCodegen.writelines
  s     	!DNN4 	!rP   c                L    | j                   j                  t        |             y rB   )r  r   r*   )r   ctxs     rN   r  z"PythonWrapperCodegen.enter_context
  s    

+c*+rP   c                (    ddl m}  |       rdd l}t        |t              rt        |j                  j                        S t        |t        j                        rt        |      S t        |t        t        f      rAt        j                   G d d             t         t        |       fd|D                    S t        |t         j"                  j$                        rt'        |      S t        |t(        j*                  t(        j,                  t.        f      r|j1                         S  |       r+t        |j2                  j4                        rt        |      S t        |t(        j6                        r|j1                         S t        |      S )Nr   )has_triton_packagec                      e Zd ZU ded<   d Zy)1PythonWrapperCodegen.val_to_arg_str.<locals>.Shimr   refc                    | j                   S rB   )rb  r   s    rN   __repr__z:PythonWrapperCodegen.val_to_arg_str.<locals>.Shim.__repr__
  s    88OrP   N)r   r   r   r   rd  rb   rP   rN   Shimra  
  s    $rP   re  c              3  V   K   | ]   } t         j                  |             " y wrB   r  )rc   r3  re  r   s     rN   re   z6PythonWrapperCodegen.val_to_arg_str.<locals>.<genexpr>
  s$     Vq1@@qIJVs   &))torch.utils._tritonr_  ro   r[   r   rr  rK   rg  r\   r   rm   r   r  	dataclassrj  ri   rO  _ops
OpOverloadr   r   r  
MutableBoxr"   rT  languager   r|  )r   rL  type_r_  ro   re  s   `    @rN   r  z#PythonWrapperCodegen.val_to_arg_str
  s%   :a"%%5::&8OE4=)""$ $ #$ QVTUVV  5::001&q))BIIr}}oFG&&((!jFOO4I4I&J7N2,,-&&((7NrP   c                >   |j                         }|j                         }t        |j                               }t        t        j
                  j                  |            }t        |j                               }| j                  |j                         |||||      S rB   )
rO  rH   rm   r  r0   rC   r6  r  r  rE   )r   r  rX  r   r  allocation_shaper  s          rN   r  z+PythonWrapperCodegen.make_buffer_allocation
  s    ""$  "foo'( !<!<V!DEv((*+##OOvueV=M
 	
rP   c           
         ||}| j                  |      }| j                  |      }| j                  |      }	|j                  dv r| d|j                   d| d|	 d| d
}
n| d| d|	 d|j                   d| d
}
||k7  r|
d	| d|	 dz   }
|
S )
N)r  cudaxpuz = empty_strided_r  r  r   z = empty_strided(r  r  z.as_strided()rn   ri   )r   rz   rX  r   r  r  ro  r  codegen_allocation_shape_tuplecodegen_stride_tupler  s              rN   r  z$PythonWrapperCodegen.make_allocation
  s     #$"==eD)-)H)H*
&  $>>vF;;00 &)&++a12"'('  &)12"'( )!;;-yq:  "@@,':&;2>R=SSTUUC
rP   c                8    | j                  t        |             y rB   )rv   r  rZ  s     rN   make_commentz!PythonWrapperCodegen.make_comment
  s    {4()rP   c           	     `    | j                    | d| | j                   d| j                   d| 	S )Nr      )r  r  r!  )r   new_nameold_namer!  s       rN   make_tensor_aliasz&PythonWrapperCodegen.make_tensor_alias
  s6    ,,zXJt{{m2dll^STU\T]^^rP   c                (    d|j                          S )NrE  )rE   )r   r  s     rN   ra  z%PythonWrapperCodegen.make_buffer_free
  s    foo'())rP   c                8    ddj                  d |D               S )NrE  r  c              3      K   | ]  }|  y wrB   rb   )rc   rz   s     rN   re   z:PythonWrapperCodegen.make_free_by_names.<locals>.<genexpr>
  s     >>s   )r   )r   names_to_dels     rN   make_free_by_namesz'PythonWrapperCodegen.make_free_by_names
  s    dii>>>?@@rP   c           	     `    | j                    | d| | | j                   d| j                   d	S )Nr   rx   reuse)r   r  r!  )r   r{  rz  del_lines       rN   codegen_exact_buffer_reusez/PythonWrapperCodegen.codegen_exact_buffer_reuse
  s@    ../zXJxjQUQ\Q\P]]_`d`l`l_mmsttrP   c                r   |j                         |j                         k(  sJ |j                         }|j                         }d}|t        j                  j	                         vr|rd| j                  |       }|j                         |j                         k(  r4|j                         |j                         k(  r| j                  |||      S | j                  ||j                         |j                         d| j                  j                        }| j                   | d| | d| j                   dS )N;z; r   r   rx  r  )rH   rE   r0   rC   rM  ra  r  r  r  r  r  rv   r  r!  )r   r  r  r  r{  rz  r  reinterpret_views           rN   r  z&PythonWrapperCodegen.make_buffer_reuse
  s   }}#--/111<<><<>1773355*D11#678H<<>S\\^+0@CNNDT0T228XxPP88!11d6G6G6Q6Q
 ,,z-=,>xj4<<.X^__rP   c                    | j                  t        || j                   | d|j                          | j                   d| j
                   d             y )Nr   rx  z alias)rv   r4   r  rT  r  r!  )r   rz   r  s      rN   r  z0PythonWrapperCodegen.codegen_deferred_allocation   sS    <<.c$*@*@*B)CDKK=PRSWS_S_R``fg	
rP   c                   |j                         }|t        j                  j                  v s8|| j                  v s*t        |t        j                  t        j                  f      ry | j                  j                  |       t        |j                         t        j                  t        j                  f      r|j                         sy |j                         }t        |t        j                        ry t        |t        j                         ry t        |t        j"                        rt        |j$                  t        j&                        s*J dt)        |j$                         d|j$                          |j$                  j*                  }t        |t        j,                        sJ t)        |             |j*                  }t        |t        j.                        sJ t)        |             | j1                  |       | j3                  t5        | |||             y t        |t        j6                        r| j3                  t9        | |             y | j3                  t;        | |             y )Nzunexpected r   )rE   r0   rC   r`  r2  r[   r   DonatedBufferSubgraphBufferr   get_defining_opExternKernelAllocMultiOutputshould_allocater  MutationLayoutSHOULDREMOVEr  r  r  r"   ri   r  
StorageBoxr  codegen_allocationrv   r  r  r  r  )r   r  rz   r  boxinput_buffers         rN   r  z'PythonWrapperCodegen.codegen_allocation  s     AGG+++t~~%&2#3#3R5F5F"GH4 &&(%%r~~6 **,'')fb;;<fbmm,fb001fkk2+=+=> d6;;/06;;-@> ++""Cc2==1<49<188LlBII6AS	A6##L1NN?4vvNOfb112NN1$?@|D&12rP   c                   |j                         }t        |t        j                  t        j                  f      r| j                  t        | |             y t        |j                         t        j                        r| j                  t        | |             y | j                  |      sy | j                  j                  |       | j                  t        | |             y rB   )rE   r[   r   InputBufferrz  rv   r]  r  r  r  	can_reuser3  r   r   )r   r  rz   s      rN   codegen_freez!PythonWrapperCodegen.codegen_free2  s      fr~~r/A/ABCNN8D&12f,,.0C0CD NN-dF;<~~f%

t*489rP   c                   |j                         }|t        j                  j                  v xs |t        j                  j                  v xr6 t        t        j                  j                  |   t        j                         xsh |t        j                  j                  v xsJ |t        j                  j                  v xs, |t        j                  j                  v xs || j                  v  S rB   )rE   r0   rC   r`  ru  r[   graph_inputs_originalr   r  r  r  never_reuse_buffersr3  )r   r  output_bufferrz   s       rN   r  zPythonWrapperCodegen.can_reuseF  s    $$&AGG+++ 
",,, "GG11$79I9I 
" qww(((
" qww222
" qww222
" tzz!
 	
rP   c                    |j                         | j                  v xr. | j                  |j                            |j                         k(  S rB   )rE   r4  )r   r  reused_buffers      rN   	did_reusezPythonWrapperCodegen.did_reuseV  sC     OO, KFOO-.-2H2H2JJ	
rP   c                t   t        ||      sJ | j                  |       | j                  j                  |j	                                | j
                  j                  |j	                                |j	                         | j                  |j	                         <   | j                  t        | ||             y rB   )	rX   r  r3  r   rE   r2  r4  rv   r  )r   r  r  s      rN   codegen_inplace_reusez*PythonWrapperCodegen.codegen_inplace_reuse^  s    $\=AAA-

|,,./=11340<0E0E0GM**,-y|]CDrP   c                    t        |      }|| j                  v r|S | j                  j                  |       | j                  |z   S rB   )r   r'  r   r  )r   r   rz   s      rN   codegen_unbacked_symbol_declz1PythonWrapperCodegen.codegen_unbacked_symbol_declf  sC    6{4---K &&**40<<$&&rP   c                &   t        t        j                  j                  j                  |      }|sy |j                         D ]I  \  }dfdfd}| j                  | j                  |       d |        | j                          K y )Nc                   |dk(  r| S t        |      dk\  r_t        |d   t              rLt        |d   t        j                        r/ |  d|d   j
                   d|d   j                   d|dd        S t        |d   t              r |  d|d   j
                   d|dd        S t        |d   t        j                        rYt        j                  j                  r  d	|d   j                   d
|  d|dd        S  |  d|d   j                   d|dd        S t        |d   t              r |  d|d   j                   d|dd        S t        d|       )Nrb   r   r   r1   r   r  r   r  z	std::get<z>(r  r  z.__floordiv__(r  )r   r[   r   pytreeSequenceKeyrz   rA  r0   rC   r,  r   r  r  )rg  r  gos     rN   r  zIPythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputs.<locals>.go  s   b=K LA%"71:}="71:v/A/AB&'!*//!2!GAJNN3C1Ewqr{   
M:a
'8;WQR[II
F,>,>? 77.. Ywqz~~&6ba@'!"+N  4&'!*..)9 ;WQR[I
  
K8 nWQZ5G5G4HJGTUTVKXX(+@	)JKKrP   c                    t         j                  j                  rt              dk(  rZd   }  d   j	                         t        | t        j                        r!t        | j                        dk7  r	dd        S       S t        d   t        j                        sJ  d   j                     j	                         dd        S        S )Nr1   r   )r0   rC   r,  r   rE   r[   r   r  r  r  r  rA  )r  r  r  r  r  s    rN   go_outerzOPythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputs.<locals>.go_outer  s    77&&
 7|q(%aj  "#AJ//1)#r~~>3s{{CSWXCX $ABK   ")	    *'!*f6H6HIII!''!*.."9"B"B"DgabkRRk733rP   r   )rg  r   r  zpytree.KeyPath)	r   r0   rC   rI   	shape_envr   rv   r  r  )r   r  r  unbacked_bindingsrL  r  r  r  s    ``   @@rN   (codegen_unbacked_symbol_defs_for_outputsz=PythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputso  s     6GG&&(9
 ! ,113 <	JAw
L<4. NN44Q78HJ<}Uu<	rP   c                     fd} fd}	  j                  j                          j                   j                   dj                           |        t
        j                  }t        j                  j                        5  j                  j                  |       d d d         |         j                          y # 1 sw Y   !xY w#  j                          w xY w)Nc                    t        j                  j                        t              k(  sJ t        j                  j                        D ]3  \  } }j	                  j
                   |  d| j                          5 y r  )r   rC   ru  r   rv   r  r  )inner_inputouter_inputouter_inputsr   subgraphs     rN   _codegen_subgraph_prefixzSPythonWrapperCodegen.codegen_subgraph_by_inlining.<locals>._codegen_subgraph_prefix  sy    x~~223s<7HHHH,/++\- ([ ||n[M[M$++OrP   c                    t        j                  j                        t              k(  sJ t        j                  j                        D ]5  \  } }j	                  | d| j                          j                          7 y r  )r   rC   rx  r   rv   rT  r  )inner_outputouter_outputouter_outputsr   r  s     rN   _codegen_subgraph_suffixzSPythonWrapperCodegen.codegen_subgraph_by_inlining.<locals>._codegen_subgraph_suffix  s{    x~~334M8JJJJ.1,,m/ *l #nC(F(F(H'I$++WrP    subgraph: )parent_graph)	r  rC   rv   r!  rz   r0   set_graph_handlercodegen_subgraphr   )r   r  r  r  r  r  r  s   ````   rN   codegen_subgraph_by_inliningz1PythonWrapperCodegen.codegen_subgraph_by_inlining  s    			'%%hnn5NNdll^;x}}oFG$&77L$$X^^4 //!- 0  %&$$&  $$&s$   A;C C,C CC C*c           	        |j                   }|j                  }t        |j                               |j                  D cg c]  }|j
                   c}z   }dj                  |      t        |      dk(  rdndz   }|D cg c]  }|j                          }	}dj                  |	      t        |      dk(  rdndz   }
| j                  d| d| d       |j                         D cg c]
  \  }}|s	| }}}|r#| j                  ddj                  |              | j                  d	|
 d
| d| d       | j                  d| d       yc c}w c c}w c c}}w )z'Generate code to call a graph partitionr  r1   r  r  	partition	_args = [r  rE  r  z) = self.partitions[z](partition_args)zdel partition_argsN)input_deallocationoutput_nodesr   r  symbol_inputsrz   r   r   rE   rv   r   )r   partition_idrH  r  r  symbol_inputr  r  rK   output_namesr  rz   
deallocater  s                 rN   codegen_partition_callz+PythonWrapperCodegen.codegen_partition_call  sj    2DD+88-22452F2T2T9
".L9
 
 ;'#k2Ba2G3RP4@ADAA))L)C4E4JSPRS 	<.	&CD *<)A)A)C
%T:zD
 
 NNT$))L"9!:;< 	y,\N+l^SYZ	
 	|nE:;-9
 B
s   E?E'
E2Ec                P    t        |      D cg c]  }d| 	 c}| _        y c c}w )N
partition_)rN  r  )r   num_partitionsrA  s      rN   set_all_partition_namesz,PythonWrapperCodegen.set_all_partition_names  s$    BGBW#X3j$6#X #Xs   #c           	     p   dj                  |      t        |      dk(  rdndz   }dj                  |      t        |      dk(  rdndz   }| j                  |j                  j                   d| d       | j                  d| d|j                  j                   d|j                  j                   d	       y )
Nr  r1   r  r  r  r  r  z) = r  )r   r   rv   rC   rz   )r   r  r  outer_flattened_outputsouter_output_namesouter_input_namess         rN   ,codegen_subgraph_call_with_flattened_outputszAPythonWrapperCodegen.codegen_subgraph_call_with_flattened_outputs  s     "YY'>?./14C"
 !IIl3|$)Cr
 	(..--.i8I7J!LM 	"#4(;(;'<Ahnn>Q>Q=RRXY	
rP   c                v   dj                  |      t        |      dk(  rdndz   }| j                  |j                  j                   d| d       t
        j                  j                  j                          | j                  | d|j                  j                   d|j                  j                   d	       y )
Nr  r1   r  r  r  r  r   r  r  )r   r   rv   rC   rz   r0   	schedulerfree_buffers)r   r  r  outer_buffer_namer  s        rN   codegen_subgraph_callz*PythonWrapperCodegen.codegen_subgraph_call"  s     IIl3|$)Cr
 	(..--.i8I7J!LM 	
&&( 	 !X^^%8%8$98>>;N;N:OvV	
rP   c                   | j                  |j                         | j                  d       | j                  | j                   d|j                          t
        j                  }|j                  |j                  _        |j                  j                  | j                  vrt        j                  |j                        5  t        j                  dd      5  |j                  j                         \  }}d d d        d d d        | j                  j                  |j                  j                         | j                  j                         y y # 1 sw Y   ]xY w# 1 sw Y   axY w)Nr  r  r  F)r  rC   rv   r!  rz   r0   r,  r;  r  r   patchr  r   r  r   )r   r  r  subgraph_codern  s        rN   codegen_subgraph_commonz,PythonWrapperCodegen.codegen_subgraph_common3  s   !!(..1r$,,{8==/BCww%1%=%=">>d&F&FF $$X^^4 @\\"3U; @'/~~'='='?$M1@@
 ,,001D1DE,,]-@-@A G
@ @@ @s$   E E8E E	E  E)c                L    | j                  |       | j                  |||       y rB   )r  r  )r   r  r  r  s       rN   'codegen_subgraph_with_flattened_outputsz<PythonWrapperCodegen.codegen_subgraph_with_flattened_outputsF  s(     	$$X.99l$;	
rP   c                L    | j                  |       | j                  |||       y rB   )r  r  )r   r  r  r  s       rN   r  z%PythonWrapperCodegen.codegen_subgraphN  s%     	$$X.""8\;LMrP   c                   |j                         }| j                  | dt        |j                                |j                  D cg c]  }|j                          }}t        j                  j                  rOt        t        |j                              D cg c]
  }| d| d }}| j                  |j                  ||       y | j                  |j                  ||       y c c}w c c}w )N = [None] * r  r  )rE   rv   r   r  r  rT  r0   rC   r-  rN  r  r  r  )r   invoke_subgraphrz   r}  r  r  r  s          rN   codegen_invoke_subgraphz,PythonWrapperCodegen.codegen_invoke_subgraphT  s    '')$|C0G0G,H+IJK;J;Q;QRC--/RR77(-c/2I2I.J(K#$4&!AM  --((, !!/":":L$O Ss   C(C-c                   |j                         }|j                  D cg c]  }|j                          }}|j                  j                         }t	        |j                  t
        j                        s| d}| j                  | dt        |j                                | j                  d| d       | j                  t        | |j                  j                               t        j                  j                  rOt        t        |j                              D cg c]
  }| d| d }}| j!                  |j                  ||       n| j#                  |j                  ||       | j                  t%        |              | j                  d       | j                  t        | |j&                  j                               t        j                  j                  rOt        t        |j                              D cg c]
  }| d| d }}| j!                  |j&                  ||       n| j#                  |j&                  ||       | j                  t%        |              y c c}w c c}w c c}w )Nr  r  r   r+  r  r  zelse:)rE   operandsrT  	predicater[   r   ShapeAsConstantBufferrv   r   r  r  true_subgraphrC   r0   r-  rN  r  r  r  false_subgraph)r   conditionalrz   r}  r  r  r  r  s           rN   codegen_conditionalz(PythonWrapperCodegen.codegen_conditionald  s   ##%;F;O;OPC--/PP));;=	+//1I1IJ$+W-I$|C0C0C,D+EFGYKq)*({/H/H/N/NOP775:3{?R?R;S5TUvQqc^UMU--))< !!+";";\4P'-.w({/I/I/O/OPQ775:3{?R?R;S5TUvQqc^UMU--**L- !!+"<"<lDQ'-.9 Q V Vs   I)"I.I3c                   |j                         }|j                  D cg c]  }|j                          }}|j                  D cg c]  }|j                          }}| j	                  | dt        |              t        |      D ]  \  }}| j	                  | d| d|          g t        t        |            D cg c]
  }| d| d c}|}| dg}	t        |      }
|
d t        |       }| j	                  d       | j	                  t        | |j                  j                               t        j                  j                  r| j                  |j                  ||	       n| j                  |j                  ||	       | j	                  d|	d    d	       | j	                  t!        |              | j	                  t        | |j"                  j                               t        j                  j                  r| j                  |j"                  |
|       n| j                  |j"                  |
|       | j	                  t!        |              y c c}w c c}w c c}w )
Nr  r  z] = r  _cond_resultzwhile True:zif not r   z: break)rE   carried_inputsrT  additional_inputsrv   r   r3  rN  r   r  cond_subgraphrC   r0   r-  r  r  r  body_subgraph)r   
while_looprz   r}  outer_carried_inputsouter_additional_inputsr  inpcond_outer_inputscond_outer_outputsbody_outer_inputsbody_outer_outputss               rN   codegen_while_loopz'PythonWrapperCodegen.codegen_while_loop  sd   ""$/9/H/H 
(+C!!# 
  
 0:/K/K#
(+C!!##
 #
 	$|C0D,E+FGH 45 	3FAsNNdV1QCtC512	3
&+C0D,E&FGas!nG
$
 "&l34 
 //J5I1JK}%(z/G/G/M/MNO77--((*;=O 88((*;=O 	(+,G4	
 	'-.(z/G/G/M/MNO77--((*;=O 88((*;=O 	'-.c 
#
 Hs   II I%c                    	 t        | dd       ry t        | t              r| S t        j                  j
                  j                  |       }||S t        |      S # t        $ r Y y w xY w)Nrf  )r  r[   r   r0   rC   
_shape_env_maybe_evaluate_staticr>  )r   r   s     rN   statically_known_int_or_nonez1PythonWrapperCodegen.statically_known_int_or_none  sf    	q.$/ !S!''$$;;A>C{
s8O 		s!   A A ,A 
A 	A&%A&c                l    g }| D ],  }t         j                  |      }| y |j                  |       . |S rB   )r  r   r   )lstr  r   nums       rN   %statically_known_list_of_ints_or_nonez:PythonWrapperCodegen.statically_known_list_of_ints_or_none  sA     	A&CCAFC{MM#		
 rP   c                0    t         j                  |       d uS rB   )r  r  )r  s    rN    is_statically_known_list_of_intsz5PythonWrapperCodegen.is_statically_known_list_of_ints  s     !FFsKSWW	
rP   c                H    t         j                  | j                               S rB   )r  r  r  r  s    rN   r  z4PythonWrapperCodegen.static_shape_for_buffer_or_none  s    #IIOO
 	
rP   c                0    t         j                  |       d uS rB   )r  r  r  s    rN   !can_prove_buffer_has_static_shapez6PythonWrapperCodegen.can_prove_buffer_has_static_shape  s    #CCFKSWWWrP   rB   )rE  r   rF  r   rG  Optional[PythonWrapperCodegen]rH  $Optional[ir.GraphPartitionSignature]r  )rz   r   rB  r   r   r   )r  r   )ri  TritonMetaParamsr   r   r   rj  r   z>dict[str, Union[ir.TensorBox, ir.TorchBindObject, sympy.Expr]]r   zlist[IRNode])r  rj  r  )r&  r   rm  r   r   r   r  )r&  r   r   r   )r  rj  r   r   r  r(   r   r   )rK   zir.FallbackKernelr   r   )rK   r>  )rK   rH  r   r   )r   r   r  r   r  r   rC  rj  rX  r   r   r   )F)r  r   r  r   r	  zCallable[[], Sequence[str]]r
  z<Union[torch._ops.OpOverload, torch._ops.HigherOrderOperator]ri  r  r  zSequence[ir.Buffer]r   r   )r  Callable[..., None]r   zIterator[Callable[..., None]])r  r   )rz   r   r   rk  ra  rl  )rh  zsympy.Symbol)r   r   rJ   r   r   r   )r   r   r   r   )r  r   rz   r   r  r   r   r   )r  zSequence[Expr]r   r   )rv   r  r   r   )r  r   )rK   zir.MultiOutput)NTN)
rV  r   ru  r   rv  r   rw  r   rx  r   )rV  r   ru  r   rv  r   )r  r   )r   z"list[list[Union[int, sympy.Expr]]])rV  r   r  r   )r  r   rC   r>   r   r   )r  r6   )rV  r   )r  r  )r  )r  r^  )r  rj  )r{  r   rz  r   r  r   )r  r  r  r  r  r   )rz   r   r  zir.ReinterpretViewr   r   r  r  )r  r  r  r  )r  r   r  r   r  z,Optional[dict[sympy.Symbol, pytree.KeyPath]]r   r   )r  r   rH  zir.GraphPartitionSignature)r  r   )r   r   r   r  supports_cachingr   r  rI  r)  r0  r,  r\  r.  r%   rc  re  rg  rl  rp  rr  rv  rn  r~  r  r  r  r  r  r-  r  r6  r  r  r   r  r  r  r  r  r  r  r  r  r  r  rB  r  rS  r  r  r  r  r   r  r  r  r  r   contextmanagerr  r  r  r(  rI  rT  r#  rb  r  rs  r*  rx  rz  r|  r  rn   r  r  r  r  r  r  r  r+  r  r  r{  r  r  r	  r  r  r  r  r  r!  r"  r&  r%  r'  r'  rh   r>  rB  ro  rv   r  r  r  r  r  rv  r|  ra  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r
  r   r   s   @rN   r  r  F  s    ]#~ 
 FJ	&&$& 7& C	& &'<;Az
   " + +	! 
 

$	G$
%S$	(
.8)6(-.10J
/+7,5
5$:,:8 8 
8;; ;  	;
 ; ; 
;,U
&<
	V	V  	V .		V
 R	V  	V %	V 
	V0 ! !,O
b&SP:

B%(K(K (K -	(KT'?R5 @D W CG +.&'6
(  ' 
:>S
4*[Yz, #'(,

 
  	

 
 &
& FJ'*6C  #'(,!! !  	!
 ! &!.2I; 2I;V"A"A+8A	A
%N:.>85W

4
4SGj' !-
-
h !f(f(P !, F
 DH@*_*Au` 
(3T:(
 
E'NN N H	N
 
N`+'Z<< 9<BY
$
"B&
NP /B3/j     
 

 
 

 X XrP   r  c                       e Zd ZdZ	 d	 	 	 	 	 d fdZddZddZd Zd Zd Z	ddZ
dd	Zdd
ZddZ	 	 ddZddZddZd fdZedd       Zedd       Z xZS )rD  a  
    A wrapper codegen that generates code for a subgraph. For most of the
    methods, we rely on the implementation in the PythonWrapperCodegen. But we
    override a few functions to produce cleaner code (like avoiding writing
    imports twice in the output code)
    c                L    || _         || _        || _        t        |           y rB   )rF  rG  rH  r   r   )r   rF  rG  rH  r   s       rN   r   z%SubgraphPythonWrapperCodegen.__init__  s(     +,$8!rP   c                &    | j                   | _        y rB   )rF  r(  r   s    rN   r)  z1SubgraphPythonWrapperCodegen.set_launcher_fn_name  s     !% 2 2rP   c                     y rB   rb   r   s    rN   r,  z)SubgraphPythonWrapperCodegen.write_header   r]  rP   c                     y rB   rb   r  s     rN   r+  z2SubgraphPythonWrapperCodegen.add_benchmark_harness  r]  rP   c                     y rB   rb   r  s     rN   r  z6SubgraphPythonWrapperCodegen.benchmark_compiled_module  r]  rP   c                     y rB   rb   r   s    rN   r  z5SubgraphPythonWrapperCodegen.write_async_compile_wait	  r]  rP   c                6    | j                   j                         S rB   )rG  r  r   s    rN   r  z/SubgraphPythonWrapperCodegen.next_kernel_suffix  s    ""5577rP   c                     y rB   rb   r  s     rN   r  z2SubgraphPythonWrapperCodegen.generate_after_suffix  rs  rP   c                \    | j                   j                  d| j                   d       d}|S )Nz
            def z(args):
            r1   )r  r   r(  r  s     rN   r  z>SubgraphPythonWrapperCodegen.write_launcher_fn_call_get_indent  s<    &&' (	

 rP   c                     yr   rb   r   s    rN   r  z4SubgraphPythonWrapperCodegen.get_wrapper_call_indent  s    rP   c                    | j                   x}r3|j                  |j                  D ci c]  }t        |      | c}z  }|S t        j
                  j                  }|S c c}w rB   )rH  input_nodesr  r   r0   rC   ru  )r   r  rL  r  s       rN   rv  z-SubgraphPythonWrapperCodegen.get_graph_inputs  sh     11191**#,#:#:.A	. F
  WW))F.s   Ac                    | j                   x}rJt        |j                  j                               |j                  D cg c]  }|j
                   c}z   }|S t        j                  j                  }|S c c}w rB   )	rH  r   r"  r  r  rz   r0   rC   r{  )r   r  r  namess       rN   r  z2SubgraphPythonWrapperCodegen.get_graph_input_names*  sr    11191..33566?6M6M:&2!!: E
  GG--E:s   A5c                r    | j                   x}r|j                  }|S t        j                  j                  }|S rB   )rH  r  r0   rC   rx  )r   r  r  s      rN   rn  z.SubgraphPythonWrapperCodegen.get_graph_outputs3  s;    11191,,G  gg++GrP   c                ~    |j                         }| j                  x}r||j                  v ry t        |   |       y rB   )rE   rH  r"  r   r  )r   r  rz   r  r   s       rN   r  z/SubgraphPythonWrapperCodegen.codegen_allocation:  s?     222I2	@U@U8U "6*rP   c                8    | j                   j                          y rB   )rG  rc  r   s    rN   rc  z5SubgraphPythonWrapperCodegen.write_triton_header_onceD  s     	446rP   c                8    | j                   j                          y rB   )rG  rg  r   s    rN   rg  z=SubgraphPythonWrapperCodegen.write_get_raw_stream_header_onceM  s     	<<>rP   rB   )rF  r   rG  r  rH  r  r  r  r  r  r  r  r  r  )r   r   r   r  r   r)  r,  r+  r  r  r  r  r  r  rv  r  rn  r  r%   rc  rg  r   r   s   @rN   rD  rD    s     FJ	 - C	3
8		G	+ 7 7 ? ?rP   rD  )rK   r  r   r   )rT   r  rU   r  )NN)rz   r   r   zlist[triton.Config]r   zlist[TritonGrid]rj   r  r   r   r   ztuple[str, str]r  )
__future__r   r   r   r  r   r  r  r6  r  r  rR  r8  	itertoolsr   r   typingr   r   r   r	   r
   r\   r   rO  
torch._opstorch.utils._pytreeutils_pytreer  r   r4  torch._dynamo.utilsr   r   #torch._inductor.codegen.debug_utilsr   $torch._inductor.codegen.multi_kernelr   %torch._inductor.runtime.runtime_utilsr   %torch.fx.experimental.symbolic_shapesr   r   r   r   r   torch.fx.noder   torch.utils._ordered_setr    torch.utils._sympy.singleton_intr   torch.utils._sympy.symbolr   r   r  r   r   r   	codecacher    r!   r"   runtimer#   runtime.hintsr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   virtualizedr0   r  r2   r3   r4   r5   r6   r7   	cpp_utilsr8   triton_utilsr9   r:   r;   collections.abcr<   r=   ro   rC   r>   wrapper_fxirr?   	getLoggerr   logdoprintrr  rm   rX  r   r   r   r  r  r  rO   rX   r  r   r  r   r   r   rh  r   r   r@   r  r  r  r%  r7  r=  rG  r]  rf  rt  r  r  r   r  r  r  r  r  r  r  r  r1  Liner  rD  rb   rP   rN   <module>rE     s   "    
      	  " @ @     $ $ & 6 C A ;  . / 9 : ( ( ' ( ' ,       P P 2%) g! u{{C56299l*+
]OT12 @ S> 	%UZZ
 #
%&2B1CU3PS8_1T(UU
 /3*.d&
d& d& d& ,	d&
 (d& d&NJ&Z   * **X X
 2 2 2 ++ + + 1{ 1 1 "@K "@ "@J?; ? 
7K 
7 
7 5+ 5 5@ 	({ 	( 	( /[ / /> 5; 5 5* ; ; ;2 ,% , ,@ 6, 6 62 /( / /& )" ) ).(! (
 ![ ! !: )8^ )8 )8X 4 4 4 #0k #0 #0L 	5+ 	5 	5 
,-\&X7 \&X~Lp?#7 p?rP   