
    rh6                      U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZmZmZmZmZ d dlmZ d dl	mZ d dl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+ d dl,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2 d dlm3Z3 d dl4Z4d dl5Z5d d	l6m7Z7 d d
l8m9Z9 d dl:m;Z; dgZ<e)r^d dlm=Z=m>Z>m?Z? d dl5m@Z@mAZAmBZB d dlCmDZD d dlEmFZF d dlGmHZH d dlImJZJ ddlKmLZL ddlMmNZN ddlOmPZP ddlQmRZRmSZSmTZTmUZUmVZVmWZWmXZX ddlYmZZZ ddl[m\Z\m]Z] g dZ^ e*d      Z_ej                  dd       Zad dlbmcZc d dldmeZe d dlfmgZg d d lhmiZi d d!ljmkZk d d"llmmZm d d#lnmoZompZpmqZqmrZrmsZs d d$ltmuZumvZv d d%lwmxZxmyZy dd&lzm{Z{ dd'l|m}Z~ ej                  d(k(  Z ej                   e      Z e*d)      Zee4j
                  e4j
                  f   Ze'e+e5j                  ee5j                  f      Zd*d+d,Zd-Zd-Zd-Zd.Zd/Zeedz
  z  d k(  red0k\  sJ d1       dd2Zdd3Z G d4 d5e4j$                        Z ej(                  d67       G d8 d9             Zddd:Z	 d	 	 	 	 	 	 	 dd;Zej                  dd<       Zdd=Zdd>Zdd?Zdd@Z	 	 	 	 	 	 ddAZ}ddBZ	 	 	 	 ddCZddDZ	 	 	 	 ddEZddFZdG f	 	 	 	 	 ddHZ	 	 	 	 	 	 	 	 ddJZdddKZ	 	 d	 	 	 	 	 	 	 	 	 d dLZ	 	 	 	 	 d!	 	 	 	 	 	 	 	 	 	 	 	 	 d"dMZd#dNZd$dOZd%dPZd&dQZd'dRZ e/dS      Z e*dTd6U      Z G dV dWe(e$eef         Zd(dXZ	 	 	 	 d)dYZ	 	 	 	 	 	 d*dZZ	 	 	 	 	 	 d+d[Z	 d,	 	 	 	 	 d-d\Z	 	 	 	 	 	 d.d]Zd/d^Zd0d_Zd1d`Zd2daZd3dbZd4dcZd5ddZd6deZd7dfZ	 	 	 	 d8dgZd9dhZd:diZd dlZd;djZg ZdIedk<   d<dlZŐd;dmZej                  	 	 	 d=	 	 	 	 	 	 	 d>dn       ZeZeZeZːd?doZ	 	 	 	 	 	 d@dpZ ej                  d0      dAdq       Z G dr dse&      Zej(                   G dt du             Z G dv dw      Z G dx dyeҫ      Zej                  dBdz       Z G d{ d|      Z G d} d~eի      Zej                  dCdDd       Zej                  dEd       ZؐdEdZ	 d,	 	 	 	 	 	 	 dFdZ	 	 	 	 	 	 dGdZېdHdZܐdHdZddd	 	 	 	 	 	 	 dIdZސdJdZߐdKdZdLdZdZdZg dZe+ee4j
                  f   Zded<   dMdZej                  dNd       Zej                  dOd       Zej                  dPd       ZdQdZdKdZdKdZdQdZdQdZ	 	 	 	 	 	 	 	 dRdZ	 	 	 	 dS	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dTdZddZ G d d      Z	 	 	 	 	 	 	 	 dUdZ	 	 	 	 	 	 	 	 dUdZdVdZdWdZdXdZ	 	 	 	 	 	 	 	 dXdZ	 	 	 	 	 	 	 	 dYdZej                  	 	 	 	 	 	 dZd       Z	 d,	 	 	 	 	 d[dZd\dZd]dZd^dZd^dZd_dZ d`dZej                  dad       ZdEdZej                  dEd       Zej                  dAd       Zej                  dEd       ZdEdZdbdZdcdZ	ddZ
ddZdddZd7dZ G d dej                        Z	 	 	 	 	 	 	 	 	 	 dedZdfdZ	 	 	 	 dfdZ	 d,	 	 	 	 	 dgdZdhdZdidZdidZ	 	 	 	 	 	 djdÄZ	 	 	 	 	 	 	 	 dkdĄZdń f	 	 	 	 	 	 	 	 	 	 	 dldƄZdǄ f	 	 	 	 	 	 	 	 	 	 	 dldȄZdmdɄZdndʄZej(                   G d˄ d̫             Zej                  dod̈́       Zdpd΄ZdqdτZ drdЄZ!dsdфZ"	 	 	 	 	 	 	 	 	 	 	 	 	 	 dtd҄Z#dudӄZ$dvdԄZ%dwdՄZ&dxdքZ'	 	 	 	 	 	 	 	 dydׄZ(dzd؄Z)	 	 	 	 	 	 	 	 d{dلZ*d|dڄZ+	 d,	 	 	 	 	 	 	 d}dۄZ,	 	 	 	 	 	 d~d܄Z-dd݄Z.	 	 	 	 	 	 ddބZ/dd߄Z0dpdZ1ddddddddZ2e2jg                         D  ci c]  \  } }|| 
 c}} Z4 ejj                  d      Z6ddZ7ddZ8ddZ9ddZ:ej                  dd       Z;ej(                   G d d             Z<i Z=ded<   	 	 	 	 	 	 	 	 ddZ> e9       Z?ded<   ddZ@ddZAddZB e*d      ZC e*d      ZD G d deeCeDf         ZE e.d6      d,d6d7dd       ZFddZG	 d	 	 	 	 	 	 	 ddZH G d  dej                        ZIej                  dd       ZJddZKddZLddZMddZNddZOdZPdd	ZQdd
ZRyc c}} w (      )annotationsN)
CollectionIteratorMappingMutableMapping
MutableSet)datetime)StringIO)AnyCallablecastGenericLiteral
NamedTupleOptionalProtocolTYPE_CHECKINGTypeVarUnion)Concatenatedataclass_transform	ParamSpecSelf	TypeAlias	TypeGuard)mock)DeviceProperties)
OrderedSet)tree_map_only!activation_quantization_aten_pass)IterableSequence
ValuesView)SymBoolSymFloatSymInt)ELEMENTWISE_TYPE_PROMOTION_KIND)GraphModule)ShapeEnv)Node   )WorkspaceArgPythonWrapperCodegenGraphLowering)BufferExternKernelExternKernelOutIRNodeLayout	OperationReinterpretViewCompiledFxGraph)BaseSchedulerNodeSchedulerBuffer)cudampsxpuTc                     t         D  cg c]#  } t        t        |       j                         s"| % }} t	        |      dk  sJ t	        |      dk(  rd}|S |j                         }|S c c} w )Nr+   r   r<   )	GPU_TYPESgetattrtorchis_availablelenpop)x
avail_gpusgpu_types      h/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/torch/_inductor/utils.pyget_gpu_typerK   ^   sg    &K'%*;*H*H*J!KJKz?aZA-vHO 4>>>3CHO Ls
   #A'A')get_interface_for_device)detect_fake_mode)
DeviceType)	EventList)GraphTransformObserver)	ShapeProp)CeilDivCleanDivFloorDivIdentityModularIndexing)make_symbolSymT)bound_sympyValueRanges)config)ceildivwin32_Tz.cubinz.spv)r<   r>         @      zmust be power of 2c                *    | t         z   dz
  t          z  S )z/Round up to the nearest multiple of ALIGN_BYTESr+   )ALIGN_BYTES)nbytess    rJ   _alignrf      s    [ 1$44    c                   t        | t        j                  t        j                  f      r#t	        t        t        | j                              S t        | t              xs! t        j                  | t              t        k(  S )z:v can be statically proven to be a multiple of ALIGN_BYTES)
isinstancesympyAddMaxallmap_is_alignedargsaligngcdrd   )vs    rJ   ro   ro      sQ    !eii+,3{AFF+,,aK599Q#<#KKrg   c                  *    e Zd ZdZdZdZedd       Zy)rq   z<Symbolically round up to the nearest multiple of ALIGN_BYTESr+   Tc                    t        |t        t        j                  f      rt	        t        |            S t        |      r|S y N)ri   intrj   Integerrf   ro   )clsvalues     rJ   evalz
align.eval   s6    ec5==12#e*%%uL rg   N)r{   
sympy.ExprreturnzOptional[sympy.Expr])__name__
__module____qualname____doc__nargs
is_integerclassmethodr|    rg   rJ   rq   rq      s!    FEJ rg   rq   Tfrozenc                  :    e Zd ZU dZded<   ded<   ded<   ded<   y	)
GraphPartitionMapzP
    Mapping from the partition info (e.g., input/output) to the graph info
    rx   idzlist[Optional[int]]input_index_mappingoutput_index_mapping	list[str]constant_namesN)r   r   r   r   __annotations__r   rg   rJ   r   r      s$    
 	G -,-- rg   r   c           
     d    |         t         j                  j                          t        j                  t	        d      t         j
                  d      }t         j                  j                  d      }t         j                  j                  d      }|j                          t        d      D ]  }|j                           |          |j                          t         j                  j                          |j                  |      dz  }t        dt	        ||z              }t        dt	        ||z              }	t        |      D ]	  } |          t        |	      D cg c]"  }t         j                  j                  d      $ }}t        |	      D cg c]"  }t         j                  j                  d      $ }}t         j                  j                  t         j                  j                  j                  g      5 }
t         j                  j                          t        |	      D ]q  }|j                          ||   j                          t         j                  j                   j                  d	      5   |         d
d
d
       ||   j                          s t         j                  j                          t        j"                  t%        ||      D cg c]  \  }}|j                  |       c}}      }d
d
d
       t        j&                        j)                         }t*        j-                  d       t*        j-                  
j/                         j1                  dd             t3        |
j5                         D cg c]/  }|j6                  t8        j                  k(  rd|j:                  v r|1 c}      }|r"|t=        j&                  d |D              dz  z  }t*        j-                  d|       |S c c}w c c}w # 1 sw Y   xY wc c}}w # 1 sw Y   !xY wc c}w )R  
    Returns benchmark results by examining torch profiler events.
    This could be more accurate as it doesn't count CPU side overhead.
    However, this also requires manually excluding irrelevant event, e.g.
    vectorized_elementwise_kernel which is used to fill L2 cache,
    various CUDA events, etc, so could also be fragile.
        Ar<   dtypedeviceTenable_timing   r+   
activitiesRunCudaModuleN
raw eventsself_device_time_totalsort_by	row_limitfused_abs_max_0c              3  4   K   | ]  }|j                     y wrw   device_time_total.0events     rJ   	<genexpr>zfp8_bench.<locals>.<genexpr>   s     QE33Q        @@profiling results: %s ms)rC   r<   synchronizeemptyrx   float16Eventrecordrangezero_elapsed_timemaxprofilerprofileProfilerActivityCUDAnvtxtensorzipmeanitemlogdebugkey_averagestablerO   eventsdevice_typerN   name
statistics)fnwarmuprepcachestart_event	end_event_estimate_msn_warmupn_repeatpisetimesresr   filtered_eventss                     rJ   	fp8_benchr      sJ    D	JJKKJu}}VLE **"""6K

  t 4I1X 
 	JJ**959K 1c&;./0H1c#+,-H 8_ 
 BGxQA5::##$#7QKQ?DXO!!!!5OIO			NN++00
 
  
 
 


 x 	"AKKMN!!#&&7 aL!	" 	

 +.{I+FG41aQ^^AG

" **U

 
 
"CIIlIIann$$-EQS$TU 	
  JOO38IUZZ8W 	
O OOQQQ	

 II(#.JI RO 
 H
 
*	
sD   "'P'PA9P =PAP P9P 4P-PP  P*c                L    |         t         j                  j                          t        j                  t	        d      t         j                  d      }t         j                  j                  d      }t         j                  j                  d      }|j                          t        d      D ]  }|j                           |          |j                          t         j                  j                          |j                  |      dz  }t        dt	        ||z              }t        dt	        ||z              }	t        |      D ]	  } |          t         j                  j                          t         j                  j                  t         j                  j                  j                  g      5 }
t        |	      D ]  }|j                           |          t         j                  j                          d	d	d	       t        j!                  d
       t        j!                  
j#                         j%                  dd             t'        |
j)                         D cg c]0  }|j*                  t,        j                  k(  r|j.                  dk7  r|2 c}      }t1        |      |	z  dk7  rt3        dt1        |      |	      t1        |      |	z  }t'        t5        |      D cg c]  \  }}||z  dk7  r| c}}      }|j7                          |j#                         }t        j!                  d       t        j!                  |j%                  d             t9        d |D              dz  |	z  }t        j!                  d|       |S # 1 sw Y   xY wc c}w c c}}w )r   r   r<   r   Tr   r   r+   r   Nr   r   r   r   zContext Syncr   zYFailed to divide all profiling events into #repeat groups. #CUDA events: %d, #repeats: %szprofiling time breakdown)r   c              3  4   K   | ]  }|j                     y wrw   r   r   s     rJ   r   z+do_bench_using_profiling.<locals>.<genexpr>R  s     A%e%%Ar   r   r   )rC   r<   r   r   rx   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rO   r   r   rN   r   rE   RuntimeError	enumerate_build_treesum)r   r   r   r   r   r   r   r   r   r   r   r   r   r   num_event_per_groupactual_eventsr   s                    rJ   do_bench_using_profilingr     s    D	JJKKJuyyHE **"""6K

  t 4I1X 
 	JJ**959K 1c&;./0H1c#+,-H 8_ 
 
JJ			NN++00
 
  
 ! 
x 	AKKMD		 	

 ! IIlIIann$$-EQS$TU 	
  JOO3

n8T 	
O ?h&!+- 	
 	
 o.9 &o6	
5&&!+ 	
M !..0MII()IIm!!B!/0
A=A
AF
JX
UCII(#.J_! !$	
	
s   6AN$5N(N 
Nc                    	 ddl m}  t        j                  j	                  dd       | d uxr% t        t        t        j                  dd       d      S # t        $ r Y yt        $ r}dt        |      v sJ Y d }~yd }~ww xY w)	Nr   )	roi_alignztorchvision::nmsMetatorchvisionr   Fztorchvision::nms does not exist)torchvision.opsr   rC   _C%_dispatch_has_kernel_for_dispatch_keyhasattrrB   opsImportErrorr   str)r   r   s     rJ   has_torchvision_roi_alignr   W  s|    -667I6R$ 
EII}d3[*
 	
   0CF:::s   AA 	A?A?&A::A?c                b   | t        j                  d      j                  S t        | t              rt        j                  |       } | j
                  dvrZ| j                  Nt        | j
                        }t        j                  | j
                  |j                  j                               S | S )Ng        )cpumeta)index)
rC   r   r   ri   r   typer   rL   Workercurrent_devicer   device_interfaces     rJ   decode_devicer   g  s    ~||C '''&#f%{{/)fll.B3FKK@||FKK/?/F/F/U/U/WXXMrg   c                |    t        j                  t        j                  | t        j
                  j                        S rw   )	functoolsreduceoperatormulrj   SOne)its    rJ   sympy_productr  r  s#    HLL"eggkk::rg   c           	         t        |       t        |      k(  sJ t        j                  t        d t	        | |      D                    S )Nc              3  ,   K   | ]  \  }}||z    y wrw   r   )r   abs      rJ   r   zsympy_dot.<locals>.<genexpr>x  s     >daAE>s   )rE   rj   expandr   r   )seq1seq2s     rJ   	sympy_dotr	  v  s8    t9D	!!!<<>c$o>>??rg   c                \    | D ci c]  }t        |      | c}j                         S c c}w rw   )r   values)r   rG   s     rJ   uniquer  {  s'     !BqE1H!((**!s   )c           
     n   t        | t        j                        st        |t        j                        r2t        t        j                  |       t        j                  |            S t        | t
              rt        |t
              s$J |  dt        |        d| dt        |              t        | |      S )Nz: , )ri   rj   ExprrR   sympifyrx   r   runtime_ceildiv)numberdenoms     rJ   r\   r\     s     &%**%E5::)Fu}}V,emmE.BCC fc"z%'= ("T&\N"UG2d5k];= 65))rg   c                f   | yt        |       j                  d      d   }i dddddd	d
ddddddd	dddddddddddddddddd d!d"dd#d$d%d&}|j                  t        |j	                               D ci c]  }|| c}       t        | t               r| S d'||    S c c}w )(Nz*i8.r   booli1
float8e4nvfp8e4nvfloat8e5fp8e5float8e4b15fp8e4b15float8e4b15x4
fp8e4b15x4float8_e4m3fnfloat8_e5m2float8_e8m0fnuu8float4_e2m1fn_x2r   fp16bfloat16bf16float32fp32float64fp64int8i8int16i16int32i32int64i64u16u32u64)uint8uint16uint32uint64*)r   splitupdatelistr  ri   )key	dtype_strtysrs   s       rJ   _type_ofrB    sR   
 {Cs#B'Ii 	G 	z	
 	 	 	w 	$ 	D 	6 	F 	6 	6  	!" 	#$ 	%& 	'( /C4 JJd3::<01112S#&3@aI/?,@@ 2s   
B.c                R    | D cg c]  }t        j                  |       c}S c c}w )z
    Gets the shape and stride of a tensor. For non-symbolic tensors, this is
    trivial. But for symbolic tensors, we need to map from SymIntNode into
    sympy.Expr.
    )rj   r  lstr   s     rJ   convert_shape_to_inductorrF    s!     '**EMM!***s   $c                    ddl m} t        | t              r| S t        | t        j
                        rt        |       S |j                  j                  j                  j                  | d      S )zL
    Like convert_shape_to_symint, but operates on a single expression.
    r+   VN)hint)
virtualizedrI  ri   rx   rj   ry   graphsizevars	shape_envcreate_symintnode)r   rI  s     rJ   convert_to_symintrP    se      a 	

 !U]]+ F	 !!++==ad=Krg   c                >    | D cg c]  }t        |       c}S c c}w )zz
    Takes a list of shapes from Inductor and converts them into symints (or just
    ints if all shapes are static).
    )rP  rD  s     rJ   convert_shape_to_symintrR    s     +..Qa ...s   c                N    t        d | j                  j                  D              S )z-
    Does this op overload have aliasing
    c              3  8   K   | ]  }|j                   d u  y wrw   )
alias_infor   r  s     rJ   r   zis_view.<locals>.<genexpr>  s     FAq||4'Fs   )any_schema	argumentsops    rJ   is_viewr\    s     F1E1EFFFrg   c                     yNFr   )r   s    rJ   <lambda>r_        rg   c                   | j                   dk(  syt        | j                  t        j                  j
                        s| j                  t        j                  u syt        t        j                  j
                  | j                        }|t        j                  u st        |      rt        fd| j                  D              S t        j                  j                  |j                  v xs  |      S )z
    Do all uses of this op have torch.Tag.pointwise or return True for optional `is_pointwise_fn`

    Uses in views ops will follow the views uses
    call_functionFc              3  6   K   | ]  }t        |        y wrw   )is_pointwise_use)r   uis_pointwise_fns     rJ   r   z#is_pointwise_use.<locals>.<genexpr>  s     KA#A7Ks   )r[  ri   targetrC   _ops
OpOverloadr   getitemr   r\  rm   usersTag	pointwisetags)userf  rg  s    ` rJ   rd  rd    s     66_$3::uzz445xGWGW9W%**''4F!!!WV_KKKK99&++-H1HHrg   	list[Any]c           	        t         j                  j                         g dfd} j                  | gt	        t         j
                  |||f       }t        | j                  j                        dk(  r2t        | j                  j                  d   j                        dk(  r|f}j                  |       t         j                  j                  i       }|fS )Nc                `    j                  |        j                  dt                     S )Narg)appendplaceholderrE   )rs  g
graph_argss    rJ   add_tensor_argz)gen_gm_and_inputs.<locals>.add_tensor_arg  s,    #}}s3z?"3455rg   r+   r   Tensor)rs  torch.Tensorr~   r*   )rC   fxGraphrb  r   ry  rE   rX  returnsr   r   outputr(   )rg  rp   kwargsrx  nodegmrv  rw  s         @@rJ   gen_gm_and_inputsr    s     	A%'J6 1??u||^dF^LD 	FNN""#q(&&q)../8;wHHTN			b!	$Bz>rg   c                h    | dk(  ry t        |       }|j                         r|j                          y y Nr   )rL   rD   r   r   s     rJ   r   r     s4    /7$$&$$& 'rg   c                    t        |       t        j                  d       t        j                         }t        |      D ]  } | | }t        |        t        j                         }J ||z
  S )Ni9  )r   rC   manual_seedtimeperf_counterr   )modelexample_inputsr   r   t0r   resultt1s           rJ   timedr    sr     	d				B5\ 'F 
			B7Nrg   c                    t        j                  t        |      D cg c]  }t        | |||       c}      }t        j                  |      |z  }t        ||z  d       |j                         S c c}w )Nz.6f)rC   r   r   r  medianprintr   )	r  r  r   repeatbaseliner   r   timingstooks	            rJ   print_performancer  *  sg     ll>CFmLuneV	4LG << 5(D	TH_S!#99;	 	Ms   A1c                H     t        | |             t        | |fd       y)zKReplace obj.method() with a new method that returns a precomputed constant.c                      S rw   r   )r  s   rJ   r_  z#precompute_method.<locals>.<lambda>=  s     rg   N)rB   setattr)objmethodr  s     @rJ   precompute_methodr  :  s     !WS&!#FC(rg   c                *    |D ]  }t        | |        y)zFReplace methods with new methods that returns a precomputed constants.N)r  )r  methodsr  s      rJ   precompute_methodsr  @  s     '#v&'rg   c                <    t        | |kD        t        | |k        z
  S rw   )rx   )r  r  s     rJ   cmpr  F  s    q1u:AE
""rg   c                ~    t        | t              r| g|z  S t        |       dk(  r t        |       | d   g      |z  S | S )Nr+   r   )ri   rx   rE   r   )rG   sizes     rJ   pad_listliker  J  sC    !SsTz
1v{tAw!v%%Hrg   c                D    t        |       dk(  rg S dd}t        | |      S )Nr   c                n    t        | t              r| S ddlm} t        | |      sJ | j	                         S )Nr+   )r:   )ri   r   	schedulerr:   get_name)elemr:   s     rJ   	sort_funcztuple_sorted.<locals>.sort_funcW  s1    dC K0$ 1222}}rg   r?  )r  r^   r~   r   )rE   sorted)rG   r  s     rJ   tuple_sortedr  S  s&    
1v{	 !##rg   PRV)	covariantc                  &    e Zd Zedd       ZddZy)CachedMethodc                     y rw   r   )r   s    rJ   clear_cachezCachedMethod.clear_cacheh  s    ),rg   c                     y rw   r   selfrp   r  s      rJ   __call__zCachedMethod.__call__k  r`  rg   N)r   r   r~   None)rp   P.argsr  P.kwargsr~   r  )r   r   r   staticmethodr  r  r   rg   rJ   r  r  g  s    , ,Drg   r  c           	         | j                   }d| dd| i}t        d| d d dj                         |        t        j                  |       || d         }d
fd	}||_        |S )N___cacher   z        def zC_cache_on_self(self):
            try:
                return self.zy
            except AttributeError:
                pass
            rv = fn(self)
            object.__setattr__(self, "z%", rv)
            return rv
        _cache_on_selfc                8    t        |       rt        |        y y rw   )r   delattrr  r?  s    rJ   r  z"cache_on_self.<locals>.clear_cache  s    4D# rg   )r  r   r~   r  )r   execlstripr   wrapsr  )r   r   ctxwrapperr  r?  s        @rJ   cache_on_selfr  o  s    ;;DtfF
C *CF  E "' (+e ,			 FH "ioob!#n&=">?G &GNrg   c           
     ^   ddl m} t        | t              rgt	        j
                  t        j                  | D cg c]0  }t        |d      r"|j                  r|j                  j                  2 c}t                     S t        | |j                        r| j                  S t               S c c}w )Nr+   irr  ) r  ri   r>  r   r   r   or_r   r  originsr   r2   )node_scheduler  r  s      rJ   aggregate_originsr    s     -&LL *4(TYY 		!!
 L
 	
 
M2??	3$$$|s   5B*
c                   t        |       }|dk(  rq|D cg c]Q  }|j                  dk(  r@d|j                  v r2|j                  d   #|j                  d   j                  j                  S }}t        t        |            }n|dk(  rg }|D ]y  }|j                  dk(  sd|j                  v s"|j                  d   d   }t        |d   t              r|j                  |d          \|j                  |d   j                         { t        t        |            }n5|dk(  r*|D cg c]  }|j                  dk(  s|j                    }}nt        |}dj                  d	g|z         S c c}w c c}w )
Noriginal_atenrb  rC   source_fn_stackr   r+   inductor_noder   fused)r  r[  r   _overloadpacketr   r  r   ri   r   rt  r   NotImplementedErrorjoin)r  descriptive_namesall_originsoriginsources	source_fns         rJ   get_fused_kernel_namer    sm    $M2KO+ &
yyO+6;;.O,8	 KK(88AA
 
 G,-	g	%! 	:FyyO+0AV[[0P"KK(9:2>	ilC0NN9Q<0NN9Q<#8#89	: G,-	o	-&1
"VYY/5QFKK
 
 "!G88WI'((5
(
s   AE(%E-:E-c                   t        |       }|D cg c]  }|j                  dk(  s| }}t        j                  t              }t        j                  t              }d t        |      rt        d |D              }t        |      dk(  r_|d   j                  t        d      s/t        j                        D 	ci c]  \  }}	|	|
 }
}}	|
_        |j                  fd       |D ]  }d|j                  v rO|j                  d   @t        |j                  d   j                        }||   j!                  |j"                         d	|j                  v so|j                  d	   d   j"                  }||   j!                  |j"                          d
nd}|j$                   d| ddj'                  |j)                                ddj'                  |j)                                d}|j$                   dg}t+        |j-                               D ]@  \  }}|j!                  |j$                   d| ddj'                  t+        |                    B S|j!                  |j$                   d       |D ]0  }	|j!                  |j$                   d|	j/                                 2 |dj'                  |      fS c c}w c c}	}w )Nrb  c              3  4   K   | ]  }|j                     y wrw   )rL  )r   ns     rJ   r   z&get_kernel_metadata.<locals>.<genexpr>  s     "Cq177"Cr   r+   r   )_inductor_kernel_metadata_node_to_idx_mapc                "    j                   |    S rw   )r  )r  single_graphs    rJ   r_  z%get_kernel_metadata.<locals>.<lambda>  s    lTTUVW rg   r  r  	from_nodezTopologically SortedUnsorted z Source Nodes: [r  z], Original ATen: []z" Source node to ATen node mapping:z   z => z Graph fragment:
)r  r[  collectionsdefaultdictr>  rE   r   rL  r   r   nodesr  sortr   r   r  rt  r   commentr  keysr  itemsformat_node)r  r  r  r  inductor_nodesfrom_node_dictoriginal_aten_dictunique_graphsidxr  node_to_idx_mapr  r?  sort_strmetadatadetailed_metadataoriginal_noder  r  s                     @rJ   get_kernel_metadatar    s    $M2K+6W&)):VfWNW ,,T2N$006
 L
>""CN"CC}")!,22L<)TU8A,BTBT8U"Vfc11c6"V"VIXFW     2dii'DIIo,F,Rdii0@@ACs#**4995$))#))K(+00C3&&tyy12 *6)A%zH??
1XJ&6tyyATATAV7W6X Y99%7%<%<%>?@	C  $OO,,NOP &~';';'= > 
u  s=/diiu6N5OP	

   GOO#44D!EF 	OA $$'8AMMO;L%MN	O
 TYY0111c X #Ws   KK>Kc                    t        |       } t        |       }| rV| j                         }|j                  D ]4  }|r	 ||      r||vs|j	                  |       | j                  |       6 | rV|S )zJReturns the set of nodes whose values depend on those within initial_queue)r>  r   rF   rk  addrt  )initial_queueskip_filterdominated_setr  users        rJ   dominated_nodesr    sz    
 'M}-M
  "JJ 	+D{40=(!!$'$$T*	+  rg   c                   dd l }ddlm dfd|j                         D cg c]  } |      s|j                   }}| D cg c]  } |      s|j                   }}t         |j                  g ||       S c c}w c c}w )Nr   r+   r  c                    t        | j                        r | j                        S t        | j                        r | j                        S t        | j                        xr t        | j
                        S rw   )ri   	TensorBoxdata
StorageBoxr4   	Pointwise)r  r  is_unrealized_nodes    rJ   r  z*gather_origins.<locals>.is_unrealized_node  s^    a&%aff--a'%aff--!RYY'GJq",,,GGrg   )r  r4   r~   r  )	itertoolsr  r  r  r  r   chain)	rp   r  r  valkwarg_originsrs  arg_originsr  r  s	          @@rJ   gather_originsr    s     H -3MMOWS?QRU?VS[[WMW*.J32DS2I3;;JKJoiooC{C]CDD XJs   BBBBc                J    dddfddfddfd |       S )z
    Normal sympy str is very slow, this is a lot faster.  The result are
    somewhat worse, as it doesn't do as much simplification.  So don't
    use this for final codegen.
    c                    t        | t        j                        xr, t        | j                        dk(  xr | j                  d   dk(  S )N   r   r   )ri   rj   MulrE   rp   )exprs    rJ   is_neg_leadzsympy_str.<locals>.is_neg_lead,  s:    tUYY'VC		Na,?VDIIaLTVDV	
rg   c                `   t        | t        j                        rt        | j                        dk(  rO | j                  d         r: | j                  d          d | j                  d   j                  d          S dj                  t        | j                              S  |       S )Nr  r+   r   z - z + )ri   rj   rk   rE   rp   r  rn   )r  r  sympy_str_muls    rJ   sympy_str_addz sympy_str.<locals>.sympy_str_add1  s    dEII& 499~"{499Q<'@'		!56c-		RSHYHYZ[H\:]9^__zz#mTYY"?@@ &&rg   c                    t        | t        j                        rE |       rd | j                  d          S dj	                  t        | j                              S  |       S )N-r+   z * )ri   rj   r  rp   r  rn   )r  r  sympy_str_atoms    rJ   r  z sympy_str.<locals>.sympy_str_mul<  s[    dEII&4  >$))A,7899zz#ndii"@AA!$''rg   c                   t        | t        j                        r| j                  S t        | t        j                  t        j
                  f      rd |        dS t        | t        t        t        t        f      rC| j                  j                   ddj                  t        t        | j                               dS t!        |       S )N()r  )ri   rj   Symbolr   rk   r  rV   rS   rT   rU   funcr   r  rn   	sympy_strrp   r   )r  r  s    rJ   r  z!sympy_str.<locals>.sympy_str_atomG  s    dELL)99uyy%))45}T*+1--(HMNii(()499SDII5N+O*PPQRRt9rg   )r  r}   r~   r  r  r}   r~   r   r   )r  r  r  r  r  s    @@@@rJ   r#  r#  %  s$    

	'	( rg   c                    ddl m} t        j                  r3t	        |j
                  dd       x}r|j                  dk7  rt        |       S t        j                         S )Nr+   rH  current_node
index_expr)
rK  rI  r[   compute_all_boundsrB   interpreterrg  rY   rZ   unknown)r   rI  fx_nodes      rJ   get_bounds_index_exprr,  T  sN     	!!~tDDWDNNl*5!!""$$rg   c                    | d   dk(  S )Nr   rr   )prefixs    rJ   prefix_is_reductionr0  b  s    !9rg   c                J    | t         j                  k7  sJ t        | |dd      S )9
    Used to generate an integer-nonnegative symbol.
    Tintegernonnegative)rX   SIZErW   )r/  r  s     rJ   sympy_index_symbol_with_prefixr7  f  s)     TYY vsDdCCrg   c                N    | xs t         j                  xr t         j                  S rw   )r[   debug_index_assertsassert_indirect_indexing)checks    rJ   generate_assertr<  r  s    /V//TV5T5TTrg   c                F    | d   dk7  sJ t        j                  | dd      S )r2  r   r   Tr3  )rj   r!  r   s    rJ   sympy_index_symbolr?  v  s)     7c>> <<d==rg   c                    	 	 	 	 	 	 dd}t        j                  |       j                  |j                         D ci c]  \  }}| |||       c}}      S c c}}w )z
    When the passed replacement symbol v is a string, it is converted to a symbol with name v that
    have the same replaced expression integer and nonnegative properties.
    c                    t        | t        j                        sJ t        |t              r,t        j                  || j
                  | j                        S |S )Nr3  )ri   rj   r  r   r!  r   is_nonnegative)replacedreplacements     rJ   	to_symbolzsympy_subs.<locals>.to_symbol  sP     (EJJ///k3'<< ++$33  rg   )rC  r}   rD  zUnion[sympy.Expr, str]r~   sympy.Symbol)rj   r  xreplacer  )r  replacementsrE  krs   s        rJ   
sympy_subsrJ    sf    +A	 ==''(4(:(:(<=1IaO	= =s   A
c                    t        | t        j                        xs^ t        | t        j                        xrB t	        d t        j                  | j                         | j                               D              S )Nc              3  2   K   | ]  }t        |        y wrw   is_symbolicr   rG   s     rJ   r   zis_symbolic.<locals>.<genexpr>  s     N1AN   )	ri   rC   r&   ry  rW  r  r  r  stride)r  s    rJ   rN  rN    sS    a& 1ell# 	ON	!((*(MNNrg   c                 &    t        d | D              S )Nc              3  2   K   | ]  }t        |        y wrw   rM  rV  s     rJ   r   z"any_is_symbolic.<locals>.<genexpr>  s     ,!{1~,rP  rW  )rp   s    rJ   any_is_symbolicrU    s    ,t,,,rg   c                T   ddl m} t        g d      }t        j                         r|j                  d       | j                  j                  D ]  }t        |j                        |v r|c S t        j                  j                  j                  slt        |j                  t        j                  j                        r>t        j                   j"                  j$                  |j                  j&                  v r|c S |j(                  j+                  d      x} ||      s|c S  y )Nr   )free_unbacked_symbols)z,aten._fused_moving_avg_obs_fq_helper.defaultz7aten._fused_moving_avg_obs_fq_helper_functional.defaultzfbgemm.dense_to_jagged.defaultz%fbgemm.jagged_to_padded_dense.defaultrun_and_save_rng_staterun_with_rng_statezaten._local_scalar_densezaten._assert_scalar)zaten._unsafe_index_put.defaultz0aten._unsafe_masked_index_put_accumulate.defaultzaten.index_put.defaultzaten.index_put_.defaultzaten.scatter.srczaten.scatter.reducezaten.scatter.value_reducezaten.scatter_add_zaten.scatter_add.defaultzaten.scatter_reduce.twozaten.scatter_reduce_.twozaten.scatter_reduce.two_outr  )%torch.fx.experimental.symbolic_shapesrW  r   rC   $are_deterministic_algorithms_enabledr=  rL  r  r   rg  	_inductorr[   graph_partitionri   rh  ri  r   rl  cudagraph_unsafern  r   get)r  rW  forbidden_setr  r  s        rJ   %get_first_incompatible_cudagraph_nodera    s     L	
M  113	
"  t{{},K &&664;;

(=(=>--1A1AA
 K99==''C49Ns9SK" rg   c                    t        t        t        | j                  j                                    }|j
                  dk(  sJ |S )z$Get the output node from an FX graphr~  )nextiterreversedrL  r  r[  )r  	last_nodes     rJ   output_noderg    s6    T(288>>234I<<8###rg   c                    | j                   j                  d      }t        d |D              }t        |       j                  d   }t        |t              r|n|f}t        d |D              }||z  S )Nru  rZ  c              3     K   | ]P  }t        |j                  j                  d       t        j                        r|j                  d    j
                   R ywr  N)ri   r   r_  rC   ry  r   )r   r  s     rJ   r   z"get_all_devices.<locals>.<genexpr>  sB      9diimmE*ELL9 			%9s   AAr   c              3     K   | ]t  }t        |t        j                  j                        rNt        |j                  j                  d       t        j                        r|j                  d    j                   v ywrj  )ri   rC   r{  r*   r   r_  ry  r   )r   rs  s     rJ   r   z"get_all_devices.<locals>.<genexpr>  sS      7c588==)sxx||E*ELL9 	7s   A:A<)rL  
find_nodesr   rg  rp   ri   tuple)r  placeholder_nodesinput_devicesout_argout_argsout_devicess         rJ   get_all_devicesrs    s}    ++}+=.8 9%9 /M "o""1%G$We4w7*H,6 77 -K ;&&rg   c                    t        t        j                  j                               D ]'  } | j	                  d      st        j                  |    }|j
                  j                         D ]  }|j	                  d      st        ||      }t        |t        j                  j                  j                  j                        sZ|j                  D ]i  }t        |t        j                  j                  j                  j                        s<|j                  j                   j"                  j%                          k  t        j                  | = * dt        j                  v rRt        j                  d   }t'        |j(                  j*                  j,                        `|j(                  j*                  `t1        j2                          y )Nz&torch._inductor.runtime.compile_tasks.triton_ztriton.runtime.driver)r>  sysmodulesr  
startswith__dict__rB   ri   rC   r\  runtimetriton_heuristicsCachingAutotunercompile_resultsTritonCompileResultkernelrunmod__del__r   driveractiveutilsinstancegccollect)module_namem	attr_namer  r  r  s         rJ   unload_xpu_triton_pydsr    sR   CKK,,./ %%%&NOKK$* 	<I##I. I.EOO33EEVV #)"8"8 <%"!OO33EEYY #MM--1199;<	< KK$!%& #++-kk12""(()2JJ#JJLrg   _registered_cachesc                    t        | d      rt        | j                        st        |  d      t        j                  |        | S )zh
    Use this decorator to register any caches that should be cache_clear'd
    with fresh_cache().
    cache_clearz# does not have a cache_clear method)r   callabler  AttributeErrorr  rt  r  s    rJ   clear_on_fresh_cacher    s?    
 3&hs.Gu$GHIIc"Jrg   c                 :    t         D ]  } | j                           y)z&
    Clear all registered caches.
    N)r  r  r  s    rJ   clear_cachesr  *  s     " rg   c              #  h  K   t                t        j                  |      	 t        j                  j                  t        j                  di      5  t        j                  d       t        j                  j                  d      }t        j                  j                  t        j                  d|i      5  d t        | t
              rt        |       dk(  sJ d       t        j                  j                  |      rtt        j                  |      }| j!                  |D ci c]D  }d	|vr>|t        j                  j#                  t        j                  j                  ||            F c}       ddd       ddd       |rLt%               r(t&        j(                  j+                         r
t-                t/        j0                  fd
       t                yc c}w # 1 sw Y   oxY w# 1 sw Y   sxY w# t2        $ r t        j5                  d        w xY w# t                w xY ww)z
    Contextmanager that provides a clean tmp cachedir for pt2 caches.

    Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
    generated with this cache instance.
    )dirTORCHINDUCTOR_CACHE_DIRzUsing inductor cache dir %stritonTRITON_CACHE_DIRNr   z!expected empty cache_entries dictz.lockc                4    t         j                  d|      S )Nz*Failed to remove temporary cache dir at %s)exc_info)r   warning)r"  pathr  inductor_cache_dirs      rJ   r_  zfresh_cache.<locals>.<lambda>]  s    S[[@&% 6A 6 rg   )onerrorz(on error, temporary cache dir kept at %s)r  tempfilemkdtempr   patchdictosenvironr   r   r  r  ri   rE   existslistdirr=  getsize
is_windowsrC   r>   rD   r  shutilrmtree	Exceptionr  )cache_entriesr  deletetriton_cache_dirfilesfr  s         @rJ   fresh_cacher  2  s     N!))c2&ZZ__JJ24FG
 	 II35GH!ww||,>I.@BR-ST mT2}-2W4WW2ww~~&67 "

+; <%,, */$%#*!#3 !"277??277<<@PRS3T#U U	$ |		 6 6 8&(MM"
 	3 	 	B  >@RS 	sn   !H20G? A'G3<A-G')A	G"2G'9G3AG? H2"G''G0	,G33G<8G? ?!H  H# #H//H2c           	         | j                   }t        t        |             }t        t	        t        ||d                  S )NT)r?  reverse)__getitem__r   rE   r>  re  r  )seqgettera_rs      rJ   argsortr  p  s1    __F
C/C>?@@rg   c           	     2    d fd}t        |      D cg c]9  \  }}|t        |t        j                        r|j                  j
                  n|f; }}}t        |t        j                  |            }|D cg c]  \  }}|	 }}}|S c c}}w c c}}w )Nc                n    | \  }}|\  }}dfd} |||k        ry |||kD        ry||k  ry||kD  ryy)Nc                N    t        | t              r| S j                  | d      S )NT)size_oblivious)ri   r  evaluate_expr)r  rN  s    rJ   evaluatez*argsort_sym.<locals>.cmp.<locals>.evaluate~  s(    $%**4*EErg   r   r+   r   )r  z%Union[bool, torch.SymInt, sympy.Expr]r~   r  r   )r  r  a_idxa_valb_idxb_valr  rN  s          rJ   r  zargsort_sym.<locals>.cmpz  sT    uu	F
 EEM"EEM"
 5=5=rg   r  )r  tuple[int, sympy.Expr]r  r  r~   rx   )	r   ri   rC   r&   r  r  r  r   
cmp_to_key)rN  r  r  r  r   exprsr   r  s   `       rJ   argsort_symr  w  s    4  nC 
Z5<<8affkka@E  5i22378E %&fc1c&F&M
 's   >B<Bc                t    | t         j                  k(  ryt        j                  d|       j                         S )Nrb   r   r   )rC   r:  r   element_sizer  s    rJ   get_dtype_sizer    s-     ;;r'4466rg   c                      e Zd ZU ded<   y)LineContextr   contextNr   r   r   r   r   rg   rJ   r  r    s    Lrg   r  c                  "    e Zd ZU ded<   ded<   y)ValueWithLineMapr   r{   zlist[tuple[int, LineContext]]line_mapNr  r   rg   rJ   r  r    s    J++rg   r  c                      e Zd ZdZdddZej                  dd       ZddZddZ	ddZ
ddZddZdd	Zdd
ZddZ	 	 	 	 ddZdddZdd dZdd dZ	 d!	 	 	 	 	 d"dZd#dZddZd$dZy)%IndentedBuffer   c                     g | _         || _        y rw   )_lines_indent)r  initial_indents     rJ   __init__zIndentedBuffer.__init__  s    GI%rg   c              #  b   K   | j                   }	 || _         d  || _         y # || _         w xY wwrw   )tabwidth)r  r  prevs      rJ   set_tabwidthzIndentedBuffer.set_tabwidth  s,     }}	!$DM DMDDMs   /# /	,/c                   t               }d}g }| j                  D ]  }t        |t              r
 |       }|1t        |t              r|j                  ||j                  f       K|}t        |t              sJ |j                  |       |j                  d       |d|j                  d      z   z  } t        |j                         |      S )Nr+   r  )r
   r  ri   DeferredLineBaser  rt  r  r   writecountr  getvalue)r  bufr   linemaplilines         rJ   getvaluewithlinemapz"IndentedBuffer.getvaluewithlinemap  s    j13++ 	&B"./t<B,2::/dC(((IIdOIIdOTZZ%%%A	&  88rg   c                6    | j                         j                  S rw   )r  r{   r  s    rJ   r  zIndentedBuffer.getvalue  s    '')///rg   c                f   t               }| j                  D ]  }t        |t              r
 |       }|t        |t              r.|}t        |t
              sJ |j                  d      r|j                  |d d        h|j                  |       |j                  d        |j                         S )N\r   r  )	r
   r  ri   r  r  r   endswithr  r  )r  r  r  r  s       rJ   getrawvaluezIndentedBuffer.getrawvalue  s    j++ 	 B"./t<B,dC(((}}T"		$s)$		$		$	   ||~rg   c                8    | j                   j                          y rw   )r  clearr  s    rJ   r  zIndentedBuffer.clear  s    rg   c                ,    t        | j                        S rw   )r  r  r  s    rJ   __bool__zIndentedBuffer.__bool__  s    DKK  rg   c                :    d| j                   | j                  z  z  S )Nr  )r  r  r  s    rJ   r/  zIndentedBuffer.prefix  s    dllT]]233rg   c                &    | j                  d       y )Nr  	writeliner  s    rJ   newlinezIndentedBuffer.newline  s    trg   c                   t        |t              r| j                  j                  |       y t        |t              r9| j                  j                  |j                  | j                                      y |j                         r.| j                  j                  | j                          |        y | j                  j                  d       y Nr  )ri   r  r  rt  r  with_prefixr/  stripr  r  s     rJ   r  zIndentedBuffer.writeline  s    dK(KKt$./KKt//>?ZZ\KK$++-78KKr"rg   c                4    |D ]  }| j                  |        y rw   r  )r  linesr  s      rJ   
writelineszIndentedBuffer.writelines  s      	!DNN4 	!rg   c                H     t         j                  d fd       } |       S )Nc               3     K   xj                    z  c_         	 d  xj                    z  c_         y # xj                    z  c_         w xY wwrw   r  )offsetr  s   rJ   r  z"IndentedBuffer.indent.<locals>.ctx  s9     LLF"L'&&s   A4 AAAr~   Iterator[None])
contextlibcontextmanager)r  r  r  s   `` rJ   indentzIndentedBuffer.indent  s$    		"	"	' 
#	' urg   c                .    | xj                   |z  c_         y rw   r  r  r  s     rJ   	do_indentzIndentedBuffer.do_indent      rg   c                .    | xj                   |z  c_         y rw   r  r  s     rJ   do_unindentzIndentedBuffer.do_unindent  r  rg   c           	        t        |t              rt        d      }|j                  D ]E  }t        |t              r|st        |t        |      t        |j                               z
        }G t        j                  |      rd}|j                  D ]P  }t        |t              r| j                  j                  |       /t        j                  | |t        |      d         R y t        j                  |      }|r|j                         }|sy |j                         }|j!                  d      D ]  }| j                  |        y )Ninfr   r  )ri   r  floatr  r  minrE   r  mathisinfrt  r  rx   textwrapdedentrstripr<  )r  
other_coder  r  r  r   s         rJ   splicezIndentedBuffer.splice  s    j.15\F")) I!$4 TS5G)GHFI zz&!")) HdK0KK&&t,",,T4F3FG	H "4J'..0
#**,J%%d+ "q!"rg   c                    t        | j                        }| j                  D cg c]
  } ||       c}|_        |S c c}w N)r  )r  r  r  )r  r"  r   r  s       rJ   rn   zIndentedBuffer.map/  s4    DLL9-1[[9Td4j9

 :s   >c                @    t        |        d| j                          dS )Nr  r   )r   r  r  s    rJ   __repr__zIndentedBuffer.__repr__4  s     t*Qt}}/q11rg   c                    | j                   |j                   k(  sJ t        | j                         }|j                  | j                         |j                  |j                         |S r  )r  r  r   r  )r  otherr   s      rJ   __add__zIndentedBuffer.__add__7  sK    ||u}},,,DLL9t{{#u||$
rg   Nr   )r  rx   r~   r  )r  rx   r~   r  )r~   r  r~   r   r~   r  r~   r  )r  z)Union[LineContext, DeferredLineBase, str]r~   r  )r  z3Sequence[Union[LineContext, DeferredLineBase, str]]r~   r  ru   )r  rx   r~   'contextlib.AbstractContextManager[None])r  rx   r~   r  F)r  zUnion[IndentedBuffer, str]r  r  r~   r  )r"  zCallable[[Any], Any]r~   r  )r   r   r~   r  )r   r   r   r  r  r  r  r  r  r  r  r  r  r/  r  r  r   r	  r  r  r  rn   r  r!  r   rg   rJ   r  r    s    H& ! !9(0(!4#!H!	!	 EJ"4"=A"	"2
2rg   r  c                  (     e Zd Zd fdZddZ xZS )FakeIndentedBufferc                "    t         |           y rw   )superr  )r  	__class__s    rJ   r  zFakeIndentedBuffer.__init__A  s    rg   c                V    |dk(  rt         j                  | |      S t        d| d      )Nr,  zTried to call self.z on FakeIndentedBuffer. This bufferis currently used on TritonTemplateKernel to prevent actualwrites to the body without explicitly specifying the body with`TritonTemplateKernel.set_subgraph_body(name)`)object__getattribute__r   )r  r   s     rJ   r/  z#FakeIndentedBuffer.__getattribute__D  s;    ;**466!$ (= =
 	
rg   r$  )r   r   r~   r   )r   r   r   r  r/  __classcell__r,  s   @rJ   r)  r)  @  s    
rg   r)  c               #     K   t         j                  t         j                  }} 	 d  | |ct         _        t         _        y # | |ct         _        t         _        w xY wwrw   )rv  stdoutstderr)initial_stdoutinitial_stderrs     rJ   restore_stdout_stderrr7  O  s@     %(ZZNN@!/
CJ
CJs   !AA  A AAc                  P    e Zd ZdZddZddZddZddZddZddZ	ddZ
dd	Zy
)r  z.A line that can be 'unwritten' at a later timec                6    |j                         sd}|| _        y r  )r  r  r  s     rJ   r  zDeferredLineBase.__init__[  s    zz|D	rg   c                    t         )zJReturns either self.line or None to indicate the line has been 'unwritten'r  r  s    rJ   r  zDeferredLineBase.__call__`      !!rg   c                    t         )z3Returns a new deferred line with the same conditionr;  r  s     rJ   	_new_linezDeferredLineBase._new_lined  r<  rg   c                @    | j                  | | j                         S rw   r>  r  )r  r/  s     rJ   r  zDeferredLineBase.with_prefixh  s    ~~455rg   c                T    | j                  | j                  j                               S rw   )r>  r  r  r  s    rJ   r  zDeferredLineBase.lstripk  s    ~~dii..011rg   c                >    | j                  | j                  |         S rw   r@  )r  r   s     rJ   r  zDeferredLineBase.__getitem__n  s    ~~dii.//rg   c                ,    t        | j                        S rw   )r  r  r  s    rJ   r  zDeferredLineBase.__bool__q  s    DIIrg   c                ,    t        | j                        S rw   )rE   r  r  s    rJ   __len__zDeferredLineBase.__len__t  s    499~rg   N)r  r   )r~   zUnion[str, None])r  r   r~   r   )r/  r   r~   r   )r~   r   )r   zUnion[int, slice]r~   r   r%  r~   rx   )r   r   r   r   r  r  r>  r  r  r  r  rE  r   rg   rJ   r  r  X  s-    8
""620rg   r  c                  4     e Zd ZdZd fdZddZddZ xZS )DelayReplaceLinez6At end of codegen call `line.replace(key, value_fn())`c                @    t         |   |       || _        || _        y rw   )r+  r  r?  value_fn)r  r?  rJ  r  r,  s       rJ   r  zDelayReplaceLine.__init__{  s     rg   c                j    | j                   j                  | j                  | j                               S rw   )r  replacer?  rJ  r  s    rJ   r  zDelayReplaceLine.__call__  s#    yy  4==?;;rg   c                D    t        | j                  | j                  |      S rw   )rH  r?  rJ  r  s     rJ   r>  zDelayReplaceLine._new_line  s    $-->>rg   )r?  r   rJ  zCallable[[], str]r  r   r#  )r  r   r~   rH  )r   r   r   r   r  r  r>  r0  r1  s   @rJ   rH  rH  x  s    @!
<?rg   rH  c                   t        | t        j                        r| }nt        j                  t               |       }t	        j
                  |      }t        j                  j                  rC|j                  J |j                  dk  s|j                  dk(  rt        j                  d       yy|j                  dk(  rdnd}|j                  }||k  rt        j                  d	||d
       yy)N	   
   z6GPU arch does not support max_autotune_gemm mode usageFTr>   r_   D   z,Not enough SMs to use max_autotune_gemm mode)min_sms	avail_sms)extra)ri   rC   r   rK   r   createversionhipmajorr   r  r   multi_processor_count)index_or_devicer   proprR  rS  s        rJ   
is_big_gpur\    s    /5<<0 lno>""6*D }}zz%%%::>TZZ2-KKPQKK5(bbG**I7:%I> 	 	
 rg   c                 T    t         j                  j                  d      j                  S )Nr<   )rC   r<   get_device_propertiesrY  r   rg   rJ   get_max_num_smsr_    s    ::++F3IIIrg   c                 d    t         j                  j                         } t               | | z
  S dz
  S )zFHandle experimental carveout if set otherwise return hardware SM countr   )rC   r   _get_sm_carveout_experimentalr_  )carveouts    rJ   get_num_smsrc    s1     xx557HH,@HHaHHrg   c                    ddl m}m} |
t               }|j	                  d      }|| z  t
        z  } |||| |j                               S )zKBuilds and returns a WorkspaceArg for the device side TMA workspace buffer.r+   )r,   WorkspaceZeroModeF)r  	zero_moder   
outer_name)codegen.commonr,   re  rc  	from_boolTMA_DESCRIPTOR_SIZEunique_name)num_tma_descriptorsr   num_programsr,   re  rf  r  s          rJ   get_tma_workspace_argrn    sZ     @"}!++E2I--0CCD+<++-	 rg   c                    | j                   |vr!t        j                  d| j                   |       t        | j                  j
                        xr% | j                   |v xr t        | j                        S )NzDNot using template since dtype %s is not in allowed layout dtypes %s)r   r   r   is_gpur   r   r\  )layoutallowed_layout_dtypess     rJ   _use_template_for_gpurs    sf     ||00		RLL!	
 	v}}!!" 	&LL11	&v}}%rg   c                    | j                         t        j                  j                         j                  d      D cg c]  }|j	                          c}v S c c}w N,)upperr[   max_autotune_gemm_backendsr<  r  backendrG   s     rJ   _use_autotune_backendr{    M    ==?!<<BBDJJ3O	      Ac                    | j                         t        j                  j                         j                  d      D cg c]  }|j	                          c}v S c c}w ru  )rw  r[   max_autotune_conv_backendsr<  r  ry  s     rJ   _use_conv_autotune_backendr    r|  r}  F)enable_int32enable_float8c                  ddl m}m} t        j                  t        j
                  t        j                  g}|r>t        j                  t        j
                  t        j                  t        j                  g}|r/|j                  t        j                  t        j                  g       t        | j                  j                        xr t        | |      xs) | j                  j                  dk(  xr | j                  |v xrN t         j"                  xs t         j$                  xr* t'        d      xr  || j                  |j(                        S )Nr+   )BackendFeaturehas_backend_featurer   TRITON)rh  r  r  rC   r   r&  r(  r0  extendr   r!  rp  r   r   rs  r   r[   max_autotunemax_autotune_gemmr{  TRITON_TEMPLATES)rq  r  r  r  r  layout_dtypess         rJ   use_triton_templater    s     D]]ENNEMMBMu{{Se1153D3DEF v}}))* A)&-@O ""e+M0M		P   <F$<$<		P "(+		P  ~/N/NOrg   c                     ddl m}m} ddlm dfd |       rt
        j                  ryt
        j                  j                  xr  |       xr t        fd| D              S )	Nr   )has_triton_stable_tma_apihas_triton_tma_devicer+   rH  c                N   t        | j                               dk7  ry| j                         }|t        j                  t        j
                  t        j                  fvry| j                         }|j                         }|j                         s|sy|j                  d   }|r|j                  d   }|t        j                  k(  r'j                  j                  j                  |d      ry||j                  z  }j                  j                  j                  |t               S )Nr  Fr+   r       )rE   get_size	get_dtyperC   r   r&  r   
get_layoutis_transposedis_contiguousr  rL  rM  statically_known_ltitemsizestatically_known_multiple_ofTMA_ALIGNMENT)rG   r   rq  
transposed	inner_diminner_bytesrI  s         rJ   _is_tma_compatiblez3use_triton_tma_template.<locals>._is_tma_compatible  s    qzz|!8K8KLL))+
$$&*KKN	AIE'''AGG,<,<,P,Pr-
 %..0ww<<[-XXrg   Fc              3  .   K   | ]  } |        y wrw   r   )r   r  r  s     rJ   r   z*use_triton_tma_template.<locals>.<genexpr>  s     8!"1%8   rG   r4   r~   r  )
torch.utils._tritonr  r  rK  rI  r[   cpp_wrapperr  enable_persistent_tma_matmulrm   )matricesr  r  rI  r  s      @@rJ   use_triton_tma_templater    sT    TY2 !"v'9'9 	22 	9!#	98x88rg   c                
   ddl m} |j                  j                  j	                  ||z  |z  d      }|dk  s|t
        j                  j                  k  ryddlm	} t        j                  j                  ryt        j                  t        j                  t        j                  g}t!        | |      xr/ t
        j"                  xs t
        j$                  xr t'        d      }|r |       st(        j+                  d	       y|S )
Nr+   rH  r   fallbackr   F)try_import_cutlassCUTLASSzFailed to import CUTLASS lib. Please check whether _inductor.config.cuda.cutlass_dir is set correctly. Skipping CUTLASS backend for now.)rK  rI  rL  rM  	size_hintr[   r<   cutlass_backend_min_gemm_sizecodegen.cuda.cutlass_utilsr  rC   rV  rW  r   r&  r0  rs  r  r  r{  r   r  )	rq  r  r  rI  rI  	gemm_sizer  r  r   s	            rJ   use_cutlass_templater  !  s      **1q519r*BIA~V[[%N%NN> }} ]]ENNEKK@Mfm4 	-  <F$<$<	-!),  !#KK4
 Jrg   c                    t         j                  j                  j                         }|dk(  ry| j                         |j	                  d      D cg c]  }|j                          c}v S c c}w )z8Check if CUTLASS should be used for the given operation.ALLTrv  )r[   r<   cutlass_enabled_opsrw  r<  r  )op_nameenabled_opsrG   s      rJ   _use_cutlass_for_opr  A  sU    ++11779Ke==?+2C2CC2HIQqwwyIIIIs   A,r  r   )r_   r  ra   r`      r   _IntLikec           
     v   ddl m} |j                  j                  j	                  t        j                  t        j                  |t        | z        t        j                  |t        |z                    xrC |j                  j                   xr* |j                  j                   xr t        j                   S )Nr   rH  )torch._inductor.virtualizedrI  rL  rM  statically_known_truerj   AndGedecompose_k_thresholdaot_moder  r[   disable_decompose_k)r  r  rI  rI  s       rJ   use_decompose_k_choicer  T  s    - 	
..II1A561A56	
 	+    	+ ###	+ ***
rg   c                   t        |t        j                        r|j                  st        S t        | t        j                        r| j                  r&t        |t        j                        r|j                  sd}nt        || z  ||z        }d}t        j                  |      }|D cg c]  }||k  r||k\  r| }}g g g }	}}|D ]Z  }
||
z  }|dk  r||dz
  z  dk(  r|dk\  r|j                  |
       0|dz  dk(  r|j                  |
       J|	j                  |
       \ t        j                  dk(  r||z   |	z   S t        |      t        k\  r|S ||z   |	z   }|d t         S c c}w )Nr  r  r`   r+   r   r  
EXHAUSTIVE)ri   rj   r  	is_numberdefault_k_splitsr  divisorsrt  r[   max_autotune_gemm_search_spacerE   k_splits_limit)r  r  rI  max_k_splitmin_k_splitr  divisorpow_of_2_divisorsmul_of_32_divisorsrest_of_splitsdkPartbest_splitss                rJ   get_k_splitsr  d  s    !UZZ 1ejj!!++1ejj!!++!q&!q&)K~~a H  k!g&< 	H  =?B>) %Q 3; EAI!#$$Q'RZ1_%%a( !!!$%" ,,< #55FF /  '*<<~M?N++Gs   )E#c                T    t         j                  j                  |       j                  S rw   )rC   r<   r^  gcnArchNamer   s    rJ   _rocm_native_device_arch_namer    s    ::++F3???rg   c                     	 dd l } ddlm}m} ddlm} t        j                  j                  | j                        }||||fS # t        $ r dd}dd} G d d      }d }Y %w xY w)	Nr   )gen_ops_librarygen_ops_preselected)CKGemmOperationc                     g S rw   r   r   rg   rJ   r  z*try_import_ck_lib.<locals>.gen_ops_library      Irg   c                     g S rw   r   r   rg   rJ   r  z.try_import_ck_lib.<locals>.gen_ops_preselected  r  rg   c                      e Zd Zy)*try_import_ck_lib.<locals>.CKGemmOperationN)r   r   r   r   rg   rJ   r  r    s    rg   r  )r~   rp  )ck4inductor(ck4inductor.universal_gemm.gen_instancesr  r  ck4inductor.universal_gemm.opr  r  r  dirname__file__r   )r  r  r  r  package_dirnames        rJ   try_import_ck_libr    sl    	
	
 ''//+*>*>? O-@/QQ  			 	 s   ;A A#"A#c                   t         j                  st         j                  syt        j                  j
                  sy| j                  j                  dk(  syt        | j                        }t         j                  j                  D ci c]  }|j                  d      d   | c}xs |j                  d      d   |i}|j                         t         j                  j                  z  D cg c]  }||   	 }}|sy| j                  t        j                  t        j                   t        j"                  fvryt%               \  }}}}|st&        j)                  d       yt        j*                         r|t         j                  _        t         j                  j,                  st&        j)                  d       y|t         j                  j,                  k7  rt&        j)                  d       yyc c}w c c}w )	NFr<   :r   z,Please pip install Composable Kernel packagez,Please set TORCHINDUCTOR_CK_DIR env variablezInvalid path to CK libraryT)r[   r  r  rC   rV  rW  r   r   r  rocmarchr<  r  ck_supported_archr   r   r&  r(  r  r   r  	is_fbcodeck_dir)rq  native_archrI  requested_archsrequested_supported_archsck_package_dirnamer   s          rJ   use_ck_templater    s   6#;#;====' 0>K39;;3C3CDaqwws|A)D #q!;IO
 !%%'&++*G*GG! 	! ! %||EMM5>>5==II"3"51aBC/;;BCV[[///01= E!s   G-,G2c                    ddl m} t        d      xr= t        |       xr0 |j                  j
                  j                  ||z  |z  d      dkD  S )Nr+   rH  CKr   r  r   rK  rI  r{  r  rL  rM  r  rq  r  r  rI  rI  s        rJ   use_ck_gemm_templater    sR     	d# 	CF#	CGG&&q1uqy2&>Brg   c                    ddl m} t        d      xr= t        |       xr0 |j                  j
                  j                  ||z  |z  d      dkD  S )Nr+   rH  CKTILEr   r  r   r  r  s        rJ   use_ck_tile_gemm_templater    sR     	h' 	CF#	CGG&&q1uqy2&>Brg   c                2    t        d      xr t        |       S )Nr  )r  r  rq  s    rJ   use_ck_conv_templater     s    %d+G0GGrg   c                |    t         j                  xs t         j                  xr | j                  j                  dk(  S r  )r[   r  r  r   r   r  s    rJ   _use_template_for_cpur    s2    7v77&
--


%&rg   c                    ddl m} t        |j                  |      sJ t	        | ||d      xr |j                  j                         S )Nr+   )r5   F)require_constant_mat2)r  r5   ri   rq  use_cpp_gemm_templater  )rq  mat1mat2r5   s       rJ   use_cpp_bmm_templater  
  sE     dkk6*** 	fdDN 	(KK%%'rg   c                `   ddl m} ddlm} ddlm}	 ddlm}
 t        |       rt        d      syt        j                  j                  sy|j                         t        j                  t        j                   fv }t        j"                  t        j$                  t        j&                  t        j                  g} |
|||r| j(                  nd ||      \  }}}} }}t+        ||f      ryt-        ||j.                        r|j1                         } |	|j                               \  }} |d	||||j                         |j                         |t3               | |

      }dd}| j(                  |v xr= |d uxr7  ||      xr- t-        ||j4                        xr |j7                         xs | S )Nr+   r  )create_micro_gemm)*get_gemm_template_output_and_compute_dtype)mm_argsCPPF)	out_dtypemat2_transposeduse_4x2_dim
micro_gemm)input_dtypeinput2_dtypeoutput_dtypenum_threadsuse_refq_group_sizec                N    | j                          | j                         d   dk(  S )Nr   r+   )freeze_layout
get_striderG   s    rJ   is_last_dim_stride1z2use_cpp_gemm_template.<locals>.is_last_dim_stride1J  s"    	||~b!Q&&rg   r  )r  r  codegen.cpp_micro_gemmr  codegen.cpp_utilsr  kernel.mm_commonr	  r  r{  r[   cppweight_prepackr  rC   r7  r,  r(  r&  halfr   has_free_symbolsri   BaseViewunwrap_viewparallel_num_threadsr	  is_module_buffer)rq  r  r  r  r  is_woq_int4r  r  r  r  r	  	int8_gemmr  r  r  rI  r  r   r  r  s                       rJ   r  r    s    9M) (0Ee0L::$$ U[[%**$==I]]ENNEJJLM")"+&,,'#Aq!VT4 A$$!@AQROL!"			NN$^^%!(*!J'
 	% 	Cd"	C%	C tR]]+	C ""$A,A(Arg   c                 b    t         j                  xs t         j                   xs t        d      S )NATEN)r[   r  r  r{  r   rg   rJ   use_aten_gemm_kernelsr)  W  s-    7v77 '	v	&'rg   c                  T    e Zd ZU  ej                  d      Zded<   ddZddZd	dZ	y)
DebugDirManagerr   r   prev_debug_namec                @    t        t        j                        | _        y rw   )rc  r+  counterr   r  s    rJ   r  zDebugDirManager.__init__a  s    ../rg   c                    t         j                  j                  j                  | _        | j                   d| j
                   | _        | j                  t         j                  j                  _        y )N_tmp_)rC   _dynamor[   debug_dir_rootr,  r   new_namer  s    rJ   	__enter__zDebugDirManager.__enter__d  sM    $}}33BB//0dggY?.2mm+rg   c                    t        j                  | j                         | j                  t        j
                  j                  _        y rw   )r  r  r3  r,  rC   r1  r[   r2  )r  rp   s     rJ   __exit__zDebugDirManager.__exit__i  s*    dmm$.2.B.B+rg   Nr$  )rp   r   r~   r  )
r   r   r   r  r  r.  r   r  r4  r6  r   rg   rJ   r+  r+  ]  s(    iooa G0<
Crg   r+  c                    ddl m} g dfd}t        j                  j	                  |d|      5  t
        j                  j                           | |i |}d d d        |fS # 1 sw Y   fS xY w)Nr+   r/   c                (    j                  |        y rw   rt  codesource_codess    rJ   save_output_codez*run_and_get_code.<locals>.save_output_codew      D!rg   r=  r;  r   r~   r  rL  r0   r   r  r.  rC   r1  reset)r   rp   r  r0   r=  r  r<  s         @rJ   run_and_get_coderB  n  su    
 % L" 
		=*<>N	O %T$V$% <% <s   'A$$A0c                    t        | g|i |\  }}g }|D ]6  }|j                  t        j                  d|t        j                               8 ||fS )Nz	'''.*?''')rB  r  refindallDOTALL)r   rp   r  r  r<  kernelsr;  s          rJ   run_and_get_kernelsrH    sZ     ,B@@@FLG Brzz,bii@AB7?rg   c                &     d fd}t        |      S )Nc                 R            } | j                         j                          | S rw   )r   backward)r  r   s    rJ   run_with_backwardz1run_fw_bw_and_get_code.<locals>.run_with_backward  s!    

rg   )r~   r   )rB  )r   rL  s   ` rJ   run_fw_bw_and_get_coderM    s    
 -..rg   c                X   ddl m} g dfdd	fd}t        j                  j	                  |d|      5  t        j                  j	                  |d      5  t
        j                  j                           | |i |}ddd       ddd       S # 1 sw Y   xY w# 1 sw Y   S xY w)
zLGet the inductor-generated code, but skip any actual compilation or running.r+   r/   c                (    j                  |        y rw   r9  r:  s    rJ   r=  z"get_code.<locals>.save_output_code  r>  rg   c                     G d d      }| j                   r| j                         n| j                         \  }} |j                         |r |j                          |       S )Nc                       e Zd ZdZddZddZy)@get_code.<locals>.patched_compile_to_module.<locals>.DummyModulez4This is empty to replace the generated triton modulec                     y rw   r   r  s    rJ   r  zIget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.__init__  s    rg   c                     y rw   r   r  s      rJ   callzEget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.call  s    rg   Nr$  rp   r   r  r   r~   r  )r   r   r   r   r  rU  r   rg   rJ   DummyModulerR    s    Frg   rW  )r  codegen_with_cpp_wrappercodegenr{   )r  rW  wrapper_codekernel_coder=  s       rJ   patched_compile_to_modulez+get_code.<locals>.patched_compile_to_module  s]    	 	 04/?/?D))+T\\^ 	"k
 	++,[../}rg   compile_to_moduler=  Nr?  )r  r0   r~   r   r@  )r   rp   r  r0   r\  r   r=  r<  s         @@rJ   get_coder^    s    $ L". 	

.0I	
  	

-);=MN	  	          s#   "B'BBB	BB)c                |    t        | g|i |}dt        |      cxk  rdk  sn J dt        |              |d   S Nr+   r  z%expected one or two code outputs got r   )r^  rE   )r   rp   r  r<  s       rJ   get_triton_codera    sQ    B000LL!&Q& 
/L0A/BC& ?rg   c                    t        | g|i |\  }}dt        |      cxk  rdk  sn J dt        |              |d   S r`  )rB  rE   )r   rp   r  r   r<  s        rJ   run_and_get_triton_coderc    sW     'r;D;F;OA|L!&Q& 
/L0A/BC& ?rg   c                    ddl m ddlm} |j                  g dfd}t
        j                  j                  |d|      5   | |i |}d d d        |fS # 1 sw Y   fS xY w)Nr   r/   r8   c                 ^     | i | | d   }t        |      sJ j                  |       y )Nr  )ri   rt  )rp   r  rL  r0   graph_lowerings	real_inits      rJ   	fake_initz-run_and_get_graph_lowering.<locals>.fake_init  s7    4"6"Q%///u%rg   r  rV  )torch._inductor.graphr0   torch._inductor.output_coder9   r  r   r  r.  )	r   rp   r  r9   rh  r  r0   rf  rg  s	         @@@rJ   run_and_get_graph_loweringrk    sq     4;((IO& 
		?J		B %T$V$% ?""% ?""s   	AA(c              #     K   ddl m} |j                  |    }	 t        j                  ||      |j                  | <   d ||j                  | <   y# ||j                  | <   w xY ww)z
    Override the lowering of aten_op with override_fn.
    The first argument of override_fn is the original lowering fn.
    r   )loweringN)torch._inductorrm  	loweringsr   partial)aten_opoverride_fnrm  orig_fns       rJ   override_loweringrt    s`      )  )G.&/&7&7W&M7#&-7#g7#s   A$'A  A$A!!A$c                     ddl m} |j                  d fd}t        j                  j
                  j                  |d|      S )zr
    Add hook functions to be called at the beginning and end of Scheduler.__init__.
    Used for unit tests.
    r   )	Schedulerc                B     | |        | |      }r	 | |       |S rw   r   )r  r  outrs  post_fnpre_fns      rJ   r  z(add_scheduler_init_hook.<locals>.wrapper  s+    y% i'Iu%
rg   r  )r  r   r  r   r~   r   )torch._inductor.schedulerrv  r  unittestr   r  r.  )rz  ry  rv  r  rs  s   ``  @rJ   add_scheduler_init_hookr}    s9     4  G ==%%iWEErg   c                z    t         j                  rt        j                  |        yt        j	                  |        y)z
    Warnings that will be actionable for PyTorch developers, but not
    end users.  Allows us to easily disable them in stable releases but
    keep them on for nightly builds.
    N)r[   developer_warningsr   r  info)msgs    rJ   developer_warningr    s$       Crg   c                    	 t         j                  j                  d      } | dz   t        t         j                        k  rTt        t         j                  | dz            dkD  r2t         j                  | dz      d   dk7  rt         j                  | dz      S t         j                  D ]#  }|j                  d      s|t        d      d c S  y# t        $ r Y Bw xY w)a  
    An experimental API used only when config.benchmark_kernel is true.

    The benchmark name is only available at codegen time. So we can not
    directly call it in benchmark_all_kernels which is run after codegen.

    The function assumes the argument after --only is the benchmark name.
    It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
    scripts, this function may return None.

    There are 2 flavors of --only argument we need handle:
    1. --only model_name
    2. --only=model_name
    z--onlyr+   r   r  z--only=N)rv  argvr   rE   
ValueErrorrx  )r  rs  s     rJ   get_benchmark_namer    s    	hhnnX&!Gc#((m#CHHS1W%&*q!!$+88C!G$$ xx )>>)$s9~'(()   s   BC 	CCc                &    t        d | D              S )Nc              3  &   K   | ]	  }|d k(    ywr+   Nr   rO  s     rJ   r   zis_ones.<locals>.<genexpr>=       %!qAv%   rm   r  s    rJ   is_onesr  <      %u%%%rg   c                &    t        d | D              S )Nc              3  &   K   | ]	  }|d k(    yw)r   Nr   rO  s     rJ   r   zis_zeros.<locals>.<genexpr>A  r  r  r  r  s    rJ   is_zerosr  @  r  rg   c                &    t        d | D              S )Nc              3     K   | ]@  }t        |t        j                        r$|j                  t        j                  d       k(   B yw)r   N)ri   rC   ry  r   )r   r   s     rJ   r   z is_cpu_device.<locals>.<genexpr>E  s8      dELL) 	u||E**s   AAr  )inputss    rJ   is_cpu_devicer  D  s       rg   c                    t        | t        j                        sJ d       | j                  rt        j
                  S t        j                  S )Nz8only support sympy.Expr as input to get_sympy_Expr_dtype)ri   rj   r  r   rC   r2  r*  )r  s    rJ   get_sympy_Expr_dtyper  L  s=    c5::& B& ~~{{}}rg   c              /     K   | r-t        j                  j                  |i |5 }| d d d        y d  y # 1 sw Y   y xY wwrw   )rC   r   r   )should_profilerp   r  r   s       rJ   maybe_profiler  V  sE     ^^##T4V4 	G	 	 		 	s   "A7AA Ac                 l    t         j                  j                  } | dk  rt        j                         } | S Nr+   )r[   r  threadsrC   get_num_threads)r  s    rJ   r#  r#  _  s+    jj  G{'')Nrg   c                     ddl m}   |        }|j                  dt        j                  j
                  rd      S d      S )Nr+   )get_backend_options
num_stagesr     )runtime.triton_helpersr  r_  rC   rV  rW  )r  optionss     rJ   get_backend_num_stagesr  f  s2    ;!#G;;|%--*;*;QCCCCrg   c                   ddl m}m} | t        j                  t        j
                  t        j                  fv sJ t        j                  |      j                  j                  d      rddlm}  |       }| t        j                  t        j
                  fv r	 || |      S t        j                  j                  j                  j                   r |t        j                  |      S  |t        j                  |      S | t        j                  t        j
                  fv r ||       S t        j                  j                  j                  j                   r |t        j                        S  |t        j                        S )Nr   )get_max_simd_tflopsget_max_tensorcore_tflops
clock_rate)max_clock_rate)triton.testingr  r  rC   r   r&  r(  inspect	signature
parametersr_  torch._utils_internalr  backendsr<   matmul
allow_tf32)r   r  r  r  sm_clocks        rJ   get_device_tflopsr  n  s   MU]]ENNEMMBBBB,-88<<\J8!#U]]ENN33,UH==>>%%00,U]]HEE&u}}h??U]]ENN33,U33>>%%00,U]];;&u}}55rg   c                     ddl m}   |        S )Nr   get_dram_gbps)r  r  r  s    rJ   get_gpu_dram_gbpsr    s    ,?rg   c                 x    ddl m}  | j                  j                  j	                  d      j                  dd      S )Nr   r  max_shared_mem)triton.runtimer  r  r  r^  r_  r  s    rJ   get_gpu_shared_memoryr    s.    %==44Q7;;<LaPPrg   c                $    | j                  d      S )Nwelford)rx  reduction_types    rJ   is_welford_reductionr    s    $$Y//rg   c                (    t        |       ry| dk(  ryy)Nr  online_softmax_reducer  r+   )r  r  s    rJ   reduction_num_outputsr    s    N+	2	2rg   c                 0    t        j                         dk(  S )NLinux)platformsystemr   rg   rJ   is_linuxr    s    ??''rg   c                 (    t         j                  dk(  S )Nr]   )rv  r  r   rg   rJ   r  r    s    <<7""rg   c                &    t        d | D              S )Nc              3  n   K   | ]-  }t        |t        j                        xr |j                    / y wrw   )ri   rj   r  r  rO  s     rJ   r   z#has_free_symbols.<locals>.<genexpr>  s)     Jz!UZZ(<_<Js   35rT  )itrs    rJ   r   r     s    JcJJJrg   c            	     x   ddl m} | D ]  }t        ||j                  |j                  |j
                  |j                  |j                  f      r=t        |j                         xs d      st        |j                         xs d      s yt        ||j                        st        dt        |              y)Nr+   r  r   Tzunexpected type for is_dynamic F)r  r  ri   r  r	  r!  ComputedBufferr1   r   maybe_get_sizemaybe_get_strider4   	TypeErrorr   )rp   r  ts      rJ   
is_dynamicr    s     IbmmR[[":K:KRYYW
   0 0 2 8b9=M""$*> Aryy)=d1gYGHHI rg   c                      e Zd ZdZdZy)PlaceholderKERNEL_NAMEDESCRIPTIVE_NAMEN)r   r   r   r  r  r   rg   rJ   r  r    s      K *rg   r  c                   ddl m} t        j                  ddd      5 }t	        j
                         }t	        j
                         } t        |t        |            j                  |  t        d|j                   |	       t        |j                  |	       t        j                         }t        ||      5   | |j                         d d d        t        j                         |z
  }	 ||j                         |j                  j                          |j                          t        d
|j                   |	       t        |j                  |	       |j!                         |j!                         k(  }
t"        j%                  d||j&                  |
|	       d d d        y # 1 sw Y   xY w# 1 sw Y   y xY w)Nr+   )stable_topological_sortwzutf-8F)modeencodingr  )r  	fake_modezBefore:
)filezAfter:
zZ%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s)pattern_matcherr  r  NamedTemporaryFileior
   rQ   rM   	propagater  rL  r	   nowrP   lint	recompiler  r   r  r   )r"  r  inpr  r  r  	before_ioafter_io
start_timetime_elapsedr  s              rJ   pass_execution_and_saver    sX    9		$	$
 
 
KKM	;;=C	R#3C#89CCSI	"(($1-bhhY'\\^
#B, 	N	||~
2)


#!,bhhX& H$5$5$77hFF	
-
 
	 	
 
s%   BF4<F(CF4(F1	-F44F=c                ~    ddl m} t        | |j                        xr  t        | j                  |j
                        S )zB
    Check if input buffer is a multi-outputs template buffer
    r+   r  )r  r  ri   CppTemplateBufferrq  MultiOutputLayout	input_bufr  s     rJ   is_multi_outputs_templater    s9     i!5!56 :"..< rg   c                    ddl m} t        | |j                        xr2 t	        | j
                        dk(  xr t        | j
                  d         S )zL
    Check if input buffer is a output of multi-outputs template buffer
    r+   r  r   )r  r  ri   MultiOutputrE   r  r  r  s     rJ   #is_output_of_multi_outputs_templater    sL      	9bnn- 	;	  !Q&	;%i&6&6q&9:rg   c                   | yddl m} t        |       |j                  k(  xr |d u xs | j                  |u xsB t        |       |j
                  k(  xr' t        t        j                  j                  d      xr; | j                  t        j                  j                  j                  j                  k(  xs t        t        j                  j                  d      xr; | j                  t        j                  j                  j                  j                  k(  xsa t        t        j                  j                  d      xr; | j                  t        j                  j                  j                  j                  k(  S )NFr+   r  all_to_all_singleall_gather_into_tensorreduce_scatter_tensor)r  r  r   _CollectiveKernelop_overloadFallbackKernelr   rC   r   torchrecr  defaultr  r  r  r[  r  s      rJ   is_collectiver  	  s1    | 	T
b***Ud
0Td>N>NRT>T 	T
b''' 	

 		**,?@ U$$		(:(:(L(L(T(TT
 		**,DE E$$99%%<<DDE 		**,CD Y$$		(:(:(P(P(X(XX+rg   c                >    ddl m} t        |       |j                  k(  S Nr+   r  )r  r  r   _WaitKernelr  r  s     rJ   is_waitr  0	  s    :''rg   c                    ddl m} t        | |      rt        d | j                  D              S t        | j                        S )Nr   GroupedSchedulerNodec              3  2   K   | ]  }t        |        y wrw   )contains_collectiverO  s     rJ   r   z&contains_collective.<locals>.<genexpr>:	  s     @a&q)@rP  )r{  r
  ri   rW  snodesr  r  snoder
  s     rJ   r  r  6	  s4    >%-.@5<<@@@$$rg   c                    ddl m} t        | |      rt        d | j                  D              S t        | j                        S )Nr   r	  c              3  2   K   | ]  }t        |        y wrw   )contains_waitrO  s     rJ   r   z contains_wait.<locals>.<genexpr>C	  s     :=#:rP  )r{  r
  ri   rW  r  r  r  r  s     rJ   r  r  ?	  s4    >%-.:U\\:::uzz""rg   c                    ddl m} t        |t        j                  j
                        r|g}t        | |j                        xr | j                  |v S r  )r  r  ri   rC   rh  ri  r  r  r  s      rJ   is_fallback_opr  H	  sE     "ejj++,TdB--.I43C3Cr3IIrg   c                B    |||    j                   j                            S rw   )defining_opr  )buf_namename_to_bufname_to_fused_nodes      rJ   buf_name_to_fused_snoder  S	  s#     k(3??HHJKKrg   c                     yr^  r   r  s    rJ   r_  r_  ^	  r`  rg   c                     ||       ry |j                  |        | j                  D ].  }t        |j                  ||      }||v rt	        |||||       0 y )Ncriteria_cb)r  unmet_dependenciesr  r   find_recursive_deps_of_node)r  collected_node_setr  r  r  depdefining_op_for_deps          rJ   r!  r!  Y	  sn     55!'' 
5HHk#5
 "44##	

rg   c                     yr^  r   r  s    rJ   r_  r_  w	  r`  rg   c           	     z    ||       ry |j                  |        | j                         D ]  }|j                  D ]}  }|j                  J |j                  j	                         dk(  r/|j                  j	                         |vrL||j                  j	                            }||v rnt        |||||         y )NOUTPUTr  )r  get_outputsrk  r  r  find_recursive_users_of_node)r  r"  r  r  r  or  user_ops           rJ   r)  r)  r	  s     55!  GG 	D99(((yy!!#x/yy!!#+==(););)=>G,,(""'	rg   c                b    t         j                  j                  j                  rdnd}|| z
  |z
  S )zaComputes the number of inputs to the aot fw graph which have fixed addresses (params and buffers)r  r   )rC   
_functorchr[   functionalize_rng_ops)dynamo_gm_num_inputsaot_fw_gm_num_inputsnum_rng_seed_offset_inputss      rJ   num_fw_fixed_argumentsr2  	  s6     $$::   "669SSSrg   c                    dd}d}g }| j                   j                  D ]0  }|j                  dk(  s ||      r|j                  |       |dz  }2 |t	        t        t        |                  k(  sJ t        |      S )z>
    Infers which inputs are static for a backwards graph
    c                ~    d| j                   vxr. d| j                   vxr d| j                   vxr d| j                   vS )Ntangentsbwd_seedbwd_base_offsetbwd_rng_stater>  r  s    rJ   is_saved_tensorz'count_tangents.<locals>.is_saved_tensor	  sH    aff$ .!&&(.!/.  qvv-		
rg   r   ru  r+   )rG   r*   r~   r  )rL  r  r[  rt  r>  r   rE   )fx_gr9  	arg_countstatic_arg_idxsr  s        rJ   count_tangentsr=  	  s    

 IOZZ 44= q!&&y1NI	 d5_)=#>????rg   c                  2    e Zd ZU ded<   ddZedd       Zy)	BoxedBoolr  r{   c                    | j                   S rw   )r{   r  s    rJ   r  zBoxedBool.__bool__	  s    zzrg   c                6    t        | t              r	d| _        | S yr^  )ri   r?  r{   r  s    rJ   disablezBoxedBool.disable	  s    c9%CIJrg   Nr%  )r  r   r~   zUnion[BoxedBool, bool])r   r   r   r   r  r  rB  r   rg   rJ   r?  r?  	  s     K  rg   r?  c              #      K   ddl m} |j                  	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 d fd}t        j                  j                  |d|      5  d  d d d        y # 1 sw Y   y xY ww)Nr+   r-   c                @    j                  |        | |||||      S rw   r9  )r  kernel_namer[  r  gpucpp_definitionkernel_listorig_define_kernels         rJ   define_kernelz.collect_defined_kernels.<locals>.define_kernel	  s-     	;'!+{Hc>
 	
rg   rJ  )NTN)r  r.   rE  r   r[  r   r  Optional[str]rF  r  rG  rK  r~   r   )codegen.wrapperr.   rJ  r   r  r.  )rH  r.   rJ  rI  s   `  @rJ   collect_defined_kernelsrM  	  s     5-;; #'(,
"

 
  	

 
 &
 

 
		/-	P   s   AA*A	A*A'#A*c                    | dz   S )N__original__r   r>  s    rJ    get_cloned_parameter_buffer_namerP  	  s    .  rg   c                    | t         v S rw   )rA   r  s    rJ   rp  rp  	  s    Yrg   c                &    | dk7  xr t        |       S )Nr=   )rp  r  s    rJ   device_need_guardrS  	  s    U?-vf~-rg   c                d   t        j                         rc| t        j                  k(  rPt        j                  j                         r2t        j                  j                         dk\  rt         j                  ry| t        t        j                  t        j                  t        j                  g      v S )N)rO  r   F)r[   r  rC   r&  r<   rD   get_device_capabilitybfloat16_atomic_adds_enabledr   r2  r  r  s    rJ   ,needs_fallback_due_to_atomic_add_limitationsrW  	  sp    
 	U^^#JJ##%JJ,,.&8//
EKKU^^#LMMMrg   c                   | j                   t        j                  j                  j                  t        j                  j                  j
                  fv r|y| j                   t        j                  j                  j                  k(  rdnd}|d |fvxs |xr t        |      xr t        |      xs | j                   t        j                  j                  j                  k(  xrW |dk(  xrP |xrL |dk(  xrE t        j                  j                  xr) t        j                  j                  xs t               dk7  xs? ||k(  xr" |t        j                  t        j                  fv xs t        j                          S )NFr  r   r   r+   )overloadpacketrC   r   atenscatter_reduce_scatter_reducescatter_rp  rW  r[   r  fallback_scatter_reduce_sumdynamic_threadsr#  r  r2  r[  )r  r  
self_dtype	src_dtypesrc_device_typesrc_is_tensor	reduce_tys          rJ   use_scatter_fallbackre  	  sZ    	""IINN**EIINN,I,IJ	K" ++uyy~~/F/FFE 
 	tY// 	8 H'H<YG		8 &&%))..*H*HH L%'LL  5(L 

66	L
 ++J/C/E/J	8 i'SJ5::u{{:S,S	8 557!rg   c                   ddl m}m} ddlm} t        dt        |        d       t        |       D ]  \  }}t        d|dd       ||u rt        d	       '||u rt        d
       7t        ||      r|j                         }t        |rdnd d       |r:|j                  J t        d|j                  j                  j                          t        d       |j                  j                  D ]  }t        |        t        d       |j                  j                  D ]  }t        |        t!        dt#        |              y)z
    An API that can be used in pdb to dump a node_schedule.
    Right mainly dump the read/write dependencies but can add more as needed.
    r   DisableReductionEnableReduction)SchedulerNodezNode schedule with z nodesr  3r  zenable reductionzdisable reductionredpwz scheduler nodeNzoriginal reduction hint zReadDep:z	WriteDep:zUnrecognized node type: )torch._inductor.codegen.simdrh  ri  r{  rj  r  rE   r   ri   is_reductionr  r  reduction_hintread_writesreadswritesr   r   )r  rh  ri  rj  r  r  is_redr#  s           rJ   dump_node_scheduleru  
  s=   
 O7	M 236
:;}- H	T#al?"$%%%%&m,&&(FfU$/?@yy,,,01N1N0OPQ*''-- c
+''.. c
 !9$t*FGG'Hrg   c                z    ddl m}  || j                         t        | j                        z  t
        z  dk(        S )Nr   )r  )rZ  r  storage_offsetr  r   GPU_ALIGN_BYTES)r   r  s     rJ   tensor_is_alignedry  ;
  s:     L 				 >&,,#?	??RVWW rg   c                |    t        | j                  j                        syt        j                  xs t        |       S r^  )rp  r   r   r[   assume_aligned_inputsry  )example_inputs    rJ   should_assume_input_alignedr}  I
  s2     -&&++,''K+<]+KKrg   c                     t         j                  j                  j                         } | st	        j
                         S | j                  j                  }|st	        j
                         S |j                         S rw   )	rC   _guardsTracingContexttry_getr  nullcontextr  rN  suppress_guards)tracing_contextrN  s     rJ   #maybe_get_suppress_shape_guards_ctxr  R
  sb    
 mm22::<O%%''  ))33I%%''$$&&rg   c                   t         j                  j                  j                  t        dd      5  t
        j                  j                          dd l}dd l	} |j                         } |j                  |      }ddlm} |j                  |       |j                  }|j!                  |j"                          | |i |}	|j%                         }
|j!                  |       |j'                  |       d d d        |	|
fS # 1 sw Y   	
fS xY w)Nr   Tr   )output_code_log)r|  r   r  r.  r[   rC   r1  rA  r  loggingr
   StreamHandlertorch._inductor.codecacher  
addHandlerlevelsetLevelDEBUGr  removeHandler)r   rp   r  r  r  log_capture_stringchr  
prev_levelr  r   s              rJ   run_and_get_cpp_coder  c
  s     
			#	#FGT	: *(R[[]"W""#56=""2&$**
  /T$V$'')  ,%%b)*  19!*  19s   CC>>D
c                    t        |       }||j                  S | D ]4  }t        |t        j                        s|j
                  j                  c S  y rw   )rM   rN  ri   rC   r&   r  )r  r  inputs      rJ   shape_env_from_inputsr  |
  sT     (I """  (eU\\*::'''(
 rg   c                <     t              dk(  r S d fd}|S )Nr   c                z    t        |       \  }} |       }t        |      rt        j                  ||       |S rw   )copy_misaligned_inputsrE   rC   _foreach_copy_)
new_inputsold_tensorsnew_tensorsrx  inputs_to_checkr  mutated_input_idxss       rJ   r  z)align_inputs_from_check_idxs.<locals>.run
  sE    #9);$
 [ J {  k:
rg   )r  list[InputType]r~   r   )rE   )r  r  r  r  s   ``` rJ   align_inputs_from_check_idxsr  
  s#    
 ?q  Jrg   c                T   d| j                         v rd}n;t        d t        | j                         | j                               D              dz   }t	        j
                  | |fd      j                         }t	        j
                  || j                         | j                               S )Nr   c              3  2   K   | ]  \  }}|d z
  |z    ywr  r   )r   shaperQ  s      rJ   r   z)clone_preserve_strides.<locals>.<genexpr>
  s     Tf$TrP  r+   ru   )r  r   r   rQ  rC   
as_stridedclone)rG   needed_sizebuffers      rJ   clone_preserve_stridesr  
  s    AFFH} T#affh
:STTWXX 	 a+6<<>FFAFFHahhj99rg   c                2   g }g }|du}|D ]  }| |   }t        |t        j                        sJ dt        |              |j	                         t
        z  sMt        |      | |<   |s^||v sc|j                  |       |j                  | |           ||fS )z
    Clones misaligned tensors which we inferred were aligned. Returns a tuple of [old_tensors], [new_tensors] for every
    cloned tensor which is in `return_pair_idxs`.
    Nz Expected tensors only, but got: )ri   rC   ry  r   data_ptr	ALIGNMENTr  rt  )r  check_inputs_idxsreturn_pair_idxsr  r  ret_pair_definedr   _inps           rJ   r  r  
  s     ')K&(K (t3 
2!}$- 	
.tDzl;	
- ==?Y&248JqMA)9$9""4("":a=1
2 ##rg   c                    g }|D ]N  }| |   }t        |t        j                        s#|j                         t        z  dk(  s>|j                  |       P t        |      t        |      k7  r|S |S )z[
    We require all inputs to be aligned, so introduce a copy for any
    that aren't.
    r   )ri   rC   ry  r  r  rt  rE   )r  static_input_idxsaligned_static_input_idxsr  r  s        rJ   remove_unaligned_input_idxsr  
  st     !#  2seU\\*0@90LQR/R%,,S12 $%->)??((rg   c                x   ddl m} t        j                  t        j                        j
                  }|j                  j                  j                  }|j                  j                  j                  j                  }|j                  j                  j                  | |k        ry ||       xr  ||       |k  S )Nr+   rH  T)rK  rI  rC   iinfor0  r   rL  rM  r  rN  has_hintr  )r   rI  int_maxr  r  s        rJ   expr_fits_within_32bitr  
  s    kk%++&**G  **Iww))22H 	ww--a7l;A;29Q<722rg   c                   t         j                  j                  j                         }||j                  t        |j                        dk(  sJ t        |       |j                  J |j                  D ]  }||j                  j                  d        !dt         j                  j                  j                         x}r|j                  dfd|j                  j                  t        fd|D                      y y y )Nr   Fc                f    t        |       S rj                  |       S j                  |       S rw   )rx   deserialize_symexprevaluate_symexpr)r   fakify_first_callrN  s    rJ   map_exprz4set_tracing_context_output_strides.<locals>.map_expr  s7     ("1v((<<Q??$55a88rg   c              3  .   K   | ]  } |        y wrw   r   )r   r   r  s     rJ   r   z5set_tracing_context_output_strides.<locals>.<genexpr>  s     5!(1+5r  )r   r   r~   z,Union[float, int, SymInt, SymFloat, SymBool])
rC   r  r  r  output_stridesrE   r  rt  r  rm  )r  compiled_graphr  r  r  r  r  rN  s        @@@rJ   "set_tracing_context_output_stridesr  
  s     mm**224Gw55A7))*a///).9	,,888#22 	E}&&--d3$)!--66>>@@3@(+(=(=%9 &&--5u55		  Brg   c                    t         j                  t         j                  S t        j                         syt        j                  j                         ry	 ddlm}  | t        j                  j                  d      k\  S # t        $ r Y yw xY w)NFr   REMOTE_CACHE_VERSIONz.pytorch/remote_cache:fx_graph_memcache_version)
r[   fx_graph_remote_cacher  rC   _utils_internalis_fb_unit_testtorch._inductor.fb.remote_cacher  ModuleNotFoundErrorjustknobs_getval_intr  s    rJ    should_use_remote_fx_graph_cacher    s    ##/+++,,.H  5#8#8#M#M8$    s   A> >	B
	B
c                0    t        j                  dd|       S )Nz[^a-zA-Z0-9_]r   )rD  subr>  s    rJ   normalize_namer  #  s    66"C..rg   ztl.int1ztl.float8e4nvztl.float8e5ztl.float8e4b8ztl.float8e5b16ztl.uint8)ztl.boolztl.float8_e4m3fnztl.float8_e5m2ztl.float8_e4m3fnuzztl.float8_e5m2fnuzztl.float8_e8m0fnuztl.float4_e2m1fn_x2z^.*[.]c                l    t         j                  dt        |             }t        j	                  ||      S )z"Convert torch.dtype to triton typetl.)_triton_type_rer  r   _triton_type_mappingr_  )r   triton_type_names     rJ   triton_typer  9  s.    &**5#e*=##$46FGGrg   c                    t         j                  | |       }|j                  dd      }t        t        |      }t        |t        j                        sJ |S )Nr  r  )_torch_triton_mappingr_  rL  rB   rC   ri   r   )r   adjusted_type	type_namer  s       rJ   triton_type_to_torchr  ?  sL    )--eU;M%%eR0Iy)Ii---rg   c                   | j                    xr | j                         |j                         k(  xr | j                         |j                         k(  xr | j                  |j                  k(  xr{ | j                  |j                  k(  xr` | j                         j                         |j                         j                         k(  xr! | j                         |j                         k(  S rw   )	is_mkldnnr  rQ  r   r   untyped_storager  rw  r  r{   s     rJ   is_same_tensorr  G  s    NN 	<IIK5::<'	<KKMU\\^+	< JJ%++%	< KK5<<'		<
   "++-1F1F1H1Q1Q1SS	< !U%9%9%;;rg   c                v   | j                   xr | j                         |j                         k(  xr | j                  |j                  k(  xrn | j                  |j                  k(  xrS t        j
                  j                  j                  |       t        j
                  j                  j                  |      k(  S rw   )r  r  r   r   rC   r   mkldnnr  r  s     rJ   is_same_mkldnn_tensorr  S  s     	PIIK5::<'	PJJ%++%	P KK5<<'	P II%%d+uyy/?/?/H/H/OOrg   c                      y)N)r  isnanlogical_notlogical_andsignbitand_leltgegteqner  xorr   r   rg   rJ   boolean_opsr  ]  s    rg   c                  "    e Zd ZU ded<   ded<   y)OpDtypeRuler'   type_promotion_kindOptional[torch.dtype]override_return_dtypeNr  r   rg   rJ   r  r  q  s    8800rg   r  zdict[str, OpDtypeRule]op_dtype_propagation_rulesc                *    t        ||      t        | <   y rw   )r  r  )r   r  r  s      rJ   #register_op_dtype_propagation_rulesr  z  s    
 (32(t$rg   zOrderedSet[str]op_requires_libdevice_fp64c                .    t         j                  |        y rw   )r  r  r>  s    rJ   #register_op_requires_libdevice_fp64r    s    ""4(rg   c                     ddl m}  | j                  j                         j                  }|dk(  rt
        j                  S |dk(  ryt
        j                  S )Nr   rH  r   r=   )r  rI  rL  get_current_device_or_throwr   r[   cpu_backendcuda_backend)rI  
device_strs     rJ   get_current_backendr    sH    -446;;JU!!!	u	"""rg   c                    | t         j                  t         j                  fv r7t        j                  j
                  rt               dk(  rt         j                  S | S )z"Maybe upcast [b]float16 to float32r  )rC   r   r&  r[   r  codegen_upcast_to_fp32r  r(  r  s    rJ   upcast_compute_typer    s@     	%--00MM00!X-}}Lrg   KeyTypeValTypec                  Z    e Zd ZdZddZddZddZddZdddZddZ	dd	Z
dd
ZddZy)
ScopedDictz
    A dictionary-like object that allows for scoped updates. It maintains
    an original dictionary and a set of new items that can override
    the original items within the scope.  The original dictionary is
    unmodified.
    c                     || _         i | _        y rw   original_dict	new_items)r  r  s     rJ   r  zScopedDict.__init__  s    *13rg   c                Z    || j                   v r| j                   |   S | j                  |   S rw   r  r  r  s     rJ   r  zScopedDict.__getitem__  s.    $.. >>#&&!!#&&rg   c                "    || j                   |<   y rw   )r  )r  r?  r{   s      rJ   __setitem__zScopedDict.__setitem__  s    #srg   c                >    || j                   v xs || j                  v S rw   r  r  s     rJ   __contains__zScopedDict.__contains__  s!    dnn$At/A/A(AArg   Nc                t    || j                   v r| j                   |   S | j                  j                  ||      S rw   )r  r  r_  )r  r?  r   s      rJ   r_  zScopedDict.get  s6    $.. >>#&&!!%%c733rg   c                z    t        | j                        }| j                  D ]  }|| j                  vs|dz  } |S r  )rE   r  r  )r  r  rI  s      rJ   rE  zScopedDict.__len__  sC    ""# 	A***Q	 rg   c              #     K   | j                   E d {    | j                  D ]  }|| j                   vs|  y 7 )wrw   r  )r  rI  s     rJ   __iter__zScopedDict.__iter__  s@     %%%% 	A***	 	&s   ><!>>c                H    t        | j                  xs | j                        S rw   )r  r  r  r  s    rJ   r  zScopedDict.__bool__  s    D&&8$..99rg   c                    t         rw   r;  r  s     rJ   __delitem__zScopedDict.__delitem__  s    !!rg   )r  zMapping[KeyType, ValType])r?  r  r~   r  )r?  r  r{   r  r~   r  )r?  r.  r~   r  rw   )r?  r  r   Optional[ValType]r~   r  rF  )r~   zIterator[KeyType]r%  )r?  r  r~   r  )r   r   r   r   r  r  r  r  r_  rE  r  r  r  r   rg   rJ   r	  r	    s5    4'
$B4
:"rg   r	  )frozen_defaultc              (    dfd}| |S  ||       S )Nc                    t         j                  dk\  rt        j                  | d      S t        j                  |       S )N)r  rP  T)kw_onlyr   r   )rv  version_infodataclasses	dataclass)rz   r   s    rJ   wrapzir_dataclass.<locals>.wrap  s;    w&((d6JJ ((V<<rg   )rz   r^   r~   r^   r   )rz   r   r#  s    ` rJ   ir_dataclassr$    s    = {9rg   c                     t         j                  j                  j                         } | "| j                  r| j                  j
                  S y rw   )rC   r  r  r  fw_metadatabw_donated_idxs)r  s    rJ   get_donated_idxsr(    s=    mm22::<O"'B'B**:::rg   c                   ddl m}m} ddlm} ddlm} |rYt        | |      sJ |j                  j                  j                  |g       j                  fd| j                  D               y t        | t              sJ | D ]j  }|||fvs
|j                  |j                  j                  j                  |g       j                  fd|j                  j                  D               l y )Nr+   rg  )r3   rH  c              3  R   K   | ]  }|j                   vr|j                      y wrw   r>  r   r  curr_node_infos     rJ   r   z:set_kernel_post_grad_provenance_tracing.<locals>.<genexpr>  s)      
{{.0 KK
   $'c              3  R   K   | ]  }|j                   vr|j                      y wrw   r>  r+  s     rJ   r   z:set_kernel_post_grad_provenance_tracing.<locals>.<genexpr>	  s)      *"!;;n< *r-  )codegen.simd_kernel_featuresrh  ri  r  r3   rK  rI  ri   r   ._inductor_triton_kernel_to_post_grad_node_info
setdefaultr  r  r>  r  )	r  rE  	is_externrh  ri  r3   rI  r  r,  s	           @rJ   'set_kernel_post_grad_provenance_tracingr3    s    
 P#-999GGBBMMR 	
 	 
'//
 	
 -..." 
	E_.>??::)%&WW%[%[%f%f#R&N #)) *&+jj&8&8* 
	rg   c                       e Zd ZdZdZdZdZdZy)TritonAttrsDescriptorVersionr   r+   r  r  r  N)r   r   r   V0_NO_TRITONV1_COMPILERV2_BACKENDSV3_BACKENDS_TUPLEV4_DICTr   rg   rJ   r5  r5    s     LKK	  Grg   r5  c                 P   t         j                  j                  d      t        j                  S dd l} dd l} t        | j                  j                  d      rt        j                  S t        | j                  j                  d      rt        j                  S t        j                  S )Nr  r   AttrsDescriptor)	importlibutil	find_specr5  r6  triton.backends.compilertriton.compiler.compilerr   r  compilerr8  r7  r:  )r  s    rJ   #get_triton_attrs_descriptor_versionrC    s{    ~~)1+888##v''):; ,777	))+<	=+777 ,333rg   c                 8    t               t        j                  k(  S rw   )rC  r5  r:  r   rg   rJ   triton_version_uses_attrs_dictrE  4  s    .04P4X4XXXrg   c                   ddl m} t        | |j                        syt        | j                  t
        j                  j                        r;t
        j                  j                  j                  | j                  j                  v ryy)zq
    Returns True if the node is an op that is not cudagraphable.
    Usually only custom ops have this tag.
    r+   r  FT)r  r  ri   r  r  rC   rh  ri  r   rl  r^  rn  r  s     rJ   is_cudagraph_unsafe_oprG  8  s^    
 dB--. 	4##UZZ%:%:;HHLL))T-=-=-B-BBrg   c                    t         j                  j                  dd      } t        j                         rUddlm}  |       }|rFt         j                  j                  |dd      }| r!t         j                  j                  || g      n|} | S )NLD_LIBRARY_PATHr  r   )get_runtime_pathrz  lib)
r  r  r_  r[   r  libfb.py.parutilrJ  r  r  pathsep)r  rJ  runtime_pathlib_paths       rJ   get_ld_library_pathrP  K  sg    ::>>+R0D5')ww||L)UCH8<2::??Hd#34(DKrg   c                F    ddl m} t        | |      xr | j                  d uS )Nr   )SubgraphPythonWrapperCodegen)torch._inductor.codegen.wrapperrR  ri   partition_signatures)r  rR  s     rJ   #is_codegen_graph_partition_subgraphrU  X  s*    L 	789 	5((4rg   c                    ddl m} |j                  j                  j	                  | d      r6|j                  j                  j                  | d      rt        j                  S t        j                  S )Nr+   rH  l        i   )	rK  rI  rL  rM  r  statically_known_geqrC   r0  r2  )r  rI  s     rJ   dtype_from_sizerX  a  sP    ww++e
''


/
/h
?{{{{rg   )r   r>   c                h    | dk(  r(t         j                  j                  j                         S d| v ryy)z;
    Returns True if the device supports MKL-DNN BF16.
    r   r>   TF)rC   r   r  _is_mkldnn_bf16_supportedr   s    rJ   is_mkldnn_bf16_supportedr\  o  3     eyy99;;	+	rg   c                h    | dk(  r(t         j                  j                  j                         S d| v ryy)z;
    Returns True if the device supports MKL-DNN FP16.
    r   r>   TF)rC   r   r  _is_mkldnn_fp16_supportedr[  s    rJ   is_mkldnn_fp16_supportedr`  {  r]  rg   r#  )re   rx   r~   rx   )rs   r}   r~   r  )   d   )r   zCallable[[], Any]r   rx   r   rx   r~   r  r%  )r   z"Union[Optional[torch.device], str]r~   torch.device)r   zIterable[sympy.Expr]r~   r}   )r  Sequence[sympy.Expr]r  rd  r~   r}   )r   zIterable[_T]r~   zValuesView[_T])r  Union[int, sympy.Expr]r  re  r~   re  )r?  r  r~   r   )rE  z"Iterable[Union[int, torch.SymInt]]r~   zlist[sympy.Expr])r   re  r~   zUnion[int, torch.SymInt])rE  z Iterable[Union[int, sympy.Expr]]r~   zlist[Union[int, torch.SymInt]])r[  torch._ops.OpOverloadr~   r  )ro  r*   rf  z'Callable[[torch._ops.OpOverload], bool]r~   r  )rg  r   rp   rp  r  dict[str, Any]r~   z&tuple[GraphModule, list[torch.Tensor]])r<   )r   r   r~   r  )r+   r<   )
r  Callable[..., Any]r  Sequence[Any]r   rx   r   r   r~   r  )r   rP  rP  g      ?r<   )r  rh  r  ri  r   rx   r  rx   r  r  r   r   r~   r  )r  r   r  r   r~   r  )r  r   r  r   r~   r  )r  rx   r  rx   r~   rx   )rG   zUnion[int, Sequence[int]]r  rx   r~   Sequence[int])rG   ztuple[_T, ...]r~   zlist[_T])r   z!Callable[Concatenate[Any, P], RV]r~   zCachedMethod[P, RV])r  0Union[Sequence[BaseSchedulerNode], ExternKernel]r~   zOrderedSet[Node])r  Sequence[BaseSchedulerNode]r  z8Literal[True, 'torch', 'original_aten', 'inductor_node']r~   r   )r  rk  r  r.   r~   ztuple[str, str]rw   )r   zIterable[torch.fx.Node]r  zOptional[Callable[[Any], bool]]r~   zOrderedSet[torch.fx.Node])rp   zSequence[IRNode]r  zdict[str, IRNode]r~   zOrderedSet[IRNode]r$  )r   r}   r~   zValueRanges[Any])r/  r   r~   r  )r/  rX   r  rx   r~   rF  )r;  r  r~   r  )r   r   r~   rF  )r  r}   rH  zdict[sympy.Expr, Any]r~   r}   )r  r   r~   z,TypeGuard[Union[torch.SymInt, torch.Tensor]])rp   r   r~   r  )r  torch.fx.GraphModuler~   zOptional[torch.fx.Node])r  rm  r~   r*   )r  rm  r~   zOrderedSet[torch.device]r$  )r  r   r~   r   )NNT)r  zOptional[dict[str, Any]]r  rK  r  r  r~   r  )r  ri  r~   	list[int])rN  r)   r  z.Sequence[Union[int, torch.SymInt, sympy.Expr]]r~   rn  )r   torch.dtyper~   rx   r  r"  )rZ  zUnion[int, torch.device]r~   r  rF  )rl  rx   r   rc  rm  Optional[int]r~   r,   )rq  r5   rr  zlist[torch.dtype]r~   r  )rz  r   r~   r  )rq  r5   r  r  r  r  r~   r  )r  r4   r~   r  )
rq  r5   r  rx   r  rx   rI  rx   r~   r  )r  r   r~   r  )r  r  r  r  rI  r  r~   r  )r  r  r  r  rI  r  r~   rn  )r   r   r~   r   )r~   zQtuple[Optional[str], Callable[[], list[Any]], Callable[[], list[Any]], type[Any]])rq  r5   r~   r  )rq  r5   r  zUnion[ReinterpretView, Buffer]r  r4   r~   r  )FTFN)rq  r5   r  r4   r  r4   r  r  r  r  r%  r  r  rp  r~   r  )r   Callable[P, _T]rp   r  r  r  r~   ztuple[_T, list[str]])r   rh  r~   ztuple[Any, list[str]])r   rq  rp   r  r  r  r~   r   )r   rq  rp   r  r  r  r~   r   )r   rq  rp   r  r  r  r~   ztuple[Any, list[GraphLowering]])rq  rh  rr  rh  r~   r  )rz  rh  ry  zOptional[Callable[..., Any]]r~   r   )r  r   r~   r  )r~   rK  )r  ri  r~   r  )r  zSequence[torch.Tensor]r~   r  )r  r}   r~   ro  )r  r  rp   r   r  r   r~   zIterator[Any])r  r   r~   r  )r  r   r~   rx   )r  zIterable[Any]r~   r  )
r"  rh  r  r(   r  ri  r  r   r~   r  )r  z"Optional[Union[Buffer, Operation]]r~   r  )r  z Optional[Union[Node, Operation]]r[  z!Optional[torch._ops.OperatorBase]r~   r  )r  z"Optional[Union[IRNode, Operation]]r~   r  )r  r:   r~   r  )r  zOptional[Operation]r[  z?Union[torch._ops.OpOverload, Collection[torch._ops.OpOverload]]r~   r  )r  r   r  rg  r  rg  r~   r   )r  r:   r"  zMutableSet[BaseSchedulerNode]r  zdict[str, SchedulerBuffer]r  zdict[str, BaseSchedulerNode]r  zCallable[[Any], bool]r~   r  )r/  rx   r0  rx   r~   rx   )r:  rm  r~   rx   )rH  r   r~   r  )r   r   r~   r   )r   rK  r~   r  )r   r   r~   r  )r   ro  r~   r  )r  rf  r  rK  r`  ro  ra  ro  rb  r   rc  r  r~   r  )r  rl  r~   r  )r   rz  r~   r  )r|  rz  r~   r  )r~   r&  )r   rq  rp   r  r  r  r~   ztuple[_T, str])r  Sequence[InputType]r~   zOptional[ShapeEnv])r  Callable[[list[InputType]], _T]r  rj  r  zOrderedSet[int]r~   rs  )rG   rz  r~   rz  )r  r  r  rj  r  zOptional[OrderedSet[int]]r~   z-tuple[list[torch.Tensor], list[torch.Tensor]])r  rr  r  rj  r~   rj  )r   r}   r~   r  )r  ri  r  r9   r~   r  )r   ro  r~   r   )r   r   r~   ro  )r  rz  r{   rz  r~   r  )r~   ztuple[str, ...])r   r   r  r'   r  r  r~   r  )r   r   r~   r  )r   ro  r~   ro  )rz   zOptional[type[Any]]r   r  r~   r   )r~   zOptional[list[int]]r'  )r  z3Union[Sequence[BaseSchedulerNode], ExternKernelOut]rE  r   r2  r  r~   r  )r~   r5  )r  r6   r~   r  )r  r.   r~   r  )r  rx   r~   ro  )r   r   r~   r  (S  
__future__r   r  r  r!  enumr   r=  r  r  r  r  r  r   r  r  rD  r  r   rv  r  r  r  r|  collections.abcr   r   r   r   r   r	   r
   typingr   r   r   r   r   r   r   r   r   r   r   typing_extensionsr   r   r   r   r   r   r   rj   rC   torch._inductor.runtime.hintsr   torch.utils._ordered_setr   torch.utils._pytreer   OPTIMUS_EXCLUDE_POST_GRADr!   r"   r#   r$   r%   r&   torch._prims_commonr'   torch.fxr(   rZ  r)   torch.fx.noder*   rh  r,   rL  r.   rL  r0   r  r1   r2   r3   r4   r5   r6   r7   output_coder9   r  r:   r;   rA   r?   r   rK   torch._dynamo.device_interfacerL   torch._dynamo.utilsrM   torch.autogradrN   torch.autograd.profiler_utilrO   (torch.fx.passes.graph_transform_observerrP   torch.fx.passes.shape_proprQ   torch.utils._sympy.functionsrR   rS   rT   rU   rV   torch.utils._sympy.symbolrW   rX   torch.utils._sympy.value_rangesrY   rZ   r  r[   runtime.runtime_utilsr\   r  _IS_WINDOWS	getLoggerr   r   r^   r  r  	VarRangesry  rx   	InputTypeGPU_KERNEL_BIN_EXTSrx  r  r  rj  rd   rf   ro   Functionrq   r"  r   r   r   r   r   r  r	  r  rB  rF  rP  rR  r\  rd  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r#  r,  r0  r7  r<  r?  rJ  rN  rU  ra  rg  rs  r  r  r  r   r  r  r  r  clear_on_fresh_inductor_cacheclear_inductor_cachesfresh_inductor_cacher  r  	lru_cacher  r  r  r  r)  r7  r  rH  r\  r_  rc  rn  rs  r{  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r)  r+  rB  rH  rM  r^  ra  rc  rk  rt  r}  r  r  r  r  r  r  r  r#  r  r  r  r  r  r  r  r  r   r  Enumr  r  r  r  r  r  r  r  r  r  r!  r)  r2  r=  r?  rM  rP  rp  rS  rW  re  ru  ry  r}  r  r  r  r  r  r  r  r  r  r  r  r  r  r  compiler  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r	  r$  r(  r3  r5  rC  rE  rG  rP  rU  rX  SUPPORTED_MKLDNN_DEVICESr\  r`  )rI  rs   s   00rJ   <module>r     s   "        	     	  	   
     U U          : / - (  >>//C$>",5$   -= #	CL
   D 0 % 2 K 0  8 D  = llg%g!T]UZZ'(	U5<<ell:;<	'7 	 {Q'A-+2B XDX XB5
LENN  d#  $"DP 9<SS#&S25S
Sl  ;@
+*"*+A**#AL+	+++"/	)/#/G @OI	I<I 
I0 *8+0' 	!  	
 ( %'!  	
    )'#$  cNTT"E8WQU^ E:C*!).!)O!) 	!)H62C62!62 62v 48*0 (E
E$5EE&,^%	DU	>2-888v'& 
: !# I "	 .24+4	4 4 	4 4p !5 $ " A!!L!!H Q7 7*  , , ,
O Od
 
 @ @ @?' ?  8 J JI #'   	(+<	  -2%)BF	0&R@J    * CO,) ,  3, 3,l @ @ R R:+\H&

8
@F
	
" ""&"&==
= = 	=
  = =  = 
=@'C C"      	 $ &2:/)X &2:## &#2:#$#* ...@.. .$ IMFF)EFF*	B&&   D D 6 66  Q0(#K(*$)) *!

!
"-!
4A!
HK!
	!
H1	" -1!
*!)! 
!H(%#J
JGJ 
JLL .LDRLL *=

5
 ,
 5	

 '
 

< *=5 , 5	
 ' 
:T 2     ,!.N $&$!$ $ 	$
 $ $ 
$NH>L'" &2:2(*" ( %	0	: 37$$$$ 0$ 3	$<$ $3!3B	:&/ '#)* $%
  +?*D*D*FG$!QAG  "**Y'H	  & 1 1 1
 68 2 7
8 1 
	 /9l O :)	# )

)
-" 01 -"` D)t  * !F!! ! 
	!H499  4 42Y&
 * 		Q
 Hs   /g	