
    rhÓ                       d dl mZ d dlZd dlZd dlZd dlZd dlmZ d dlm	Z	m
Z
mZ d dlZd dlmZ d dlZd dlmZ d dlmZ d dlmZmZ d d	lmZ d
dlmZmZmZ d
dlmZm Z m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+m,Z, erd dlm-Z- d
dl.m/Z/m0Z0 d
dl1m2Z2m3Z3 ddl"m4Z4  ejj                  e6      Z7ejp                  dejr                  dejt                  dejv                  dejx                  dejz                  dej|                  dej~                  dej                  di	ZAd'dZB G d de      ZC G d  d!e'      ZDeDj                  d"       eDj                           G d# d$e+      ZG G d% d&e,      ZHy)(    )annotationsN)Path)AnyOptionalTYPE_CHECKING)
PRECEDENCE)_embed_headers)
OrderedSet)
CppPrinterExprPrinter)ValueRanges   )ceildivget_bounds_index_exprget_kernel_metadata)ops
OpsWrapperV   )CSEVariableDeferredLineDTYPE_TO_COMPUTATION_DTYPEIndentedBufferOpOverridesPythonPrinter)IterationRangesEntry
SIMDKernelSIMDScheduling)Union)ReductionType	StoreMode)	SchedulerSchedulerNode)OpVarTboolcharshortintlongucharfloathalfbfloatc                    t        | t              r:| t        j                  k(  ry| t        j                   k(  ry| | k7  ryt	        |       S t        | t
              r| rdS dS t	        |       S )N	HUGE_VALFz
-HUGE_VALFNANtruefalse)
isinstancer+   torchinfstrr%   )vals    n/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/torch/_inductor/codegen/mps.pyvalue_to_metalr9   8   s_    #u%))UYYJCZ3x	C	v)')s8O    c                  |    e Zd ZdZddZddZddZddZddZddZ	ddZ
dd	Zdd
ZddZddZeZddZddZy)MetalExprPrinterz/Converts sympy expression to Metal code snippetc                    |j                   \  }}| j                  |      }| j                  |      }|j                  r	d| d| dS d| d| dS )Nc10::metal::floor_divide(, )metal::floor() / (argsdoprint
is_integer)selfexprxdivs       r8   _print_FloorDivz MetalExprPrinter._print_FloorDivI   s[    3LLOll3??.qcC5::qcse1--r:   c                    |j                   \  }}}| j                  |      }|dk7  r0| j                  |      }|j                  r
d| d| d}n	d| d| d}| j                  |      }d| d| dS )Nr   (rB   r@   rA   z) % (rC   )rG   rH   rI   rJ   mods        r8   _print_ModularIndexingz'MetalExprPrinter._print_ModularIndexingQ   s    ii3LLO!8,,s#Cs%uA&#A3eC52ll31#U3%q!!r:   c                    t        |j                        dk7  rt        d      t        | j                  |j                        \  }}d| d| d| d}d| d| d| d}d| d| dS )	Nr   z$metal::min only supported for 2 argsstatic_cast<decltype(+)>(r@   zmetal::min(r?   lenrD   RuntimeErrormap_printrG   rH   ab
typecast_a
typecast_bs         r8   
_print_MinzMetalExprPrinter._print_Min]       tyy>QEFF4;;		*1,QCq3qc;
,QCq3qc;
ZL:,a88r:   c                    t        |j                        dk7  rt        d      t        | j                  |j                        \  }}d| d| d| d}d| d| d| d}d| d| dS )	Nr   z$metal::max only supported for 2 argsrQ   rR   rS   r@   zmetal::max(r?   rT   rY   s         r8   
_print_MaxzMetalExprPrinter._print_Maxe   r_   r:   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr   metal::abs(r   r@   rU   rD   rX   rG   rH   s     r8   
_print_AbszMetalExprPrinter._print_Absm   s9    499~"""T[[167q99r:   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr   zstatic_cast<long>(metal::rint(r   ))rd   re   s     r8   _print_RoundToIntz"MetalExprPrinter._print_RoundToIntq   s9    499~"""/DIIaL0I/J"MMr:   c                    t        |j                        dk(  sJ |j                  \  }}|j                  r|dk  sJ t        d| d      | j	                  |t
        d         }d| d| d|  d	S )
Nr   r   zOFor integer inputs, only non-negative ndigits are currently supported, but got .Mulz!static_cast<float>(metal::rint(1e * z) * 1er@   )rU   rD   rF   
ValueErrorparenthesizer   )rG   rH   numberndigits
number_strs        r8   _print_RoundDecimalz$MetalExprPrinter._print_RoundDecimalu   s    499~"""))Q;;abiajjkl  &&vz%/@A
27)3zl&RYQYPZZ[\\r:   c                n    |j                   \  }}d| j                  |       d| j                  |       dS )Nstatic_cast<float>(z) / static_cast<float>(r@   )rD   rX   )rG   rH   lhsrhss       r8   _print_IntTrueDivz"MetalExprPrinter._print_IntTrueDiv   s;    99S$T[[%5$66MdkkZ]N^M__`aar:   c                    t        |j                        dk(  sJ t        | j                  |j                        \  }}d| d| dS )Nr   zmetal::pow(static_cast<float>(z), static_cast<float>(rh   )rU   rD   rW   rE   )rG   rH   rI   ys       r8   _print_PowByNaturalz$MetalExprPrinter._print_PowByNatural   sF    499~"""4<<+1/s2H2NNr:   c                ~    t        |j                        dk(  sJ | j                  |j                  d         }d| dS )Nr   r   ru   r@   rU   rD   rE   rG   rH   rI   s      r8   _print_ToFloatzMetalExprPrinter._print_ToFloat   s=    499~"""LL1&$QCq))r:   c                ~    t        |j                        dk(  sJ | j                  |j                  d         }d| dS )Nr   r   z1static_cast<int>(metal::floor(static_cast<float>(z)))r}   r~   s      r8   _print_FloorToIntz"MetalExprPrinter._print_FloorToInt   s=    499~"""LL1&B1#SIIr:   c                ~    t        |j                        dk(  sJ | j                  |j                  d         }d| dS )Nr   r   zstatic_cast<int>(metal::trunc(rh   r}   r~   s      r8   _print_TruncToIntz"MetalExprPrinter._print_TruncToInt   s=    499~"""LL1&/s"55r:   c                ~    t        |j                        dk(  sJ | j                  |j                  d         }d| dS )Nr   r   zmetal::log2(r@   r}   r~   s      r8   _print_OpaqueUnaryFn_log2z*MetalExprPrinter._print_OpaqueUnaryFn_log2   s=    499~"""LL1&aS""r:   N)rH   
sympy.Exprreturnr6   )__name__
__module____qualname____doc__rK   rO   r^   ra   rf   ri   rs   rx   r{   r   r   _print_floorr   r    r:   r8   r<   r<   F   sR    9.
"99:N
]b
O
*
J
 %L6
#r:   r<   c                  b   e Zd ZdZe	 	 d0	 	 	 	 	 	 	 	 	 d1d       Ze	 	 	 	 	 	 	 	 d2d       Zed3d       Zed4d       Zed5d       Z	ed6d       Z
ed7d	       Zed8d
       Zed8d       Zed8d       Zed8d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed:d       Zed9d       Zed9d       Zed9d       Z ed9d       Z!ed9d        Z"ed8d!       Z#ed9d"       Z$ed9d#       Z%ed8d$       Z&ed9d%       Z'ed8d&       Z(ed9d'       Z)ed;d(       Z*ed;d)       Z+e	 	 	 	 	 	 	 	 	 	 d<d*       Z,ed9d+       Z-ed8d,       Z.d=d-Z/d>d.Z0e1d?d/       Z2y)@MetalOverrideszXImplements Metal-specific overrides for ops. Base class emits Python-friendly overrides.Nc                ~    |t         j                  k(  rt        j                  d       d|  dS dt        |    d|  dS )Nz>float64 cast requested, probably from tensorify_python_scalarsru   r@   static_cast<>()r4   doublelogwarningDTYPE_TO_METAL)rI   dtype	src_dtypeuse_compute_typess       r8   to_dtypezMetalOverrides.to_dtype   sK     ELL KKP )1--nU34Bqc;;r:   c                6    dt         |    dt         |    d|  dS )Nzas_type<z>(static_cast<r   rh   r   )rI   r   r   s      r8   to_dtype_bitcastzMetalOverrides.to_dtype_bitcast   s/     ./0~i?X>YY[\][^^`aar:   c                    t        |       S Nr9   )r7   r   s     r8   constantzMetalOverrides.constant   s    c""r:   c                @   t         j                  j                  t         j                  j                  |             }t         j                  j                  j                  t         j                  j                  |t        |             }t        j                  ||      S )N)bounds)
r   kernelindex_to_strprepare_indexingcsegeneratecomputer   r   r   )rH   r   idx_strvars       r8   
index_exprzMetalOverrides.index_expr   sl    ((''(A(A$(GHhhll##HHg.CD.I $ 
 ||C''r:   c                    t         j                  j                  | |      5 } |       }d d d        j                  j                  rt        |      }t        j                  ||      S # 1 sw Y   AxY wr   )r   r   
mask_loadsr   is_boolr%   r   where)maskbodyothernew_maskresults        r8   maskedzMetalOverrides.masked   sa     XX  u- 	VF	 ==  KEyy6511	 	s   A))A2c                (    |  d| dt        |       S )Nz ? z : r   )rZ   r[   cs      r8   r   zMetalOverrides.where   s    Cs#nQ/011r:   c                    d|  d| dS )Nzc10::metal::remainder(r?   r@   r   rZ   r[   s     r8   	remainderzMetalOverrides.remainder   s    's"QCq11r:   c                D    d|  d| d|  d}d|  d| d| d}d| d| dS )NrQ   rR   rS   r@   zc10::metal::max(r?   r   rZ   r[   r\   r]   s       r8   maximumzMetalOverrides.maximum   K    ,QCq3qc;
,QCq3qc;
!*R
|1==r:   c                D    d|  d| d|  d}d|  d| d| d}d| d| dS )NrQ   rR   rS   r@   zc10::metal::min(r?   r   r   s       r8   minimumzMetalOverrides.minimum   r   r:   c                    |  d| S )Nz || r   r   s     r8   
logical_orzMetalOverrides.logical_or       D}r:   c                    |  d| S )Nz && r   r   s     r8   logical_andzMetalOverrides.logical_and   r   r:   c                    d|  dS )Nzmetal::isnan(r@   r   rI   s    r8   isnanzMetalOverrides.isnan       qc##r:   c                    d|  dS )Nzmetal::isinf(r@   r   r   s    r8   isinfzMetalOverrides.isinf   r   r:   c                    d|  dS )Nzmetal::log(r@   r   r   s    r8   r   zMetalOverrides.log       QCq!!r:   c                    d|  dS )Nzmetal::exp(r@   r   r   s    r8   expzMetalOverrides.exp   r   r:   c                    d|  dS )Nrc   r@   r   r   s    r8   abszMetalOverrides.abs   r   r:   c                    d|  dS )Nzmetal::signbit(r@   r   r   s    r8   signbitzMetalOverrides.signbit   s     1%%r:   c                    d|  dS )Nzmetal::precise::sin(r@   r   r   s    r8   sinzMetalOverrides.sin      %aS**r:   c                    d|  dS )Nzc10::metal::sinc(r@   r   r   s    r8   sinczMetalOverrides.sinc  s    "1#Q''r:   c                    d|  dS )Nzmetal::precise::cos(r@   r   r   s    r8   coszMetalOverrides.cos  r   r:   c                    d|  dS )Nzmetal::tan(r@   r   r   s    r8   tanzMetalOverrides.tan  r   r:   c                    d|  dS )Nzmetal::asin(r@   r   r   s    r8   asinzMetalOverrides.asin      aS""r:   c                    d|  dS )Nzmetal::acos(r@   r   r   s    r8   acoszMetalOverrides.acos  r   r:   c                    d|  dS )Nzmetal::atan(r@   r   r   s    r8   atanzMetalOverrides.atan  r   r:   c                    d|  d| dS )Nz::metal::atan2(r?   r@   r   )rI   rz   s     r8   atan2zMetalOverrides.atan2   s     2aS**r:   c                    d|  dS )Nzmetal::sqrt(r@   r   r   s    r8   sqrtzMetalOverrides.sqrt$  r   r:   c                    d|  d|  dS )NrQ   z)>(-r@   r   r   s    r8   negzMetalOverrides.neg(  s     'qcaS22r:   c                    d|  dS )Nzmetal::rsqrt(r@   r   r   s    r8   rsqrtzMetalOverrides.rsqrt.  r   r:   c                    d|  dS )Nzmetal::tanh(r@   r   r   s    r8   tanhzMetalOverrides.tanh2  r   r:   c                    d|  dS )Nzmetal::atanh(r@   r   r   s    r8   atanhzMetalOverrides.atanh6  r   r:   c                    d|  d| dS )Nr>   r?   r@   r   r   s     r8   floordivzMetalOverrides.floordiv:  s     +1#Rs!44r:   c                    d|  dS )NrA   r@   r   r   s    r8   floorzMetalOverrides.floor?  r   r:   c                    d|  dS )Nzmetal::sign(r@   r   r   s    r8   signzMetalOverrides.signC  r   r:   c                D    d|  d| d|  d}d|  d| d| d}d| d| dS )NrQ   rR   rS   r@   zmetal::fmod(r?   r   r   s       r8   fmodzMetalOverrides.fmodG  sK    ,QCq3qc;
,QCq3qc;
j\J<q99r:   c                    d|  dS )Nmetal::trunc(r@   r   r   s    r8   trunczMetalOverrides.truncM  r   r:   c                    |  d| }| j                   | j                   j                  s"|j                   |j                   j                  rd| dS |S )Nz / r   r@   )r   is_floating_point)rZ   r[   quots      r8   truncdivzMetalOverrides.truncdivQ  sQ    Cs|GGAGG$=$=GGAGG$=$="4&**r:   c                    d|  dS )Nzmetal::ceil(r@   r   r   s    r8   ceilzMetalOverrides.ceilZ  r   r:   c                f    t         j                  j                  j                  d       d|  d| dS )Nrandomzc10::metal::rand(r?   r@   r   r   headersaddseedoffsets     r8   randzMetalOverrides.rand^  s/    	X&"4&6(!44r:   c                f    t         j                  j                  j                  d       d|  d| dS )Nr  zc10::metal::randn(r?   r@   r  r  s     r8   randnzMetalOverrides.randnc  s/    	X&#D6F8155r:   c           	     r    t         j                  j                  j                  d       d|  d| d| d| d	S )Nr  zc10::metal::randint64(r?   r@   r  )r  r	  lowhighs       r8   	randint64zMetalOverrides.randint64h  s=     	
X&'vRxr#baHHr:   c                    d|  dS )Nzmetal::round(r@   r   r   s    r8   roundzMetalOverrides.roundo  r   r:   c                D    d|  d| d|  d}d|  d| d| d}d| d| dS )NrQ   rR   rS   r@   zmetal::pow(r?   r   )rZ   r[   cast_acast_bs       r8   powzMetalOverrides.pows  sK    (1QCs1#Q7(1QCs1#Q7VHBvha00r:   c                f    t         j                  j                  j                  d       d| d| dS )Nspecial_mathc10::metal::rM   r@   r  )rG   rZ   names      r8   _special_unaryzMetalOverrides._special_unaryy  s/    	^,dV1QCq))r:   c                l    t         j                  j                  j                  d       d| d| d| dS )Nr  r  rM   r?   r@   r  )rG   rZ   r[   r  s       r8   _special_binaryzMetalOverrides._special_binary}  s5    	^,dV1QCr!A..r:   c           
        dD ].  }t        | |t        j                  | j                  |             0 t        j                  | j                  d      | _        dD ]1  }t        | |t        j                  | j                  |dz                3 dD ].  }t        | |t        j                  | j
                  |             0 dD ]1  }t        | |t        j                  | j
                  |dz                3 y )N)erferfinvi0i0ei1i1edigammaspherical_bessel_j0)r  	log_gamma)
	bessel_j0	bessel_j1	bessel_y0	bessel_y1modified_bessel_i0modified_bessel_i1modified_bessel_k0modified_bessel_k1scaled_modified_bessel_k0scaled_modified_bessel_k1_forward)	polygammazeta)chebyshev_polynomial_tchebyshev_polynomial_uchebyshev_polynomial_vchebyshev_polynomial_whermite_polynomial_hhermite_polynomial_he)setattr	functoolspartialmethodr  lgammar  )clsr  s     r8   _initialize_special_opsz&MetalOverrides._initialize_special_ops  s    	
 
	WD Cy66s7I7IPTUV
	W ,,S-?-?kR

 	D ''(:(:
ARS	&
 	XD Cy66s7J7JQUVW		X
 	D ''(;(;$BST	r:   )NT)
rI   r   r   torch.dtyper   zOptional[torch.dtype]r   r%   r   r6   )rI   r   r   rA  r   rA  r   r6   )r7   zUnion[bool, float, int]r   rA  r   r6   )rH   r   r   rA  r   r6   )r   r   r   r   r   r   r   r6   )rZ   r$   r[   r$   r   r$   r   r6   )rZ   r$   r[   r$   r   r6   )rZ   r   r[   r   r   r6   )rI   r   r   r6   )rI   r   rz   r   r   r6   )r  r   r	  r   r   r6   )
r  r   r	  r   r  r   r  r   r   r6   )rZ   r   r  r6   r   r6   )rZ   r   r[   r   r  r6   r   r6   r   None)3r   r   r   r   staticmethodr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r
  r  r  r  r  r  r  classmethodr@  r   r:   r8   r   r      s   b ,0"&	<<< )<  	<
 
< < bb*b7Bb	b b
 # # ( ( 2 2 2 2 2 2 > >
 > >
     $ $ $ $ " " " " " " & & + + ( ( + + " " # # # # # # + + # # 3 3
 $ $ # # $ $ 5 5 $ $ # # : :
 $ $   # # 5 5 6 6 II#.I5@IHSI	I I $ $ 1 1
*/ 7 7r:   r   mpsc                      e Zd ZU dZeZdZdZdZdZ	 e
       j                  Z e       j                  Z e       j                  ZeZ edg      Zded<   g Zd	ed
<   	 	 	 	 	 	 d fdZddZddZ	 d	 	 	 	 	 	 	 	 	 ddZddZddd ej8                         f	 	 	 	 	 	 	 	 	 	 	 d dZ	 	 	 	 	 	 	 	 	 	 d!dZ	 	 	 	 	 	 	 	 	 	 d!dZd"dZ d#dZ!dd$dZ"dd%dZ#	 	 	 	 	 	 	 	 	 	 d&dZ$ xZ%S )'MetalKernelz;Implement Metal codegen based on the SIMDKernel abstraction;auto i       utilszOrderedSet[str]r  zlist[IterationRangesEntry]multistage_reduction_entryc                X    t        |   |fi | t        j                         | _        y r   )super__init__	itertoolscountacc_var_ids)rG   tilingkwargs	__class__s      r8   rP  zMetalKernel.__init__  s&    
 	*6*$??,r:   c                    t         |   S r   r   )rG   r   s     r8   dtype_to_strzMetalKernel.dtype_to_str  s    e$$r:   c                   | j                   j                  |      }| j                  |      }t        j                  j                  |      }| d| j                  |       d}|t        j                  t        j                  fv rd| d}t        j                  }| j                  j                  | j                  ||      S )z"Codegen a load from an InputBuffer[]ru   r@   r   )rD   inputr   r   graph	get_dtyper   r4   float16bfloat16float32r   r   loads)rG   r  indexr   r   lines         r8   loadzMetalKernel.load  s    iiood#%%e,!!$'a))%013U]]ENN33 )a0DMMExx  T ??r:   Nc                h   | j                   j                  |      }| j                  |      }| j                  t        j
                  j                  |            }d| d| d}|| d| j                  |       d| d}n[|dk(  rH| j                  j                  d       d	| d
}	d|	 d| d}
|	 d|
 d| j                  |       d| d}nt        d|       | j                  r&| j                  j                  t        ||             y | j                  j                  t        ||             y )Nr   r   r@   rZ  ] = rI  
atomic_addatomiczc10::metal::AtomicType<>zreinterpret_cast<device z
::type *>(z::atomic_add(r?   );zUnimplemented store mode )rD   outputr   rX  r   r^  r_  r   r  r  rV   inside_reductionr   	writeliner   stores)rG   r  rd  valuemoder   	dtype_strcast_valre  atomic_typecast_vars              r8   storezMetalKernel.store  s<    iit$%%e,%%agg&7&7&=>	!)BugQ7<U!D--e45T(1ED\!LLX&3I;a@K1+jQOH!]-zD<M<Me<T;UUWX`WaacdD!:4&ABB  LL""<d#;<KK!!,tT":;r:   c                   | j                   j                  |      }| j                  |      }| j                  t        j
                  j                  |            }t        d | j                  D              }| d| j                  |       d| d| d}d|j                   d| }| j                  j                  t        ||             y )Nc              3  :   K   | ]  }|j                   s|  y wr   is_reduction.0ts     r8   	<genexpr>z.MetalKernel.store_reduction.<locals>.<genexpr>  s     K1ANNQK   rZ  z] = static_cast<r   rl  if (z == 0) )rD   rm  r   rX  r   r^  r_  nextrange_treesr   r  rp  ro  r   )rG   r  rd  rq  r   rs  reduction_dimre  s           r8   store_reductionzMetalKernel.store_reduction  s    iit$%%e,%%agg&7&7&=>	K(8(8KKa))%011A)BugUWXm(()7l467r:   Tc                z   t        |t        j                        r| j                  |      }dt	        | j
                         }t        j                  j                  |||      }|rdnd}|| d| z  }|r	|d| dz  }||rJ d       |d| z  }| j                  j                  || j                  z          |S )	Ntmp_acc_zthreadgroup   rZ  r[  z+Thread group var can not have default value = )r3   r4   r   rX  r  rS  r   r   create_cse_varindexing_codero  suffix)	rG   r   
elem_countdefault_valueis_threadgroupr   var_namer   var_defs	            r8   _new_idxvarzMetalKernel._new_idxvar  s     eU[[)%%e,Ed4#3#3456hh%%h>$2.eWAhZ((:,a((G$%T'TT%]O,,G$$Wt{{%:;
r:   c                    |||f}|| j                   j                  v r| j                   j                  |   S | j                  ||||      }|| j                   j                  |<   |S )z)Caching wrapper around _reduction_nocache)r   reduction_cache_reduction_nocache)rG   r   r   reduction_typerq  	cache_keyr   s          r8   	reductionzMetalKernel.reduction  sf     6	00088++I66((	>5Q.4  +r:   c                   | j                   sJ | j                  rJ d?d}d}d}| j                  D ]9  }|j                  s|r|dz  }||j                   d| z  }||j
                  z  }; t        || j                        }|dk(  r| j                  |      }	| j                  j                  |	 d       | j                  j                  d       | j                  j                  d	| d
|	 d       | j                  j                  d       |	S | j                  j                  d       |dv rt         |   }
| j                  |
t#        || j$                              }| j&                  s|}nD|dk(  rdnd\  }}| j                  |
|d      }| j                  j                  | d| d| d       | j(                  j+                  | j                  d| d| d| d| d| dt         |         S |dv r| j                  ||      }| d| d}t,        |   }| j&                  sV| j                  j                  | d| d| d        | j(                  j+                  | j                  d| d| d| d|      S |j/                  d!      rd"nd!}| j                  j                  | d#| d$| d%       |j1                  d&      rt3        d' | j4                  j7                         D              }| j                  t8        j:                  |      }|d(k(  rd)nd*}| d| d}| j                  j                  | d+       | j                  j                  d	| d| d| d
| d,| d-| d,|j                   d.       | j(                  j+                  | j                  | d/| d| d| d0|      S | j                  j                  | d1| d| d| d        | j(                  j+                  | j                  d| d| d| d|      S |d2k(  r+| j&                  s~| j                  ||      }| j                  j                  | d| d3| d       | j(                  j+                  | j                  d| d| d| dt8        j<                        } ||      S | j                  d4|      }| d| d}| j                  j                  | d5       | j                  j                  | d6| d7| d8       | j(                  j+                  | j                  d9| d| dt8        j<                        } ||      S |d:k(  r4t?        |t@              sJ d;       | j                  d4|      }| d| d}d<|d=    d|d    d|d>    d}| j                  j                  | d5       | j&                  rC| j                  j                  | d5       | j                  j                  | d6| d| d        n!| j                  j                  | d,| d       | j(                  j+                  | j&                  r| j                  n| j                  d| d| d| dt8        j<                        } ||      S tC        |      )@zeCodegen a reduction operation.
        Only sum and prod operations are somewhat reasonable optimizedc           
         t        j                  dD cg c](  }t        |  d| | j                  | j                        * c}      S c c}w )Nxyzrk   )r   _unwrapr   r   r   )res3r~  s     r8   _unwrap_helperz6MetalKernel._reduction_nocache.<locals>._unwrap_helper9  sA    %%NSTvQqc]DKKDT Ts   -Ar  r    + rm   anyz	 = false;z7threadgroup_barrier(metal::mem_flags::mem_threadgroup);z
                if (z) {
                    z' = true;
                }
            reduction_utils)prodsumr  )r   rR   )r   *F)r  r  r  z= rI  zc10::metal::threadgroup_rM   r?   r@   r\  )maxminargminargmaxrZ  r[  z = static_cast<r   rl  r  lowestz = ::metal::numeric_limits<z>::z();argc              3  :   K   | ]  }|j                   s|  y wr   rz  r|  s     r8   r  z1MetalKernel._reduction_nocache.<locals>.<genexpr>  s      Ar  r  rk  <z = -1;r  z;
                    z$;
                }
                z[c10::metal::threadgroup_z)]z = ::c10::metal::welford_reducerh  float3z = 0.0;z! = ::c10::metal::welford_combine(z	, float3(z, 0.0, 1.0));z(c10::metal::threadgroup_welford_combine(welford_combinez&Input to welford combine must be tuplezfloat3(r   r   )r  r   r   ztuple[CSEVariable, ...])"rn  
_load_maskr  r{  r  numelr  max_threadgroup_sizer  r  ro  r   splicerp  r  r  r   r   simd_group_sizerM  r   r   r   endswith
startswithr  range_tree_nodesvaluesr4   r)   rb  r3   tupleNotImplementedError)rG   r   r   r  rq  r  reduction_idxacc_buf_sizerdacc	acc_dtypeacc_bufr7   default_valreduction_opacc_thread_varsrc_metal_typelim_fnidx_varidx_acc_bufcmp_opidx_thread_varwf_res	inp_values                           r8   r  zMetalKernel._reduction_nocache-  s    $$$$??""	 "" 	%B??&yL>::MBHH$L	% <)B)BCU"""5)C((C5	):;((I LLG E  KK!!I J*+_,29=I&&7<1E1EFG 22 !/% 7HX *\ &&[ '  ##se1\N"UG1$EF88$$*>*:!G9Bse2m_\^_k^llmn07 %  
 ??&&y,?G 'y-:N+I6N22##%&on5ERwbQ xx((KK.~.>ay<.XYZ )  
 "0!8!8!?XUF((!""=n=MSQWPXX[\ ((/ #44;;=  #..uzz<H .( :$/=-!B""))^,<F*CD## )G1VHAn%5 6#$Cw /#$C~ 6%  xx((KK"m#<^<LAgYVXYeXffhi )  
 LL""!""3N3C1^DTTVW\V]]_` 88$$*>*:!G9B|nTUV %  
 --22**9lC##wiqtE7!$LM**LL.~.>ay<.XYZ-- + 
 &f--&&x>G 'y-:N%%(8&@ALL""!""CNCSS\]b\ccpq XX&&:7)2l^STUmm ' F
 "&))..eU+U-UU+&&x>G 'y-:N!%(2eAhZr%(1EI%%(8&@A..""))^,<G*DE&&%&&GGWWYZcYddfg &&.)9YKq'IJXX&&#>>DLL*>*:!G9B|nTUVmm ' F
 "&))!.11r:   c                   | j                  |j                        }| j                  |      }|j                  r#|j                  j
                  | j                  k  r9| j                  j                  | j                   d|j                   d| d       y | j                  j                  |       |j                  j
                  | j                  z   dz
  | j                  z  }| j                  j                  d|j                   d|j                   d| d|j                   d		       | j                  j                         5  | j                  j                  | j                   d|j                   d| d
| d|j                   d
       || j                  z  |j                  j
                  k7  r@| j                  j                  d|j                   d|j                  j
                   d       d d d        y # 1 sw Y   y xY w)Nr  r  rI  r   z	for(auto z
_cnt = 0; z_cnt < z; ++z_cnt) {rm   r  z_cnt;r  z >= z) break;)rename_indexingrH   sexprr{  rootr  r  r  ro  index_dtyper  rM  appendr   indent)rG   entryr   	index_str	loop_sizes        r8   codegen_iteration_ranges_entryz*MetalKernel.codegen_iteration_ranges_entry  s   ))%**5
JJz*	!!UZZ%5%59R9R%R((##$Aejj\YKqA ''..u5
 JJt8881<&&'	 			

|:ejj\4PUPZPZ|[cd	
 YY 	WII##$Aejj\YKs9+SQVQ[Q[P\\ab 4444

8H8HH		##d5::,d5::;K;K:LH$UV	W 	W 	Ws   B/G;;Hc                   | j                   r-| j                  j                         5  | j                  j                  | j                         | j                  j                  | j
                         ddd       | j                  j                  dt        | j                         z         | j                  j                  t        d | j                  j                  j                         D                     | j                   r| j                   j                         j                          | j                   r5nJ| j                  j                  | j                         | j                  j                  | j
                         | j                  j                  | j                         | j                  j!                          | j
                  j!                          | j                  j!                          y# 1 sw Y   xY w)a  
        Concat output code from index_code, loads, compute, stores,
        suffix into self.body.

        For pointwise kernels, this is called just once at the end.

        For reduction kernels, this generates a loop over the reduction
        axis.
        N}c              3  T   K   | ]   }t        |t              r|n|fD ]  }|  " y wr   )r3   r  )r}  itemvs      r8   r  z+MetalKernel.codegen_body.<locals>.<genexpr>  s:      &0u&=dD7  s   &()rM  r   r  r  rc  r   ro  rU   r   
invalidater
   r  r  popcache_clearrp  clear)rG   s    r8   codegen_bodyzMetalKernel.codegen_body  sc    **!!# /		  ,		  ./ IIc$*I*I&J JK
 HH  $ 8 8 ? ? A  11//335AAC 11 IITZZ(IIT\\*		%

1/ /s   AG99Hc                N
   | j                          t               }t        j                  j                  r|j                  d       n|j                  d       | j                         }|j                         5  t        j                  j                  s'| j                  D ]  }|j                  d| d        nr| j                  D cg c]  }d| d
 }}t        |t        t              j                  j                  j                  dz  gt                     }|j                  |       | j                  rQt        j                   d | j"                  D              }t%        || j&                        }|j                  d| d       |j                  d	       |j                         5  | j(                  j*                  j-                         D ]Z  \  }	}
|	| j.                  v r| j1                  t        j                  j3                  |	            }|j                  d
| d|
 d       \ | j(                  j4                  j-                         D ]  \  }	}
t        j                  j3                  |	      }|t6        j8                  k(  rBt        j                  j;                  |	      }||j=                         g k7  rt?        d      d}n| j1                  |      }|j                  d| d|
 d        | j(                  j@                  j-                         D ]  \  }	}
|j                  d|
 d        tC        |      dk  sJ d       tC        |      dkD  rdtC        |       nd}tC        |      dk(  r|d   jD                  nd}| j                  rdnd}|j                  | d| d|        | j                  r|j                  | d       ddd       |j                  d       |j                         5  tC        |      dkD  rAtG        |      D ]3  \  }}|j                  d|jD                   dtI        d|z          d        5 |jK                  | jL                         |jK                  | jN                         ddd       |j                  d!       ddd       t        j                  j                  r!|j                  d"       |jQ                         S |j                  d#       |jQ                         S c c}w # 1 sw Y   3xY w# 1 sw Y   xY w# 1 sw Y   xY w)$z3Called at the end to generate a final kernel stringz(R"MTL(zcompile_mps_shader('''z#include <c10/metal/z.h>includec              3  N   K   | ]  }|j                   s|j                    y wr   )r{  r  r|  s     r8   r  z-MetalKernel.codegen_kernel.<locals>.<genexpr>%  s      1 !AGG1s   %%z$[[max_total_threads_per_threadgroup(z)]]zkernel void generated_kernel(zdevice z* ,Nzfloat64 is not supported by MPSr+   z	constant zconstant long&    z%Up to 3 index variables are supportedr   uintr   
thread_posr  r  z [[thread_position_in_grid]]z- group_pos [[thread_position_in_threadgroup]]z) {rJ  z = thread_pos.x   rI  r  z)MTL");z''')))r  r   r   r^  cpp_wrapperro  active_range_treesr  r  r	   r   __file__parentr
   rn  mathr  r  r  r  rD   output_buffersitemsremoved_buffersrX  r_  input_buffersr4   float64try_get_bufferget_sizerV   sizevarsrU   r  	enumeratechrr  r  r   getvalue)rG   r  codeidx_varsheaderr  header_contentstotal_reduction_sizethreadgroup_sizeouterinnerrs  r   	outer_bufthread_pos_dtypethread_pos_var_namethread_pos_suffixidxr   s                      r8   codegen_kernelzMetalKernel.codegen_kernel
  s   77NN9%NN34**,[[] C	 77&&"ll GFNN%9&#EFG FJ\\;A*6(#6  #1(^**11889DEL#
 /$$'+yy 1%)%5%51 ($ $'';T=V=V#W :;K:LCP NN:; !$(II$<$<$B$B$D DLE5 4 44  $ 1 1!''2C2CE2J KINNWYKr%#BC	D
 %)II$;$;$A$A$C 
FLE5GG--e4E-$%GG$:$:5$A	$,	0B0B0D0J"./P"QQ$+	$($5$5e$<	NNYykE7!#DE
F %)II$6$6$<$<$> ?LE5NN_UG1#=>?8}q(Q*QQ(.1(ma.?d3x=/*V ! ),H(:HQK$$ $ ,0+@+@Cb!'(*=)>>Z[lZmn ((NN+,,YZ?!D NN5! 'x=1$$-h$7 S#CHH:^Cc	N;K1M D../DII&' NN3GC	 J 77NN9% }} NN6"}}K&! !F' 'wC	  C	 sR   8ATS=CT*H'T)T:BT T=TT	TT	TT$c           	        t         j                  j                  }| j                  j                  j                         D ]  }|j                  |        | j                  j                         \  }}}}t        ||      D 	ci c]  \  }}	t        |      |	 }
}}	g | j                  j                  j                         | j                  j                  j                         }|D cg c]  }|| j                  vs| }}|| j                  j                  j                         D cg c]  }t        |       c}z  }|D cg c]  }|
|   	 }}t         j                  j                  r| j                  n| j                  }dd}t!        | j#                               dkD  r| j#                         D cg c]J  } ||j$                  r*t'        j(                  |j*                  | j,                        n|j*                        L }}|j/                   ||d             |j/                  t0               n%t         j                  j                  rt3        d      | j4                  r| j#                         D cg c]@  }|j$                  r0 |t'        j(                  |j*                  | j,                              ndB }}|j/                   ||d             |j/                  t0               n1t         j                  j                  r|dgz  }|j/                  d       |j7                  ||t9        j:                  d      d	|
       yc c}	}w c c}w c c}w c c}w c c}w c c}w )zCodegen a call to this kernelthreadsc                    t         j                  j                  r(| D cg c]  }d| d
 } }ddj                  |        dS | ddj                  |        dS c c}w )Nzstatic_cast<uint64_t>(r@   {r?   r  z=[r[  )r   r^  r  join)r	  kwargr~  s      r8   format_threadsz/MetalKernel.call_kernel.<locals>.format_threadst  sh    ww""BIJQ3A3a8JJDIIg./r22499W#5"6a88 Ks   Ar   zWe should always have threads?1
group_sizeNcpuF)devicetriton	arg_types)r	  z	list[str]r  r6   r   r6   )r   r^  wrapper_coderD   r  keysensure_size_computedpython_argdefszipr6   r  r  r  r  cexprpexprrU   r  r{  sympyMinr  r  r  listrV   rn  generate_kernel_callr4   r  )rG   r  nodewrapperr  _	call_argsr  call_argarg_typearg_name_to_typerD   r  expr_printerr  r	  s                   r8   call_kernelzMetalKernel.call_kernela  s   ''&&##((* 	,A((+	, &*YY%=%=%?"9a>A)Y>W
(:(CM8#
 
 S))..0R4993J3J3O3O3QR#Gs$2F2F'FGG!3!3!8!8!:;AQ;;6:;s%c*;	;%&WW%8%8tzzdjj	9 t&&()A- 002  ~~ IIaggt'@'@AG  KKw	:;T"ww"""#CDD  
 002	  >> UYYqww0I0IJKG  KKw=>T"ww""   &$$<<& 	% 	
i

 H;; s,   M
*M>M+MM8AM:AM$c                    |s|sy | j                  |      }|r| dnd}|r| d| j                  |       nd}|r|r
d| d| d}nd| | d}| j                  j                  | j                  |d	
       y )Nz < 0r  z > zif ((z) && (z	)) returnr  z) returnF)
assignment)r   r   r   r   )	rG   rH   sizelowerupperexpr_str
lower_expr
upper_exprre  s	            r8   check_boundszMetalKernel.check_bounds  s      $$T**/z&R
BGzT%6%6t%<$=>R
U:,fZL	BD*j\:D$,,?r:   )rT  zdict[str, sympy.Expr]rU  r   r   rC  )r   rA  r   r6   )r  r6   rd  r   r   r   r   )
r  r6   rd  r   rq  r   rr  r!   r   rC  )r  r6   rd  r   rq  r   r   rC  )r   zUnion[str | torch.dtype]r  zOptional[int]r  zOptional[Any]r  r%   r   zValueRanges[Any]r   r   )
r   rA  r   rA  r  r    rq  +Union[CSEVariable, tuple[CSEVariable, ...]]r   r2  )r  r   r   rC  rB  )r  zOptional[str]r   r6   )r  r6   r   r   r   rC  )
rH   r   r+  r   r,  r%   r-  r%   r   rC  )&r   r   r   r   r   	overridesr  newvar_prefixr  r  r   rE   r  r   r  r<   r  kexprr
   r  __annotations__rM  rP  rX  rf  rw  r  r   unknownr  r  r  r  r  r  r(  r1  __classcell__rV  s   @r8   rH  rH    s   EIFMOO##EL  E&&EE)7)4G_4=? :?-%- - 
	-%@ SW<< *<3><FO<	<*8 %)'+##6;#6#6#8' " %	
  ! 
,  &	
 ; 
5 [2[2 [2 &	[2
 ;[2 
5[2zW6#JUnB
H@@&0@9=@FJ@	@r:   rH  c                  <     e Zd ZeZd fdZ	 	 	 	 	 	 	 	 ddZ xZS )MetalSchedulingc                    t         |   |       t        j                  j                  }|7t        j                  j
                  s|j                  j                  d       y y y )NzDfrom torch._inductor.runtime.runtime_utils import compile_mps_shader)rO  rP  r   r^  r  r  r  r  )rG   	schedulerr!  rV  s      r8   rP  zMetalScheduling.__init__  sQ    #''&&77&&%%Z ' r:   c                l   t         j                  j                  }||j                  v r|j                  |   }|S d|j	                          }t         j                  j
                  rd| |z   }| d}n| d}||j                  |<   t        ||      \  }}| d| }	|j                  |||	d       |S )Nmps_lib_z+at::native::mps::DynamicMetalShaderLibrary _funcz.generated_kernel
F)gpu)r   r^  r  src_to_kernelnext_kernel_suffixr  r   define_kernel)
rG   src_codenode_scheduler   r!  kernel_namemps_lib_nameoriginsdetailed_originsmetadata_comments
             r8   rE  zMetalScheduling.define_kernel  s     ''&&w,,,!//9K( ! &g&@&@&B%CDLww""A,P  ".e4!-.?@.9G!!(+(;M7(S%G%")"-=,>?!!,:JPU!Vr:   )r=  zOptional[Scheduler]r   rC  )rF  r6   rG  zlist[SchedulerNode]r   rH  r   r6   )r   r   r   rH  kernel_typerP  rE  r8  r9  s   @r8   r;  r;    s2    K,?IT	r:   r;  )r7   z)Union[float, int, bool, str, CSEVariable]r   r6   )I
__future__r   r<  rQ  loggingr  pathlibr   typingr   r   r   r  sympy.printing.precedencer   r4   torch.utils._cpp_embed_headersr	   torch.utils._ordered_setr
   torch.utils._sympy.printersr   r   ExprPrinter_torch.utils._sympy.value_rangesr   rL  r   r   r   virtualizedr   r   r   commonr   r   r   r   r   r   simdr   r   r   r   ops_handlerr    r!   r=  r"   r#   r$   	getLoggerr   r   r%   int8int16int32int64uint8r+   r,   ra  r   r9   r<   r   _initialize_pointwise_overridesr@  rH  r;  r   r:   r8   <module>rc     s,   #      / /  0  9 / O 7 G G , ,  C B 64g! 
JJ	JJ	KK	KK	KK	KK	KK	JJ	NNH
Y#| Y#xW[ Wt  . .u 5  & & (t@* t@n%n %r:   