
    rh                       d dl mZ d dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZmZ d dlZd dlmZ ddlmZ dd	lmZ dd
lmZ erd dlmZ d dlmZ ej6                   G d d             Zej6                   G d de             ZeZej6                   G d d             Zej6                   G d d             Z ej6                   G d de             Z!ej6                   G d de             Z"ej6                   G d de             Z#ej6                   G d de              Z$ G d de%      Z& G d d e&!      Z' G d" d#e'      Z( G d$ d%e'      Z) G d& d'e'      Z* G d( d)e'      Z+y)*    )annotationsN)partial)Lock)AnyCallableTYPE_CHECKING)
OrderedSet   )config)get_backend_num_stages)V)	GeneratorConfigc                  D    e Zd ZU dZded<   ded<   ded<   ded<   ded<   y)	
BaseConfigzD
    Base Gemm configuration used for most backends (CPU, CUDA)
    intblock_mblock_nblock_k
num_stages	num_warpsN__name__
__module____qualname____doc____annotations__     v/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/torch/_inductor/template_heuristics.pyr   r      s"     LLLONr    r   c                       e Zd ZU dZdZded<   y)
GemmConfigz?
    Gemm configuration used for most backends (CPU, CUDA)
       r   group_mN)r   r   r   r   r%   r   r   r    r!   r#   r#   &   s     GSr    r#   c                  :    e Zd ZU dZded<   ded<   ded<   ded<   y)
FlexConfigz
    Base Config class for flex attention
    - FlexAttn forward, backward and flex decode will use this

    NOTE:
    For flex_attn bwd block_m and block_n are reused for block_m1, block_m2, block_n1, block_n2

    r   r   r   r   r   Nr   r   r    r!   r'   r'   3   s     LLONr    r'   c                  0    e Zd ZU dZded<   ded<   ded<   y)FlexDecodeConfigz(
    Config class for flex decoding
    r   r   r   r   Nr   r   r    r!   r)   r)   D   s     LONr    r)   c                  <    e Zd ZU dZdZded<   dZded<   dZded<   y	)
ROCmGemmConfigzN
    ROCm subclass for GEMMs, with AMD backend specific tuneable kernargs
       r   matrix_instr_nonkdimr   waves_per_eu   kpackNr   r   r   r   r-   r   r.   r0   r   r    r!   r+   r+   P   '     !##"L#E3Nr    r+   c                  <    e Zd ZU dZdZded<   dZded<   dZded<   y	)
ROCmConvConfigzM
    ROCm subclass for Conv, with AMD backend specific tuneable kernargs
    r,   r   r-   r   r.   r/   r0   Nr1   r   r    r!   r4   r4   [   r2   r    r4   c                  <    e Zd ZU dZdZded<   dZded<   dZded<   y)	ROCmFlexConfigzQ
    ROCm subclass for FlexAttn, with AMD backend specific tuneable kernargs
    r   r   r-   r.   r/   r0   Nr1   r   r    r!   r6   r6   f   '     !"#!L#E3Nr    r6   c                  <    e Zd ZU dZdZded<   dZded<   dZded<   y)	ROCmFlexDecodeConfigzS
    ROCm subclass for FlexDecode, with AMD backend specific tuneable kernargs
    r   r   r-   r.   r/   r0   Nr1   r   r    r!   r9   r9   q   r7   r    r9   c                  \     e Zd ZU dZi Zded<    e       Zded<   	 	 	 	 	 	 	 	 d fdZ xZ	S )BaseHeuristicSingletonz
    Thread-safe implementation of single to be used in the config heuristic subclasses
    to ensure heavy __init__ calls are not repeatedly run
    zdict[type[Any], Any]
_instancesr   _lockc                    | j                   5  | | j                  vrt        |          }|| j                  | <   | j                  |    cd d d        S # 1 sw Y   y xY wN)r=   r<   super__call__)clsargskwargsinstance	__class__s       r!   rA   zBaseHeuristicSingleton.__call__   sT     YY 	'#..( 7+-&.s#>>#&		' 	' 	's   :AA)rB   r;   rC   r   rD   r   returnBaseConfigHeuristic)
r   r   r   r   r<   r   r   r=   rA   __classcell__rF   s   @r!   r;   r;   |   sK    
 (*J$)&E4'#',/';>'	' 'r    r;   c                  *   e Zd ZdZddZ	 	 	 	 ddZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZ	 	 	 	 	 	 ddZddd d	f	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd
Z	 	 	 	 	 	 	 	 ddZ	d dZ
d dZd dZd dZd dZd dZd dZ	 	 d dZd dZd dZd!dZd!dZ	 	 	 	 	 	 d"dZy)#rH   z\
    Base class for mm_configs, device specific triton kernels config inherit from here
    c                ,   t        ddddd      t        ddddd      t        ddddd	      t        ddddd	      t        ddddd      t        ddddd      t        ddddd      t        dddd
d	      t        ddddd      t        dddd
d      t        ddddd	      t        dddd
d      t        ddddd      t        dddd
d      t        ddddd	      t        ddddd	      t        dddd
d      t        dddd
d      t        ddddd	      g| _        t        j                  g dd
      D cg c]+  \  }}}dD ]   }dD ]  }dD ]  }t        ||||||        " - c}}}}}}| _        t        dddd
d      t        ddddd      t        ddddd      t        dddd
d      t        ddddd      t        dddd
d	      t        ddddd	      t        ddddd      t        dddd
d	      t        ddddd      g
| _        t        ddddd      t        dddd
d      t        dddd
d      t        ddddd	      t        ddddd	      t        ddddd	      t        ddddd	      t        ddddd	      t        dddd
d	      t        dddd
d	      t        dddd
d	      g| _        t        dddd
d      t        ddddd	      g| _        t        dddd
d	      t        dddd
d	      t        dddd
d	      t        dddd
d      t        ddddd	      t        ddddd	      t        ddddd	      t        ddddd      g| _        g t        dddd
d	      t        dddd
d	      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        dddd
d	      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        dddd
d      t        dddd
d      t        dddd
d      t        dddd
d      t        dddd
d      t        dddd
d      t        dddd
d      t        dddd
d      t        dddd
d      t        dddd
d      t        dddd
d      t        dddd
d      t        dddd
d      t        dddd
d      t        dddd
d      t        dddd
d      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      t        ddddd      | _	        t        dddd
d	      t        dddd
d	      t        ddddd	      t        ddddd      t        dddd
d      t        ddddd      t        ddddd	      t        ddddd	      t        ddddd	      g	| _
        t        ddddd      t        dddd
d	      t        ddddd      t        ddddd	      t        ddddd	      t        ddddd	      t        ddddd	      t        ddddd	      t        ddddd      t        ddddd      g
| _        t        ddddd      t        ddddd      t        ddddd	      t        ddddd	      t        ddddd      t        ddddd	      t        ddddd	      g| _        t        ddd
d      t        ddd
d      t        dddd	      t        ddd
d      t        ddd
d      g| _        dD 	
cg c]<  }dD ]5  }dD ].  }	|dk\  s|dk\  rdd	gndgD ]  }
||z  dk(  rt        |||	|
       0 7 > c}
}	}}| _        t#        dd
d      t#        dd
d      t#        dd
d      g| _        dD cg c]%  }dD ]  }dD ]  }dD ]  }t        ||||          ' c}}}}| _        dD cg c]-  }dD ]&  }dD ]  }dD ]  }||z  dk(  rt        ||||       ! ( / c}}}}| _        dD cg c]  }dD ]  }dD ]  }t#        |||         c}}}| _        y c c}}}}}}w c c}
}	}}w c c}}}}w c c}}}}w c c}}}w )N    r,   r
   r/         @      r$      r,   rM   rP   rN      repeat)r
   r/   rR   rO   rQ   r/   rO   r$   )r$   rT      i   )rM   rP   rM   rP   rN   )r
   rR   rO   rQ   r   r,   rM   rP   rN   )r#   
mm_configs	itertoolsproductexhaustive_configsextra_mm_configsint8_mm_configsmixed_mm_configspersistent_mm_configsscaled_mm_configsscaled_persistent_mm_configsmm_plus_mm_configs
ConvConfigconv_configsr'   flex_attn_fwd_autotune_configsflex_attn_bwd_autotune_configsr)   flex_decode_autotune_configs exhaustive_flex_attn_fwd_configs exhaustive_flex_attn_bwd_configsexhaustive_flex_decode_configs)selfBLOCK_MBLOCK_NBLOCK_Kr   r   r%   BLOCK1BLOCK2swr   s               r!   __init__zBaseConfigHeuristic.__init__   s   
 r2r1a(r2sAq)r2r1a(r2r1a(r2sAq)r2r1a(r2r1a(r2r1a(r2sAq)r3Aq)r3Aq)r3Aq)r3Q*sBAq)sBAq)sCQ*sCQ*sCQ*sCQ*'-
2 .7->->&q.5
 5
)' .5

 &5
 5
  w*iQ5
Q5
Q5
Q5
  r2r1a(r2r1a(r2r1a(r2sAq)sBAq)sBAq)sBQ*sCQ*sCQ*sCQ*3
 r2r1a(r3Aq)sBAq)r3Aq)sBAq)r2r1a(r2r1a(sCQ*r2r1a(sCa+sCa+2
 r3Q*r3Q*3
 sCQ*sCQ*sCa+sCa+sCQ*sCQ*sCQ*sCQ*	8
"b4
sCQ*b4
sCQ*b4
 sBAq)b4
 r3Aq)	b4

 sCQ*b4
 sBAq)b4
 r3Aq)b4
 sBAq)b4
 r2r1a(b4
 sCa+b4
 sBQ*b4
 r3Q*b4
 sCa+b4
 sBAq)b4
 r3Aq)b4
  sBAq)!b4
" r2r1a(#b4
$ r2r1a(%b4
& r2r1a('b4
( r3Aq))b4
* r3Aq)+b4
, r2r1a(-b4
. r2r1a(/b4
0 r3Aq)1b4
2 r3Aq)3b4
4 r2r1a(5b4
6 r2r1a(7b4
8 r3Aq)9b4
: r3Aq);b4
< r2r1a(=b4
> r2r1a(?b4
@ r3Aq)Ab4
B r3Aq)Cb4
D r2r1a(Eb4
F r2r1a(Gb4
H r3Aq)Ib4
J r3Aq)Kb4
L r2r1a(Mb4
N r2r1a(Ob4
P r3Aq)Qb4
R r3Aq)Sb4
T r2r1a(Ub4
V r2r1a(Wb4
X r3Aq)Yb4
Z r3Aq)[b4
\ r2r1a(]b4
^ r2r1a(_b4
` r3Aq)ab4
b r3Aq)cb4
d r2r1a(eb4
f r2r1a(gb4
h r3Aq)ib4
j r3Aq)kb4
l r2r1a(mb4
n r2r1a(ob4
p r3Aq)qb4
r r3Aq)sb4
t r2r1a(ub4
v r2r1a(wb4
x r3Aq)yb4
z r3Aq){b4
| r2r1a(}b4
~ r2r1a(b4
@ r3Aq)Ab4
B r3Aq)Cb4
D r2r1a(Eb4
F r2r1a(Gb4
H r3Aq)Ib4
J r3Aq)Kb4
L r2r1a(Mb4
N r2r1a(Ob4
P r3Aq)Qb4
R r3Aq)Sb4
T r2r1a(Ub4
V r2r1a(Wb4
X r3Aq)Yb4
Z r3Aq)[b4
\ r2r1a(]b4
^ r2r1a(_b4
` r3Aq)ab4
b r3Aq)cb4
d r2r1a(eb4
f r2r1a(gb4
h r3Aq)ib4
j r3Aq)kb4
l r2r1a(mb4
n r2r1a(ob4
p r3Aq)qb4
r r3Aq)sb4
t r2r1a(ub4
v r2r1a(wb4
x r3Aq)yb4
z r3Aq){b4
| r2r1a(}b4
~ r2r1a(b4
@ r3Aq)Ab4
B r3Aq)Cb4
J sCQ*sCa+sCa+sCa+sCa+sCa+sCa+sCa+sCQ*
?
) r2r1a(r2r1a(r2r1b)r2r1a(r2r1a(sCQ*r2r1a(r2sAq)r2r1a(r2r1a(5
 r3Aq)sBAq)tRQ*sCQ*r2r1a(r3Aq)sBAq)/
 sB1%sCA&sCA&r31%r2q!$A
+ #A
 A
'A
 !	A
  &#3q!fQCA

 !# vvq!,A
,A
,A
,A
+ RA&RA&S!Q'E
) -C
 C
(C
 *	C
 &C

 	 wY?C
?C
?C
?C
- ,C
 C
+C
 *	C
 &C

 !# vvz9=C
=C
=C
=C
- -G
 G
*G
 &	G
  Wj)<G
<G
<G
+W5
VA
C
C
G
s   20o-Ao7
*o?
=2p
 "pc              #  .  K   t               }t        j                  j                  }|D ]  }t	        |j
                  |j                  |j                  z  dz        }|j                  |j                  |j                  |j                  |f}t        |dd      }|||fz  }||vs||t        |      |k  s|j                  |       |j                  |j                  |j                  |j                  |d}|||d<    | j                  di |  yw)S
        Finalizes configs after scaling, applying additional constraints.
        rT   r%   N)ro   rp   rq   r   r   GROUP_Mr   )r	   r   test_configsmax_mm_configsminr   r   r   r   r   getattrlenaddtriton_config)	rn   configsusedr{   confr   keyr%   rD   s	            r!   _finalize_mm_configsz(BaseConfigHeuristic._finalize_mm_configs  s     -7L,,;; 	3DDNNDLL4<<,G3,NOI $C dIt4G"z!$&#d)n*D#||#||#||"&//!* &(/F9%(d((2622?	3s   B"D%D6ADc                   ddl m} d}	|rdnd}
t         |t        j                  j
                  j                  |t        j                              |	      }t         |t        j                  j
                  j                  |t        j                              |	      }t         |t        j                  j
                  j                  |t        j                              |
      }g }|D ]  }t        j                  |t        t        t        |j                  |z        |      |	      t        t        t        |j                  |z        |      |	      t        t        t        |j                  |z        |      |
            } ||j                  |j                  |j                        r|j!                  |        |S )zW
        Scales and filters matrix multiplication configs based on input size.
        r
   )next_power_of_2r,   rM   )fallback)r   r   r   )runtime.runtime_utilsr   maxr   graphsizevars	size_hintr   unbacked_symint_fallbackdataclassesreplacer|   r   r   r   r   append)rn   mnkr   scalehas_int8_tensorexcluder   min_block_sizemin_block_size_kscaled_configscscaled_configs                 r!   _scale_mm_configsz%BaseConfigHeuristic._scale_mm_configs  s    	;!02b  **#<< +  
   **#<< +  
   **#<< +  
  	5A'//CAII$5 6:NKCAII$5 6:NKCAII$5 6:<LM	M %%}'<'<m>S>S %%m4	5 r    c                   dd l }g }|D ]  }|j                  j                         }|j                  j                  |      }|j                  }d}	t        j                  |j                  |j                  z  |j                  dz  z        }
||j                  |j                  z  |j                  |j                  z  z   z  }||j                  z  |kD  r|
|	kD  r|j                  |        |S )Nr      rM   )torchcudacurrent_deviceget_device_propertiesshared_memory_per_block_optinmathceilr   r   r   r   r   r   )rn   r   
dtype_sizer   pruned_configsgemm_configdevicepropssm_availableNUM_REGacc_regsshared_mem_accums               r!   _prune_exhaustive_configsz-BaseConfigHeuristic._prune_exhaustive_configs  s    
 	" 	/KZZ..0FJJ44V<E >>LGyy##k&9&99[=R=RUW=WXH  *##k&9&99%%(;(;;<   +"8"88<GG#!!+.-	/0 r    Fr
   c                     y)NFr   )r   r   r   s      r!   <lambda>zBaseConfigHeuristic.<lambda>5  s    r    r   c	           	         | j                  |||||||      }	t        j                  dk(  r|dkD  sJ d       | j                  |	|      }	| j	                  |	      S )N
EXHAUSTIVEr   z1dtype_size must be provided for exhaustive search)r   r   max_autotune_gemm_search_spacer   r   )
rn   r   r   r   r   r   r   r   r   r   s
             r!   preprocess_mm_configsz)BaseConfigHeuristic.preprocess_mm_configs-  sh     //q!We_g
 00L@>V#VV>!;;NJWN((88r    c                $    ddl m}  ||||      S )Nr   r   )r   r   )tritonr   )rn   r   r   rD   TritonConfigs        r!   r   z!BaseConfigHeuristic.triton_configA  s     	2FzYOOr    c                D    t        | j                  | j                        S Nr   )r   r   r[   rn   s    r!   get_mm_configsz"BaseConfigHeuristic.get_mm_configsH  s    t114??KKr    c                D    t        | j                  | j                        S r   )r   r   r^   r   s    r!   get_exhaustive_mm_configsz-BaseConfigHeuristic.get_exhaustive_mm_configsK  s    t114;R;RSSr    c                D    t        | j                  | j                        S r   )r   r   r_   r   s    r!   get_extra_mm_configsz(BaseConfigHeuristic.get_extra_mm_configsN  s    t114;P;PQQr    c                D    t        | j                  | j                        S r   )r   r   r`   r   s    r!   get_int8_mm_configsz'BaseConfigHeuristic.get_int8_mm_configsQ  s    t114;O;OPPr    c                    t         j                  dk(  r| j                  | j                  z   n| j                  }t	        | j
                  |      S Nr   r   )r   r   r[   ra   r   r   )rn   r[   s     r!   get_mixed_mm_configsz(BaseConfigHeuristic.get_mixed_mm_configsT  sG     44D OOd333 	
 t11:FFr    c                    t         j                  dk(  r| j                  n| j                  }|D cg c]  }|j                  dk7  s| }}t        | j                  |      S c c}w )Nr   r/   r   )r   r   r^   rb   r   r   r   )rn   rb   r   s      r!   get_persistent_mm_configsz-BaseConfigHeuristic.get_persistent_mm_configs\  sp     44D ##++ 	 "7!
&:J:Ja:OF!
 !
 t11;PQQ!
s   A"A"c                D    t        | j                  | j                        S r   )r   r   rc   r   s    r!   get_scaled_mm_configsz)BaseConfigHeuristic.get_scaled_mm_configsi  s    t114;Q;QRRr    c                D    t        | j                  | j                        S r   )r   r   rd   r   s    r!    get_scaled_persistent_mm_configsz4BaseConfigHeuristic.get_scaled_persistent_mm_configsl  s"     &&0Q0Q
 	
r    c                D    t        | j                  | j                        S r   )r   r   re   r   s    r!   get_mm_plus_mm_configsz*BaseConfigHeuristic.get_mm_plus_mm_configss  s    t00$:Q:QRRr    c                D    t        | j                  | j                        S r   )r   r   rg   r   s    r!   get_conv_configsz$BaseConfigHeuristic.get_conv_configsv  s    t114;L;LMMr    c                |   g }t         j                  r.t         j                  dk(  r| j                  S || j                  z  }|dk  r1|t
        j                  k(  rt        dddd      }n?t        dddd      }n0|t
        j                  k(  rt        dddd      }nt        dddd      }||vr|j                  |       |S )	Nr   rT   rP   rR   rO   rN   rM   r,   )	r   max_autotunemax_autotune_flex_search_spacerk   rh   r   float32r'   r   rn   head_dimdtypeflex_attn_fwd_configsdefault_configs        r!   get_flex_attn_fwd_configsz-BaseConfigHeuristic.get_flex_attn_fwd_configsz  s    2444D<<<!T%H%HH!s?%!+BAq!9!+CQ!:%!+BAq!9!+BAq!9!66!((8$$r    c                    g }t         j                  r.t         j                  dk(  r| j                  S || j                  z  }t        dddd      }||vr|j                  |       |S )Nr   r,   r
   rO   )r   r   r   rl   ri   r'   r   rn   r   r   flex_attn_bwd_configsr   s        r!   get_flex_attn_bwd_configsz-BaseConfigHeuristic.get_flex_attn_bwd_configs  si    2444D<<<!T%H%HH!#BAq1!66!((8$$r    c                    g }t         j                  r.t         j                  dk(  r| j                  S || j                  z  }t        ddd      }||vr|j                  |       |S )Nr   rP   r
   r/   )r   r   r   )r   r   r   rm   rj   r)   r   rn   r   r   flex_decode_configsr   s        r!   get_flex_decode_configsz+BaseConfigHeuristic.get_flex_decode_configs  si     7944D:::4#D#DD)"aP!44&&~6""r    NrG   Noner   list[BaseConfig]rG   #Generator[TritonConfig, None, None])r   r   r   r   r   r   r   r   r   floatr   boolr   Callable[[int, int, int], bool]rG   r   )r   r   r   r   rG   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rG   r   )r   r   r   r   rD   r   rG   r   rG   z,partial[Generator[TritonConfig, None, None]]r   r   r   r   rG   zlist[FlexConfig]r   r   r   r   rG   zlist[FlexDecodeConfig])r   r   r   r   rv   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   rH   rH      s   K
Z*3!*3 
-*3X<< < 	<
 "< < < 1< 
<| !    
	 P !&3H99 9 	9
 "9 9 9 19 9 
-9(PP*-P9<P	PLTRQGRS
	5
SN%0%##$'#	#r    rH   )	metaclassc                      e Zd Zy)CPUConfigHeuristicN)r   r   r   r   r    r!   r   r     s    r    r   c                  H     e Zd ZdZd fdZddZddZ	 	 	 	 	 	 ddZ xZS )	CUDAConfigHeuristiczQ
    Child class for CUDA device specific gemm/flex attention/conv/ configs.
    c                v   t         |           t        j                  dft	        dddd      t        j                  dft	        dddd      t        j                  dft	        dddd      t        j
                  dft	        dddd      t        j
                  dft	        dddd      t        j
                  dft	        dddd      t        j                  dft	        dddd      t        j                  dft	        dddd      t        j                  dft	        dddd      i	| _        t        j                  dft	        dddd      t        j                  dft	        dddd      t        j                  dft	        dddd      t        j
                  dft	        dddd      t        j
                  dft	        dddd      t        j
                  dft	        dddd      t        j                  dft	        dddd      t        j                  dft	        dddd      t        j                  dft	        dddd      i	| _        y )	NrP   rN   rM   rR   rO   rT   r$   r,   )	r@   rv   r   r   r'   bfloat16float16h100_default_flex_configa100_default_flex_config)rn   rF   s    r!   rv   zCUDAConfigHeuristic.__init__  s    ]]BCQ!:]]C *RQ":]]C *RQ":^^R *S#q!"<^^S!:c2q!#<^^S!:b"a#;]]BCa!;]]C *S#q!"<]]C *RQ":
)
% ]]BCQ!:]]C *S"a";]]C *RQ":^^R *S"a";^^S!:c2q!#<^^S!:b"a#;]]BCQ!:]]C *S"a";]]C *RQ":
)
%r    c                F   t         j                  j                         }g }t        j                  r.t        j
                  dk(  r| j                  S || j                  z  }|dk  rx|t         j                  k(  rt        dddd      }nt        dddd      }|dk\  r| j                  j                  ||f|      }nT|dk\  rO| j                  j                  ||f|      }n0|t         j                  k(  rt        d	d
dd      }nt        dd	dd      }||vr|j                  |       |S )Nr   rT   rP   rR   rO   rN   	   r   r$   r   rM   r,   )r   r   get_device_capabilityr   r   r   rk   rh   r   r'   r   getr   r   )rn   r   r   
capabilityr   r   s         r!   r   z-CUDAConfigHeuristic.get_flex_attn_fwd_configs  s   ZZ557
2444D<<<!T%H%HH!s?%!+BAq!9!+CQ!:V#!%!>!>!B!BH%~" v%!%!>!>!B!BH%~" %!+BAq!9!+BAq!9!66!((8$$r    c                ^   t         j                  j                         }g }t        j                  r.t        j
                  dk(  r| j                  S || j                  z  }|t         j                  k(  rt        dddd      }n|dk  r<|dk\  r7|dk(  rt        dddd      }ny|d	k(  rt        dd	dd
      }net        dddd      }nV|dk\  rC|dk(  rt        dd	dd      }n=|d	k(  r|d   dk(  rdnd}t        dd|d      }nt        dddd      }nt        dddd      }||vr|j                  |       |S )Nr   r,   r
   rO   rT   r   rP   rR   rN   r$   r/   r   rM   r   )r   r   r   r   r   r   rl   ri   r   r'   r   )rn   r   r   r   r   r   r   s          r!   r   z-CUDAConfigHeuristic.get_flex_attn_bwd_configs  s;   ZZ557
2444D<<<!T%H%HH!EMM!'B15N_v!52~!+BAq!9S!+BQ!:!+BAq!96!2~!+BQ!:S",Q-1"4Q!
!+BJ!B!+BAq!9'B15N!66!((8$$r    c                   t         j                  j                         }t        ddd      }g }t        j
                  r.t        j                  dk(  r| j                  S || j                  z  }|dk\  r4|dkD  r!|t         j                  k(  rt        ddd      }nt        ddd      }nt        ddd      }||vr|j                  |       |S )NrP   r
   r/   r   r   rN   rR   )r   r   r   r)   r   r   r   rm   rj   r   r   )rn   r   r   r   r   r   s         r!   r   z+CUDAConfigHeuristic.get_flex_decode_configs  s     ZZ557
)"a36844D:::4#D#DD#~%5=="8!1"a!;!1"a!;-b!Q7N!44&&~6""r    r   r   r   )	r   r   r   r   rv   r   r   r   rI   rJ   s   @r!   r   r     s6    
6%B"%H##$'#	#r    r   c                       e Zd ZdZd fdZ	 	 	 	 	 	 ddZ	 	 	 	 ddZddZddZddZ	ddZ
dd	Z	 	 dd
ZddZddZddZddZ	 	 	 	 	 	 ddZ xZS )ROCmConfigHeuristiczJ
    Child class for ROCm specific gemm/flex attention/conv/ configs.
    c                   t         |           t               | _        g t	        ddd| j                  ddd      t	        ddd| j                  dd      t	        ddd| j                  ddd      t	        ddd	| j                  dd      t	        dd
d
| j                  dd      t	        d
dd	| j                  ddd      t	        d
dd| j                  dd      t	        d
dd
| j                  dd      t	        d
dd
| j                  dd      t	        d
dd	| j                  dd      t	        d
d
d| j                  dd      t	        d
d
d
| j                  dd      t	        d
d
d	| j                  dd      t	        d
d
d| j                  dd      t	        d
d	d| j                  ddd      t	        d
d	d| j                  dd      t	        d
d	d
| j                  dd      t	        d
d	d	| j                  dd      t	        d	dd| j                  dd      t	        d	dd
| j                  dd      t	        d	d
d| j                  ddd      t	        d	d
d
| j                  dd      t	        d	d
d	| j                  dd      t	        d	d	d| j                  ddd      t	        d	d	d| j                  dd      t	        d	d	d| j                  ddd      t	        d	d	d
| j                  dd      t	        d	d	d
| j                  dd      t	        d	d	d	| j                  dd      t	        d	dd| j                  ddd      t	        d	dd
| j                  dd      t	        dd
d
| j                  dd      t	        dd	d| j                  ddd      t	        dd	d| j                  dd      t	        dd	d
| j                  dd      t	        ddd
| j                  dd      | _        t        j                  g dd      D 	c
g c]O  \  }}}d| j                  fD ]8  }dD ]1  }dD ]*  }dD ]#  }dD ]  }dD ]  }	t	        |||||||||		        % , 3 : Q c
}	}}}}}}}}| _        t        j                  d
ft        d	ddd      t        j                  d	ft        d	ddd      t        j                  dft        d
ddd      t        j                  d
ft        d	d
dd      t        j                  d	ft        d	d
dd      t        j                  dft        dd
dd      t        j                  d
ft        d	d
dd      t        j                  d	ft        d	d
dd      t        j                  dft        dd
dd      i	| _        dD 
cg c]  }
dD ]  }dD ]  }t        |
|d|          c}}}
| _        dD 
cg c]=  }
dD ]6  }|
d	k\  s|d	k\  rddgndgD ]   }dD ]  }||
z  dk(  rt        |
|d||       " 8 ? c}}}}
| _        t#        ddd      t#        d
dd      t#        d	dd      t#        ddd      t#        d
dd      t#        d	dd      g| _        dD cg c]C  }dD ]<  }dD ]5  }dD ].  }dD ]'  }dt'        d|z        fD ]  }t        ||||||       ) 0 7 > E c}}}}}}| _        dD 
cg c]K  }
dD ]D  }dD ]=  }dD ]6  }dD ]/  }dt'        d|z        fD ]  }||
z  dk(  rt        |
|||||       1 8 ? F M c}}}}}}
| _        dD cg c]=  }dD ]6  }dD ]/  }dD ](  }dt'        d|z        fD ]  }t#        |||||d       * 1 8 ? c}}}}}| _        y c c
}	}}}}}}}}w c c}}}
w c c}}}}
w c c}}}}}}w c c}}}}}}
w c c}}}}}w )Nr,   rT   rO   r/   )r%   r.   rM   )r%   r$   rN   rP   rS   rR   rU   r
   )rO   r$   )rO   r$   r,   )r   r,   )r   r/   )r/   )r,   rP   rN   rZ   )r,   rM   rP   rY   r   )r
   r/   rW   )r0   )r@   rv   r   default_num_stagesr+   r[   r\   r]   r^   r   r   r6   r   r   default_flex_configrh   ri   r9   rj   r   rk   rl   rm   )rn   ro   rp   rq   r   r   r%   r-   r.   r0   rr   rs   ru   mfmawpeur   rF   s                   r!   rv   zROCmConfigHeuristic.__init__=  s=
   "8":7-
BT44aQR7-
 2r3(?(?AN	7-

 BD33QPQ7-
 2r3(?(?AN7-
 2r2t'>'>1M7-
 BT44aQR7-
 2r2t'>'>1M7-
 2r2t'>'>1M7-
 2r2t'>'>1M7-
  2r3(?(?AN!7-
" 2r2t'>'>1M#7-
$ 2r2t'>'>1M%7-
& 2r3(?(?BO'7-
( 2r3(?(?AN)7-
* CT44aQR+7-
0 2sB(?(?AN17-
2 2sB(?(?AN37-
4 2sC)@)@!QO57-
6 3B(?(?AN77-
8 3B(?(?AN97-
: RT44aQR;7-
@ 3B(?(?BOA7-
B 3C)@)@!QOC7-
D S"d55q"STE7-
J 3R)@)@!RPK7-
L S"d55q"STM7-
R 3R)@)@!RPS7-
T 3R)@)@!QOU7-
V 3S$*A*A1bQW7-
X S"d55q"STY7-
^ 3R)@)@!QO_7-
` 3B(?(?ANa7-
b S"d55q!RSc7-
h 3R)@)@!RPi7-
j 3R)@)@!QOk7-
l 3R)@)@!QOm7-
L .7->->&q.5
 5
 *'  !$"9"9:5
 #!5
  %#5
" (/%5
$ % &'5
& )5
( ' $
5

5

5

5

5

5

5
 5
0 ]]BRA!>]]C .b!Q"?]]C .RA">^^R .b!Q"?^^S!>#r1a#@^^S!>"b!Q#?]]BRA!>]]C .b!Q"?]]C .RA">
$
  (A
 A
+A
 	A
  661a0A
0A
0A
+ 'A
 A
'A
  &#3q!fQC	A
 A

 !# 661a6A
6A
6A
6A
+ !Q* Q* a+ Q* Q* a+E
) -C
 C
(C
 $	C
 &C

 C
 CY/0C
  7GZD$OC
OC
OC
OC
OC
OC
- ,	C
 	C
+	C
 $		C
 &	C

 	C
 CY/0	C
 !# 66:y$M	C
M	C
M	C
M	C
M	C
M	C
- -G
 G
$G
 &	G
 G

 CY/0G
  !*itSTUG
UG
UG
UG
UG
+c5
 5
FA
A
$C
	C
G
s)   A^#^A^
6A^$A^.9A^8c                6    |D ]  }| j                   |_         |S r?   )r  r   )rn   r   new_num_stagesr   s       r!   _filter_configsz#ROCmConfigHeuristic._filter_configs  s%    
  	3A22AL	3r    c           
   #  N  K   t               }t        j                  j                  }|D ]w  }t	        |j
                  |j                  |j                  z  dz        |_        t        |dd      }t        |dd      }t        |dd      }|dk7  r%|j                  |z  dk7  s|j                  |z  dk7  r|j                  |j                  |j                  |j                  |j
                  |||f}t        |dd	      }	|	||	fz  }|dk7  rt        d
|j
                  z        }||vs|t        |      |k  s|j                  |       |j                  |j                  |j                  |j                  |j
                  |||d}
|	|	|
d<    | j                  di |
 z y	w)rx   rT   r-   r,   r.   r   r0   r/   r%   Nr$   )ro   rp   rq   r   r   r-   r.   r0   ry   r   )r	   r   rz   r{   r|   r   r   r   r}   r   r   r   r~   r   r   )rn   r   r   r{   r   r-   r.   r0   r   r%   rD   s              r!   r   z(ROCmConfigHeuristic._finalize_mm_configs  s     -7L,,;; 4	3D 1LPS1STDN $+41G#L "4;LD'1-E#q(33q8<<"66!;  $	$C dIt4G"z!q "1#67$&#d)n*D#||#||#||"&//!%,@$0"	 &(/F9%(d((2622i4	3s   D#F%&F%8A-F%c                |    | j                  | j                  | j                        }t        | j                  |      S r   )r  r_   r  r   r   rn   filtered_configss     r!   r   z(ROCmConfigHeuristic.get_extra_mm_configs   s:    //!!4#:#:
 t11;KLLr    c                |    | j                  | j                  | j                        }t        | j                  |      S r   )r  r`   r  r   r   r  s     r!   r   z'ROCmConfigHeuristic.get_int8_mm_configs&  s:    //  $"9"9
 t11;KLLr    c                    t         j                  dk(  r| j                  | j                  z   n| j                  }| j	                  || j
                        }t        | j                  |      S r   )r   r   r[   ra   r  r  r   r   )rn   r[   r  s      r!   r   z(ROCmConfigHeuristic.get_mixed_mm_configs,  sa     44D OOd333 	
  //
D<S<STt11;KLLr    c                |    | j                  | j                  | j                        }t        | j                  |      S r   )r  rb   r  r   r   r  s     r!   r   z-ROCmConfigHeuristic.get_persistent_mm_configs5  s:    //&&(?(?
 t11;KLLr    c                |    | j                  | j                  | j                        }t        | j                  |      S r   )r  rc   r  r   r   r  s     r!   r   z)ROCmConfigHeuristic.get_scaled_mm_configs;  s:    //""D$;$;
 t11;KLLr    c                |    | j                  | j                  | j                        }t        | j                  |      S r   )r  rd   r  r   r   r  s     r!   r   z4ROCmConfigHeuristic.get_scaled_persistent_mm_configsA  s<      //--t/F/F
 t11;KLLr    c                h    | j                  | j                  d      }t        | j                  |      S )Nr
   r   )r  re   r   r   r  s     r!   r   z*ROCmConfigHeuristic.get_mm_plus_mm_configsI  s/    //0G0GKt00:JKKr    c                |    | j                  | j                  | j                        }t        | j                  |      S r   )r  rg   r  r   r   r  s     r!   r   z$ROCmConfigHeuristic.get_conv_configsM  s:    //t66
 t11;KLLr    c                   g }t         j                  r.t         j                  dk(  r| j                  S || j                  z  }|dk  rO|t
        j                  k(  rt        dddd      }nt        dddd      }| j                  j                  ||f|      }n0|t
        j                  k(  rt        dd	dd      }nt        dddd      }||vr|j                  |       |S )
Nr   rT   rP   r
   rO   rN   r$   rM   r,   )r   r   r   rk   rh   r   r   r6   r  r   r   r   s        r!   r   z-ROCmConfigHeuristic.get_flex_attn_fwd_configsS  s    2444D<<<!T%H%HH!s?%!/B1!=!/RA!>!5599!>N %!/B1!=!/B1!=!66!((8$$r    c                   g }t         j                  r.t         j                  dk(  r| j                  S || j                  z  }|t
        j                  k(  rt        dddd      }nJ|dk  r7|dk(  rt        dddd      }n1|dk(  rt        dddd      }nt        dddd      }nt        dddd      }||vr|j                  |       |S )	Nr   r,   r
   rO   rT   rP   rN   r$   )	r   r   r   rl   ri   r   r   r6   r   r   s        r!   r   z-ROCmConfigHeuristic.get_flex_attn_bwd_configsn  s    2444D<<<!T%H%HH!EMM!+BAq9N_2~!/B1!=S!/CA!>!/B1!=+BAq9N!66!((8$$r    c                    g }t         j                  r.t         j                  dk(  r| j                  S || j                  z  }t        ddd      }||vr|j                  |       |S )Nr   rP   r
   rO   )r   r   r   rm   rj   r9   r   r   s        r!   r   z+ROCmConfigHeuristic.get_flex_decode_configs  si     7944D:::4#D#DD-b!Q7!44&&~6""r    r   )r   r   r
  r   rG   r   r   r   r   r   )r   r   r   r   rv   r  r   r   r   r   r   r   r   r   r   r   r   r   rI   rJ   s   @r!   r  r  8  s    W
r'9<	?3!?3 
-?3BMMMMMM	5MLM%6%2##$'#	#r    r  c                      e Zd ZdZy)XPUConfigHeuristicz=
    Placeholder child class for XPU specific overrides.
    N)r   r   r   r   r   r    r!   r  r    s    r    r  ),
__future__r   r   r\   r   	functoolsr   	threadingr   typingr   r   r   r   torch.utils._ordered_setr	    r   utilsr   virtualizedr   collections.abcr   r   r   r   	dataclassr   r#   rf   r'   r)   r+   r4   r6   r9   typer;   rH   r   r   r  r  r   r    r!   <module>r'     s   "      / /  /  )  )- 	 	 	    
        Z   Z   Z   +  'T '&a#$: a#H	, 	~#- ~#B^#- ^#B, r    