
    rh1                     l   d dl Z d dlZd dlmZmZ d dlZd dlZd dlmZ d dl	m
Z
 d dlmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d
dlmZmZ d
dlmZmZ d
dlm Z  d
dl!m"Z" d
dl#m$Z$ d
dlm%Z%m&Z& d
dl'm(Z(m)Z)m*Z+m,Z, d
dl-m.Z.m/Z/m0Z0m1Z1 d
dl2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z= ddl>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZEmFZFmGZGmHZH 	 d dlIZI eeIj                        ZKdZL ej                  eO      ZPej                  j                  ZRej                  j                  ZS e1deCej                  j                  eKdk\  rdnddd      ZV e1deEd       ZWd!ZXd"ZYd#ZZ e1d$eEeZeXz   eYz          Z[e j                  d%        Z] e/ej                  d&      Z_ e/ej                  d'eRj                  j                  (      Zb e/ej                  d)      Zd e/ej                  d*d+      Zf e/ej                  d,eRj                  j                  (      Zid- Zjd. Zke j                  d/emfd0       Zndddd1d2ZodEd3Zp e/eod      Zqd4 Zr e,eRj                  d5      dd6d7       Zs e,eRj                  d5      dd6d8       Zt e,eRj                  d5      dddd9d:       Zu e,eRj                  d5      ddd;d<       Zv e(eRj                  j                  e)        e,eRj                  j                  d5      	 	 	 	 	 dFd=       Zwe j                  d>eex   d/emfd?       Zyd@ Zz	 	 dGdAeex   fdBZ{dC Z|dD Z}y# eM$ r  ed      ZKdZLY \w xY w)H    N)AnyOptional)counters)AutoHeuristicSelectAlgorithm)	AHContextcontext_add_stridescontext_add_using_tf32mm_operations)CppGemmTemplate)V)make_fx)TorchVersion   )configir)CUTLASS2xGemmTemplateCUTLASS3xGemmTemplate)CKTileGemmTemplate)CKGemmTemplate)SubgraphTemplate)FlexibleLayout	is_triton)add_layout_constraintconstrain_to_fx_strides	loweringsregister_lowering)autotune_select_algorithmExternKernelChoicerealize_inputsTritonTemplate)_use_cutlass_for_opget_k_splitsget_tma_workspace_arguse_aten_gemm_kernelsuse_ck_gemm_templateuse_ck_tile_gemm_templateuse_cpp_gemm_templateuse_cutlass_templateuse_decompose_k_choiceuse_triton_templateuse_triton_tma_template   )
_is_static_problemaddmm_epiloguemm_argsmm_config_kwargsmm_grid
mm_optionspersistent_mm_gridpersistent_mm_optionsscale_mm_epiloguescaled_mm_optionsTz0.0.0Fmmz3.3.0a3
  
{{def_kernel("A", "B")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return
    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}

    # based on triton.ops.matmul
    pid = tl.program_id(0)
    grid_m = (M + BLOCK_M - 1) // BLOCK_M
    grid_n = (N + BLOCK_N - 1) // BLOCK_N

    # re-order program ID for better L2 performance
    width = GROUP_M * grid_n
    group_id = pid // width
    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
    pid_m = group_id * GROUP_M + (pid % group_size)
    pid_n = (pid % width) // (group_size)
    tl.assume(pid_m >= 0)
    tl.assume(pid_n >= 0)

    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    if ((stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1)) and M >= BLOCK_M:
        offs_a_m = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
    else:
        offs_a_m = rm % M
    if ((stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1)) and N >= BLOCK_N:
        offs_b_n = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
    else:
        offs_b_n = rn % N
    offs_k = tl.arange(0, BLOCK_K)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)

    for k_idx in range(0, tl.cdiv(K, BLOCK_K)):
        {% if not EVEN_K %}
        a_mask = offs_k[None, :] < (K - k_idx * BLOCK_K)
        b_mask = offs_k[:, None] < (K - k_idx * BLOCK_K)
        {% endif %}
        a_k_idx_vals = offs_k[None, :] + (k_idx * BLOCK_K)
        b_k_idx_vals = offs_k[:, None] + (k_idx * BLOCK_K)

        idx_m = offs_a_m[:, None]
        idx_n = a_k_idx_vals
        {{load_input("A", "a", ("idx_m", "idx_n"), mask=None if EVEN_K else "a_mask", indent_width=8)}}

        idx_m = b_k_idx_vals
        idx_n = offs_b_n[None, :]
        {{load_input("B", "b", ("idx_m", "idx_n"), mask=None if EVEN_K else "b_mask", indent_width=8)}}

        {% if USE_FAST_ACCUM %}
        acc = tl.dot(a, b, acc, allow_tf32=ALLOW_TF32, out_dtype=ACC_TYPE)
        {% else %}
        acc += tl.dot(a, b, allow_tf32=ALLOW_TF32, out_dtype=ACC_TYPE)
        {% endif %}

    # rematerialize rm and rn to save registers
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    idx_m = rm[:, None]
    idx_n = rn[None, :]
    mask = (idx_m < M) & (idx_n < N)

    # inductor generates a suffix
    {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
a
  
{{def_kernel("A", "B")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return
    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}

    # based on triton.ops.matmul
    pid = tl.program_id(0)
    grid_m = (M + BLOCK_M - 1) // BLOCK_M
    grid_n = (N + BLOCK_N - 1) // BLOCK_N

    # re-order program ID for better L2 performance
    width = GROUP_M * grid_n
    group_id = pid // width
    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
    pid_m = group_id * GROUP_M + (pid % group_size)
    pid_n = (pid % width) // (group_size)
    tl.assume(pid_m >= 0)
    tl.assume(pid_n >= 0)

    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    if (stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1):
        offs_a_m = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
    else:
        offs_a_m = rm % M
    if (stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1):
        offs_b_n = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
    else:
        offs_b_n = rn % N
    offs_k = tl.arange(0, BLOCK_K)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)

    for k_idx in range(0, tl.cdiv(K, BLOCK_K)):
        {% if not EVEN_K %}
        a_mask = offs_k[None, :] < (K - k_idx * BLOCK_K)
        b_mask = offs_k[:, None] < (K - k_idx * BLOCK_K)
        {% endif %}
        a_k_idx_vals = offs_k[None, :] + (k_idx * BLOCK_K)
        b_k_idx_vals = offs_k[:, None] + (k_idx * BLOCK_K)

        idx_m = offs_a_m[:, None]
        idx_n = a_k_idx_vals
        {{load_input("A", "a", ("idx_m", "idx_n"), mask=None if EVEN_K else "a_mask", indent_width=8)}}

        idx_m = b_k_idx_vals
        idx_n = offs_b_n[None, :]
        {{load_input("B", "b", ("idx_m", "idx_n"), mask=None if EVEN_K else "b_mask", indent_width=8)}}
        {% if USE_FAST_ACCUM %}
        acc = tl.dot(a, b, acc, allow_tf32=ALLOW_TF32, out_dtype=ACC_TYPE)
        {% else %}
        acc += tl.dot(a, b, allow_tf32=ALLOW_TF32, out_dtype=ACC_TYPE)
        {% endif %}

    # rematerialize rm and rn to save registers
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    idx_m = rm[:, None]
    idx_n = rn[None, :]
    mask = (idx_m < M) & (idx_n < N)

    # inductor generates a suffix
    {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
)namegridsource"cache_codegen_enabled_for_templateprologue_loads_all_inputsmm_persistent_tmaa  
{{def_kernel("A", "B")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return

    start_pid = tl.program_id(0)
    grid_m = tl.cdiv(M, BLOCK_M)
    grid_n = tl.cdiv(N, BLOCK_N)
    k_tiles = tl.cdiv(K, BLOCK_K)
    num_tiles = grid_m * grid_n
    tiles_per_SM = num_tiles // NUM_SMS
    if start_pid < num_tiles % NUM_SMS:
        tiles_per_SM += 1

    tile_id = start_pid - NUM_SMS
    ki = -1

    width = GROUP_M * grid_n
    rk_for_mask = tl.arange(0, BLOCK_K)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)

    workspace_base = ws_ptr + start_pid * 2 * TMA_SIZE
    a_desc_ptr = workspace_base
    b_desc_ptr = workspace_base + TMA_SIZE

    {%- if TMA_EXPERIMENTAL_API %}
    triton.language.extra.cuda.experimental_device_tensormap_create2d(
        desc_ptr=a_desc_ptr,
        global_address=A,
        load_size=[BLOCK_M, BLOCK_K] if A_ROW_MAJOR else [BLOCK_K, BLOCK_M],
        global_size=[M, K] if A_ROW_MAJOR else [K, M],
        element_ty=A.dtype.element_ty,
    )
    triton.language.extra.cuda.experimental_device_tensormap_create2d(
        desc_ptr=b_desc_ptr,
        global_address=B,
        load_size=[BLOCK_K, BLOCK_N] if B_ROW_MAJOR else [BLOCK_N, BLOCK_K],
        global_size=[K, N] if B_ROW_MAJOR else [N, K],
        element_ty=B.dtype.element_ty,
    )

    tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(a_desc_ptr)
    tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(b_desc_ptr)

    a_desc = a_desc_ptr
    b_desc = b_desc_ptr
    {%- else %}
    a_desc = triton.language.make_tensor_descriptor(
        base=A,
        shape=[M, K] if A_ROW_MAJOR else [K, M],
        strides=[K, 1] if A_ROW_MAJOR else [M, 1],
        block_shape=[BLOCK_M, BLOCK_K] if A_ROW_MAJOR else [BLOCK_K, BLOCK_M],
    )
    b_desc = triton.language.make_tensor_descriptor(
        base=B,
        shape=[K, N] if B_ROW_MAJOR else [N, K],
        strides=[N, 1] if B_ROW_MAJOR else [K, 1],
        block_shape=[BLOCK_K, BLOCK_N] if B_ROW_MAJOR else [BLOCK_N, BLOCK_K],
    )
    {%- endif %}

    pid_m = 0
    pid_n = 0
    rm = 0
    rn = 0

    for _ in range(0, k_tiles * tiles_per_SM):
        ki = tl.where(ki == k_tiles - 1, 0, ki + 1)
        if ki == 0:
            tile_id += NUM_SMS
            # re-order program ID for better L2 performance
            group_id = tile_id // width
            group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
            pid_m = group_id * GROUP_M + (tile_id % group_size)
            pid_n = (tile_id % width) // (group_size)

            rm = pid_m * BLOCK_M
            rn = pid_n * BLOCK_N

        rk = ki * BLOCK_K

        {%- if TMA_EXPERIMENTAL_API %}
        a = tl._experimental_descriptor_load(
            a_desc,
            [rm, rk] if A_ROW_MAJOR else [rk, rm],
            [BLOCK_M, BLOCK_K] if A_ROW_MAJOR else [BLOCK_K, BLOCK_M],
            A.dtype.element_ty,
        )
        b = tl._experimental_descriptor_load(
            b_desc,
            [rk, rn] if B_ROW_MAJOR else [rn, rk],
            [BLOCK_K, BLOCK_N] if B_ROW_MAJOR else [BLOCK_N, BLOCK_K],
            B.dtype.element_ty,
        )
        {%- else %}
        a = tl.load_tensor_descriptor(
            a_desc,
            [rm, rk] if A_ROW_MAJOR else [rk, rm],
        )
        b = tl.load_tensor_descriptor(
            b_desc,
            [rk, rn] if B_ROW_MAJOR else [rn, rk],
        )
        {%- endif %}
        acc += tl.dot(
            a if A_ROW_MAJOR else a.T,
            b if B_ROW_MAJOR else b.T,
            allow_tf32=ALLOW_TF32,
        )

        if ki == k_tiles - 1:
            # rematerialize rm and rn to save registers
            rcm = rm + tl.arange(0, BLOCK_M)
            rcn = rn + tl.arange(0, BLOCK_N)
            idx_m = rcm[:, None]
            idx_n = rcn[None, :]
            mask = (idx_m < M) & (idx_n < N)

            # inductor generates a suffix
            {{store_output(("idx_m", "idx_n"), "acc", "mask", indent_width=12)}}
            acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)

)r8   r9   r:   a  
@triton.jit
def load_scales(a_scale_ptr, b_scale_ptr, SCALING_ROWWISE: tl.constexpr):
    if SCALING_ROWWISE:
        # For row-wise scaling, we'll return the pointers
        return a_scale_ptr, b_scale_ptr
    else:
        # For per-tensor scaling, we'll load the scalar values
        a_scale = tl.load(a_scale_ptr)
        b_scale = tl.load(b_scale_ptr)
        return a_scale, b_scale
a'  
@triton.jit
def apply_scaling(
    accumulator,
    a_scale,
    b_scale,
    SCALING_ROWWISE: tl.constexpr,
    offs_cm,
    offs_cn,
    M,
    N,
    stride_a_scale_m,
    stride_b_scale_n,
):
    if SCALING_ROWWISE:
        # For row-wise scaling, we need to load the scales for each row/column
        a_scales = tl.load(
            a_scale + (offs_cm * stride_a_scale_m),
            mask=offs_cm < M,
            other=0.0,
        )
        b_scales = tl.load(
            b_scale + (offs_cn * stride_b_scale_n),
            mask=offs_cn < N,
            other=0.0,
        )
        acc_scale = a_scales[:, None] * b_scales[None, :]
    else:
        # For per-tensor scaling, we can directly use the loaded scalar values
        acc_scale = a_scale * b_scale

    return accumulator * acc_scale
a  
{{def_kernel("A", "B", "A_inverse_scale", "B_inverse_scale")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return

    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}

    if SCALING_ROWWISE:
        stride_a_scale_m = 1
        stride_b_scale_n = 1
    else:
        stride_a_scale_m = 0
        stride_b_scale_n = 0

    start_pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_M)
    num_pid_n = tl.cdiv(N, BLOCK_N)
    k_tiles = tl.cdiv(K, BLOCK_K)
    num_tiles = num_pid_m * num_pid_n

    workspace_base = ws_ptr + start_pid * 2 * TMA_SIZE
    a_desc_ptr = workspace_base
    b_desc_ptr = workspace_base + TMA_SIZE

    {%- if TMA_EXPERIMENTAL_API %}
    triton.language.extra.cuda.experimental_device_tensormap_create2d(
        desc_ptr=a_desc_ptr,
        global_address=A,
        load_size=[BLOCK_M, BLOCK_K],
        global_size=[M, K],
        element_ty=A.dtype.element_ty,
    )
    triton.language.extra.cuda.experimental_device_tensormap_create2d(
        desc_ptr=b_desc_ptr,
        global_address=B,
        load_size=[BLOCK_N, BLOCK_K],
        global_size=[N, K],
        element_ty=B.dtype.element_ty,
    )

    tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(a_desc_ptr)
    tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(b_desc_ptr)

    a_desc = a_desc_ptr
    b_desc = a_desc_ptr
    {%- else %}
    a_desc = triton.language.make_tensor_descriptor(
        base=A,
        shape=[M, K],
        strides=[K, 1],
        block_shape=[BLOCK_M, BLOCK_K],
    )
    b_desc = triton.language.make_tensor_descriptor(
        base=B,
        shape=[N, K],
        strides=[K, 1],
        block_shape=[BLOCK_N, BLOCK_K],
    )
    {%- endif %}

    tiles_per_SM = num_tiles // NUM_SMS
    if start_pid < num_tiles % NUM_SMS:
        tiles_per_SM += 1

    tile_id = start_pid - NUM_SMS
    ki = -1

    pid_m = 0
    pid_n = 0
    offs_am = 0
    offs_bn = 0

    num_pid_in_group = GROUP_M * num_pid_n
    accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
    a_scale, b_scale = load_scales(A_inverse_scale, B_inverse_scale, SCALING_ROWWISE)

    for _ in range(0, k_tiles * tiles_per_SM):
        ki = tl.where(ki == k_tiles - 1, 0, ki + 1)
        if ki == 0:
            tile_id += NUM_SMS
            group_id = tile_id // num_pid_in_group
            first_pid_m = group_id * GROUP_M
            group_size_m = min(num_pid_m - first_pid_m, GROUP_M)
            pid_m = first_pid_m + (tile_id % group_size_m)
            pid_n = (tile_id % num_pid_in_group) // group_size_m

            offs_am = pid_m * BLOCK_M
            offs_bn = pid_n * BLOCK_N

        offs_k = ki * BLOCK_K

        {%- if TMA_EXPERIMENTAL_API %}
        a = tl._experimental_descriptor_load(
            a_desc_ptr, [offs_am, offs_k], [BLOCK_M, BLOCK_K],  A.dtype.element_ty
        )
        b = tl._experimental_descriptor_load(
            b_desc_ptr, [offs_bn, offs_k], [BLOCK_N, BLOCK_K],  B.dtype.element_ty
        )
        {%- else %}
        a = tl.load_tensor_descriptor(a_desc, [offs_am, offs_k])
        b = tl.load_tensor_descriptor(b_desc, [offs_bn, offs_k])
        {%- endif %}
        if USE_FAST_ACCUM:
            accumulator = tl.dot(a, b.T, accumulator)
        else:
            accumulator += tl.dot(a, b.T)

        if ki == k_tiles - 1:
            # Apply inverse scaling
            offs_cm = offs_am + tl.arange(0, BLOCK_M)
            offs_cn = offs_bn + tl.arange(0, BLOCK_N)
            # Apply scaling
            accumulator = apply_scaling(
                accumulator,
                a_scale,
                b_scale,
                SCALING_ROWWISE,
                offs_cm,
                offs_cn,
                M,
                N,
                stride_a_scale_m,
                stride_b_scale_n,
            )

            idx_m = offs_cm[:, None]
            idx_n = offs_cn[None, :]
            mask = (idx_m < M) & (idx_n < N)
            # inductor generates a suffix
            {{store_output(("idx_m", "idx_n"), "accumulator", "mask", indent_width=12)}}
            accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
scaled_mm_device_tmac                     t        |       S N)r   )fns    l/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/torch/_inductor/kernel/mm.pylazy_register_extern_choicerC   6  s    b!!    z
at::mm_outzat::addmm_out)op_overloadzat::_int_mm_outzat::_sparse_semi_structured_mm)has_out_variantzat::_scaled_mm_outc                 b    | j                         t        j                  t        j                  fv S r@   )	get_dtypetorchint8uint8)mats    rB   _is_int8_matrM   N  s     ==?uzz5;;777rD   c                     | |z  dkD  S )Ni     )mnks      rB   _is_large_block_for_cpurS   R  s    q55=rD   returnc                      t         j                  j                         syt         j                  j                  t         j                  j	                               } | j
                  dk(  S )zEReturns true if the device is a NVIDIA B200, otherwise returns false.F
   )rI   cudais_availableget_device_propertiescurrent_devicemajor)device_propertiess    rB   
using_b200r]   W  sJ     ::""$

889R9R9TU""b((rD   outalphabetac                    | j                  d      dk(  s| j                  d      dk(  rt        j                  | d   |||||      S t        j                  | |||||      S )z
    Giving torch.addmm a 1D tensor calls a different (faster) cublasLt
    kernel under the hood.  There are a few shapes where this is slower,
    but they are rare.
    r   r,   r^   )stridesizerI   addmm)inpmat1mat2r_   r`   ra   s         rB   
bias_addmmri   a  sY     zz!}SXXa[A-{{3q643e$OO;;sD$Cu4HHrD   c                 X    dt         fd}dt         fd}dt         fd}t        j                   | j                               xs  | j	                                fd       t        j                   |j                               xs  |j	                               fd       y )NrT   c                 \    t         j                  j                  j                  | d   d      S )Nr,   r   graphsizevarsstatically_known_equalsrc   s    rB   is_row_majorz.check_supported_striding.<locals>.is_row_majorm  #    ww77q	1EErD   c                 \    t         j                  j                  j                  | d   d      S Nr   r,   rl   rp   s    rB   is_col_majorz.check_supported_striding.<locals>.is_col_majorp  rr   rD   c                     t        t        j                  j                  j	                  | d   d      xs- t        j                  j                  j	                  | d   d            S rt   )boolr   rm   rn   ro   )rd   s    rB   has_zero_dimz.check_supported_striding.<locals>.has_zero_dims  sQ    GG44T!Wa@ Dww77QC
 	
rD   c                  *    d j                          S )Nz$mat_a must be row_major, got stride 
get_stride)mat_as   rB   <lambda>z*check_supported_striding.<locals>.<lambda>|      6u7G7G7I6JK rD   c                  *    d j                          S )Nz$mat_b must be col_major, got stride rz   )mat_bs   rB   r}   z*check_supported_striding.<locals>.<lambda>  r~   rD   )rw   rI   _checkr{   get_size)r|   r   rq   ru   rx   s   ``   rB   check_supported_stridingr   l  s    F FF F
d 
 
LLU%%'(JL9I,JK 
LLU%%'(JL9I,JKrD   c                    | j                   d   }|j                   d   }| j                   d   }||z  }|}t        j                  | j                  |||      d      }|j                  |||      }	t        j                  ||	t        j
                        }
t        j                  |
d      }|j                  | j                        S )Nr   r,   )r,   r   r   	out_dtype)	shaperI   permutereshapebmmfloat32sumtodtype)abk_splitsrP   rQ   rR   k_partsB
a_reshaped
b_reshapedresultreduced_bufs               rB   
decomposeKr     s    	
A	
A	
A8mGAqyyAw7CJ1gq)JYYz:GF))FA&K>>!''""rD   )type_promotion_kindlayoutc                   t        | ||      \  }}}}} }t        j                  |       }d}t        d   d| d| d| xx   dz  cc<   t        j                  d|||| j                         |j                         |       |}t        j                  s<t        j                  s,t        |j                  |j                  |j                        }t               rt        j!                  | |f|      gng }	t#        |      \  }
}t$        j&                  j)                  |      }t$        j&                  j+                  |      }t$        j&                  j-                  |      }| j                         }|rt/        |      r ||||fi t1        |t2        |j4                        D ]*  }t7        j8                  |	f| |f|d	t;        |||||       , t=        | |      rw ||||fi t1        |t2        |j4                        D ]P  }t?        j8                  |	f| |f|tA        d
| jC                               dt;        |||||      tE        | |       R ddl#m$ tK        fd| jM                         | jO                         |jM                         |jO                         fD              }tQ        |||      r|sddl)m*} d
dl+m,} t[        |||      }|D ]  }t$        j\                  j^                  ja                  tc        jd                  tc        jf                  ||      d            sU |       5   |       }ti        d| dtk        tm        jn                  tp        |      |            }ddd       j9                  |	| |f|	        |r2ts        ||||      r$tu        d      rtw        jx                  |	|| |g       |r't{        ||||      rt}        j~                  |	|| |g       |r't        ||||      rt        j                  |	|| |g       t        || |      rt        j                  |	|| |g       | |g}|r t/        |      rt        j                  j                  j                  |      rt        |       rg }t               r|j                  d       t        |	      } ||||fi t1        |t2              D ]*  }t7        j8                  |	f| |f|d	t;        |||||       , t        | |||||	||t               dd|      }t        j                  j                  j                  |      s*|#t        |      dkD  r|	D cg c]	  }||v s| }	}n|	d| }	t        j                  D ].  }|	j                  t        |      j!                  | |f|             0 t        ||	| |g|      S # 1 sw Y   FxY wc c}w )z_
    Lowering for autotuning aten.mm with different backends (Aten, Triton, CUTLASS, etc.)
    r   r7   aten_mm_infozaten.mm__r,   zOTuned aten.mm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sdevicer   rd   input_nodesr   r   num_tma_descriptorsr   r   r   workspace_argr   )get_free_symbolsc              3   J   K   | ]  }t         |d             dkD    yw)T)unbacked_onlyr   N)len).0itrr   s     rB   	<genexpr>ztuned_mm.<locals>.<genexpr>  s+      
  D9:Q>
s    #)enable_python_dispatcher)select_decomp_tabledecompose_k_mm__split)r   )r8   make_fx_graphN	extern_mmrV   )top_kalways_included)Rr/   r   get_device_typer   loginforH   inductor_configmax_autotunemax_autotune_gemmr   r   r   rd   r$   aten_mmbindr-   r   choicesget_base_mm_configsget_persistent_mm_configsget_extra_mm_configsr*   r0   rS   itemsizemm_templatemaybe_append_choicer2   r+   persistent_tma_mm_templater#   
get_devicer4   torch._inductor.irr   anyr   r{   r)   torch._dispatch.pythonr   decompositionr   r"   rm   rn   statically_known_truesympyEqModr   r   	functoolspartialr   r(   r!   r   add_cutlass_gemm_choicesr%   r   add_ck_gemm_choicesr&   r   add_choicesr'   r   rI   	_inductorr   run_autoheuristicr   appendr   mm_autoheuristicr
   collect_autoheuristicexternal_matmulrC   r   )rg   rh   r   rP   rQ   rR   device_typer8   aten_layoutr   static_shape
is_nonzero
mm_configspersistent_mm_configsextra_mm_configsr   r   unbacked_symbolsr   r   r   k_splitdecompositionsdecompose_k_subgraph_templater   r    num_choices_before_extra_configs
ah_choiceschoicer   s                                @rB   tuned_mmr     s   
 #*$V"DAq!VT4$$T*KD ^xs!A3as3494HHY			 K((O,M,M$==6;;
 6K5LtTlK	01RT   2&9L*..{;JII??Lyy55kBNNE)&1 
 {,CU^^T	
 	F ++!4L VQ1f5		 #4./ #!8%..	  +>>
!%t!"7,-#0#	
 !Aq&9
 ,D$7
( 	8  
 !!	
 
 "!Q*3CG;#Aq!,H# ww''==HHUYYq'2A6 -/ 	%8%:N4D.wiv>&-%--j7K*'51	 .AA!%t! B #0 	 Aq1%66wtU*61a;**7FT4LI/1a@&&wtEVT40##4L	
 ,K'OO""44T:dO """;/+.w<(&q!
'5LM
 	F ++!4L VQ1f5		 &O+

 %%;;DA%#j/A*=
 18Pf6Z;O6PP!"C#CD,, R215::D$<PQR %T7T4L&IIm	 	^ Qs   /<V/	V<V</V9	c                   t        | ||t        j                        \  }}}}} }t        d   d| d| d| xx   dz  cc<   t        j                  d|||| j                         |j                         |       t        j                  |       }t        |      \  }}|xr |xr t        ||||      }	t               rt        j                  | |f|      gng }
|	r't        d      rt        j                   |
|| |gdd	       t"        j$                  j'                  |      }|rSt)        |d
      rF ||||fi t+        |t,              D ]*  }t/        j0                  |
f| |f|dt3        |||||       , t5        d|
| |g|      S )Nr   r   r   zaten._int_mm_r   r,   zTTuned aten._int_mm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sint_mmTfuseablenon_fuseable)enable_int32r   )r/   rI   int32r   r   r   rH   r   r   r-   r(   r$   aten__int_mmr   r!   r   r   r   r   get_int8_mm_configsr*   r0   rS   r   r   r2   r   )rg   rh   r   rP   rQ   rR   r   r   r   use_cutlassr   int8_mm_configsr   s                rB   tuned_int_mmr   R  s   ")d6U[[#Aq!VT4
 ^}QCq1QC89Q>9HH^			 $$T*K1&9L*W:W2FvqRSUV2WK 6K5L		D$<	01RT  *8466VdD\Dt	
 ii33K@O)&tD%q!
'5LM
 	F ++!4L VQ1f5		 %XwtfMMrD   )r`   ra   r   c                   t        j                  |      }t        ||| |      \  }}}	}}}}
t        |      \  }}t        d   d| d| d|	 xx   dz  cc<   t
        j                  d|||	|j                         |j                         |       |r t        j                  st        j                  swddlm}m} t        ||      r) ||j                  |j                   |j"                  	      }t%               rt&        j)                  | ||f|||
      gng }t+        d|| ||g|      S t%               rt&        j)                  |
||f|||
      gng }t%               ry|
j-                         d   dk(  rc|
j/                         j0                  dk(  rFt        j2                  j4                  r,|j7                  dt8        j)                  |
||f|||
             t:        j<                  j?                  |      }t:        j<                  jA                  |      }|j                         }|r.tC        |      r" ||||	fi tE        |tF        |jH                        D ]]  }tK        jL                  |f|
||f|dtO        ||||	|      dtQ        |j                   ||      tS        d|j                   ||g      d _ tU        ||      r ||||	fi tE        |tF        |jH                        D ]k  }tW        jL                  |f|
||f|tY        d|j/                               dtO        ||||	|      t[        ||      dtQ        |j                   ||      d m |r9t]        ||||	      r+t_        d      r ta        jb                  |||||
g||g d       |r.te        ||||	      r tg        jh                  |||||
g||g d       tk        |||      rtm        jn                  |||
||g||d       t+        d||
||g|      S )Nr   r   zaten.addmm_r   r,   zRTuned aten.addmm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sr   )FixedLayoutr   r   )r`   ra   re   rW   r   r.   )prefix_argsepilogue_fnepilogue_fn_hashr   r   r   )r   r   )r   r   r,   )r`   ra   input_reorderT)r`   ra   has_bias)8r   r   r/   r-   r   r   r   rH   r   r   r   r   r   r   
isinstancer   r   rd   r$   
aten_addmmr   r   r{   r   typetritonautotune_cublasLtinsertaten_bias_addmmr   r   r   r   r*   r0   rS   r   r   r   r2   r.   strr+   r   r#   r4   r(   r!   r   r   r%   r   r   r'   r   r   )rf   rg   rh   r`   ra   r   r   rP   rQ   rR   inp_expandedr   r   r   r   r   r   r   r   r   s                       rB   tuned_addmmr    s\   $$T*K07dCPV0W-Aq!VT41&9L* ^{1#Qqc1#671<7HH\			 ))_-N-N 	Cfk*#}}FLLv{{F %& $%	     	 )'Ct;LfUU !" OOtT*	  	
   	##%a(A-##%**f4""44 	  tT*F%d ! 	
 ..{;JII??LNNE)&1 
 {,CU^^T	
 	F ++)46 VQ1f5	
 *6<<E!$&6eT%R!S	  #4./ #!8%..	  +>>!-tT :!"7,-#0#	 !Aq&9 ,D$7 !" .v||UD I. 	 Aq1(664&#	
 *61a;**4&#	
 VT40##4&	
 %<t4f rD   )r   r   c                   ddl m}  || ||      \  } }}| j                         \  }}|j                         \  }}	|j                         \  }
}t        j                  j
                  j                  ||      }t        j                  j
                  j                  d|z  |
      }|6ddlm}  ||j                         |r|n|j                         ||g|dg      }n	|J d       t               rt        j                  | ||f||      gng }||z  dk7  r6t        ||||      r(t        d      rt!        j"                  ||| ||gd	d	
       t%        d|| ||g|      S )Nr   )r   r   )r   r,   z,out_dtype is ignored if layout is specified.r   sparse_semi_structured_mmTr   ) torch._inductor.select_algorithmr   r   r   rm   rn   guard_equalsr   r   r   rH   r$   aten__sparse_semi_structured_mmr   r(   r!   r   r   r   )rg   	mat1_metarh   r   r   r   m1k1m2r   k2rQ   rP   rR   r   r   s                   rB   tuned_sparse_semi_structured_mmr    ss    @*4DAD)T]]_FB EBMMOEB	%%b"-A	%%a"fb1A~2OO"I(8FF	
  P"PP  !"	 ,00y$'9 1 	
   	
A
 Aq1 ;<66VdD)4tRV	
 %#WtY.Ev rD   c	                    t        | |||      \  }	}
}}} }t        d   d|	 d|
 d| xx   dz  cc<   t        j                  d|	|
|| j	                         |j	                         |       t        j                  |       }t        | |       t        ||      \  }}|s| |||f}nt        |      }| ||||f}t        j                  ||||      }g }t               r|j                  |       |j                  t        j                  k7  rt!        d|||      S t#        |      \  }}t$        j&                  j)                  |      }t$        j&                  j+                  |      }|rt-        |d	
      r|rTt/        |j1                               t/        |j1                               dz   k(  rt3        t4        j6                     |d      }n|}t/        |j1                               dk(  st/        |j1                               dk(  rt/        |j1                               t/        |j1                               k(  sJ t3        t4        j6                     t3        t4        j6                     |d      d      }t3        t4        j6                     t3        t4        j6                     |d      d      }n|}|}|r
| ||||f}d}n| |||f}d}t9        | |      rX|sV ||	|
|      D ]I  }t;        ||	|
|||||d		      }t=        j>                  |f||tA        d| jC                               d| K  ||	|
|      D ]  }t$        jD                  jF                  jI                  tK        jL                  |d            rAtO               r>t$        jD                  jF                  jI                  tK        jP                  |d            rt;        ||	|
|||||      }tS        j>                  |f||d||tU               dd  |r2tW        ||	|
|      r$tY        d      rt[        j\                  ||||       |r%t_        ||	|
|      rta        jb                  |||       t!        d|||      S )a9  
    Performs an optimized matrix multiplication where scaling factors are applied
    to the inputs and/or output.

    Args:
        mat1 (Tensor): First input matrix
        mat2 (Tensor): Second input matrix
        scale1 (Tensor): Scale factor applied to mat1 (supports broadcasting)
        scale2 (Tensor): Scale factor applied to mat2 (supports broadcasting)
        bias (Tensor, optional): Optional bias tensor to add to the result
        layout: Layout hint for optimization

    Returns:
        Tensor: The result of the scaled matrix multiplication
    r   r   zaten._scaled_mm.default_r   r,   z_Tuned aten._scaled_mm.default: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%s)r   use_fast_accum	scaled_mmT)enable_float8r      r   )
device_tmar   r          r   r5   )suffix_argsr   r   )r  )2r/   r   r   r   rH   r   r   r   r   aten__fp8_mmr   r$   r   r   rI   r   r   r-   r   r   get_scaled_mm_configs get_scaled_persistent_mm_configsr*   r   r   Laten	unsqueezer+   r6   scaled_mm_device_tma_templater   r#   r   rm   rn   guard_or_falser   Ler]   Ltr   r5   r(   r!   r   r   r%   r   r   )r|   r   scale_ascale_bbiasscale_resultr   r  r   rP   rQ   rR   r   scale_a_realscale_b_realr   	bias_realaten_choicer   r   r   scaled_mm_configsscaled_persistent_mm_configstriton_biastriton_scale_atriton_scale_btriton_input_nodesr  r   kwargss                                 rB   tuned_scaled_mmr4  O  sO   6 %,uVy%!Aq!VUE ^7s!A3asCDIDHHi			 $$U+KUE*!/!AL, e\<@"4(	e\<K##Vy $ K G{# }}%(g{FSS&v.MAz		77D#$99#M#M$  )&EC()S-AA-EEDNN+D!4KKw!"a'3w/?/?/A+Ba+Gw'')*c'2B2B2D.EEEEt~~.q/@!/LaPNt~~.q/@!/LaPN$N$N" K"'!OK #5%06q!Q? *"#
 .AA	 2!"7,-$//1#		 	. (1a0 	Fww..uxx2? | 0 0 ? ?B P&1a'>F ++. 	
 (-/!4	2 	 Aq1,66)		
 *61a;**7FKH$[';OOrD   indexc                 f    t         j                  j                  | xs d      }|j                  dk  S )Nr      )rI   rW   rY   r[   )r5  propss     rB   _is_sm7x_or_older_gpur9    s)    JJ,,UZa8E;;!rD   c                 &    t        d | D              S )Nc              3   <   K   | ]  }t        |t                y wr@   )r   int)r   dims     rB   r   zdims_are_int.<locals>.<genexpr>  s     4z#s#4s   )all)dimss    rB   dims_are_intr@    s    4t444rD   r   c           	          t        | ||||      \  }}}t        |||g      sy t        | |      \  }}fd}d } ||||| |||      }t        ||||||	      }|
|j	                  |
|      S |j                         S )Nc                 V   t               }|j                  d|        |j                  d|       |j                  d|       |j                  d|j                  j                  d       |j                  d|j                  j                  d       t	        |d|       t	        |d	|       |j                  d
|j                  j                         d       |j                  d|j                  j                         d       dk(  r t        ||j                  j                         |S )NrP   rR   rQ   
mat1_dtypeT)is_categorical
mat2_dtyperg   rh   mat1_iscontigmat2_iscontigr7   )r   add_featurer   r   r   is_contiguousr	   )	rP   rR   rQ   rg   rh   mat1_stridemat2_stridecontextr8   s	           rB   get_contextz%mm_autoheuristic.<locals>.get_context  s   +C#C#C#L$++*;*;DQL$++*;*;DQGV[9GV[9T[[668 	 	
 	T[[668 	 	
 4<"7DKK,=,=>rD   c                       y r@   rO   rO   rD   rB   fallbackz"mm_autoheuristic.<locals>.fallback+  s    rD   )rO  r   r   rL  r8   augment_contextprecondition)r   )get_size_hintsr@  get_size_hints_stridesr   get_top_k_choices_callerget_choice_caller)rg   rh   rP   rQ   rR   r   r8   r   opsrQ  r   r   rJ  rK  rM  rO  rL  autoheuristics         `           rB   r   r     s     T4Aq1GAq!Aq	"5dDAK& !Q4{KHG0!M 55? 6 
 	
 **,,rD   c                    t        |t              rt        |t              s^t        j                  j                  j                  | j                         t        j                  j                  j                        \  }}t        |t              rt        |t              s^t        j                  j                  j                  |j                         t        j                  j                  j                        \  }}|||fS )NrO  )r   r<  r   rm   rn   
size_hintsr   rI   r   r   unbacked_symint_fallback)rg   rh   rP   rQ   rR   s        rB   rR  rR  B  s    aZ3%7!!,,MMO__++DD - 
A
 aZ3%7!!,,MMO__++DD - 
A a7NrD   c                 d   | j                   j                  }|j                   j                  }||g}g }|D ]p  }t        |t              sMt        j
                  j                  j                  |t        j                  j                  j                        }|j                  |       r |d   |d   fS )NrY  r   r,   )r   rc   r   r<  r   rm   rn   rZ  rI   r   r   r[  r   )rg   rh   rJ  rK  stridesstrides_hintsrc   s          rB   rS  rS  Q  s    ++$$K++$$KK(GM %&#&WW%%00//HH 1 F 	V$% ]1---rD   )rT   N)NNNFN)NN)~r   loggingtypingr   r   r   rI   torch._dynamo.utilsr   +torch._inductor.autoheuristic.autoheuristicr   1torch._inductor.autoheuristic.autoheuristic_utilsr   r   r	   r
   )torch._inductor.codegen.cpp_gemm_templater   torch._inductor.virtualizedr   "torch.fx.experimental.proxy_tensorr   torch.torch_versionr    r   r   r   codegen.cuda.gemm_templater   r   ,codegen.rocm.ck_tile_universal_gemm_templater   'codegen.rocm.ck_universal_gemm_templater   codegen.subgraphr   r   r   loweringr   r   r   r  r   select_algorithmr   r   r   r    utilsr!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   	mm_commonr-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r   __version__triton_version
has_tritonImportError	getLogger__name__r   rV  r  primsversionhipr   r   load_scalesapply_scalingr  r!  cacherC   r7   r   re   defaultr   _int_mmr   _sparse_semi_structured_mmr  
_scaled_mmr_   r  rM   rS   	lru_cacherw   r]   ri   r   r  r   r   r   r  r  r4  r<  r9  r@  r   rR  rS  rO   rD   rB   <module>r     s         ( T  F ) 6 , , U M D / *       !&"4"45NJ
 g!yy~~				T MM%.G*CQG	XFP (,"sZx ,		~B H FJ
Z !/		#m3!  " " UXX|
4	KKdjj.@.@
 "%--1BC"4	$$$#  "	*8K8K
8
 )D ) ) (,11 I4 %Z6# 4775#' wJ 6wJt 4<<T:'+ ,N ;,N^ 4::48*+!D U 9Up 422M(,T- N-` doo--/F G 4??**E 
hP FhPV # 4  
5  :- C=:-z.S(  !'*NJs   >L   L32L3