
    rh5c                        d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ ej"                  j%                  ej"                  j'                  e            Zej"                  j+                  ed      gZej"                  j+                  ed	      Zd
gZ e j2                         d        Z e j2                         d        Z G d de      Zd ZddddddZddddddZ dZ!d Z" G d d      Z# e$d  e%d      D              Z&de&d<   de&d <   d e&d<   d! Z'd" Z( G d# d$e      Z) G d% d&e      Z*y)'    N)Path)knobs)compile_module_from_src)_allocation)	GPUTarget)	GPUDriverincludelibcudac            	         t         j                  j                  x} r| gS t        j                  ddg      j                         }|j                         D cg c]  }d|v s|j                         d    }}|D cg c]!  }t        j                  j                  |      # }}t        j                  d      }|r^|s\|j                  d      D cg c]B  }t        j                  j                  t        j                  j                  |d            sA|D }}d}|r|dt        |      z  z  }|d	z  }n
|d
z  }|dz  }t        d |D              sJ |       |S c c}w c c}w c c}w )Nz/sbin/ldconfigz-plibcuda.so.1LD_LIBRARY_PATH:zlibcuda.so cannot found!
z!Possible files are located at %s.z:Please create a symlink of libcuda.so to any of the files.z<Please make sure GPU is set up and then run "/sbin/ldconfig"z- (requires sudo) to refresh the linker cache.c              3      K   | ]A  }t         j                  j                  t         j                  j                  |d              C yw)r   N)ospathexistsjoin).0r   s     p/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/triton/backends/nvidia/driver.py	<genexpr>zlibcuda_dirs.<locals>.<genexpr>(   s,     Sdrww~~bggll4@ASs   AA	)r   nvidialibcuda_path
subprocesscheck_outputdecode
splitlinessplitr   r   dirnamegetenvr   r   strany)	env_libcuda_pathlibslinelocslocdirsenv_ld_library_pathdirmsgs	            r   libcuda_dirsr-      sR    <<4444 !!""$4d#;<CCED *.):UnPT>TDJJLUDU,01SBGGOOC 1D1))$564288=sPRPWPWP\P\]`bpPqArss
&C2SY>>KKMM>>SdSSXUXXSK V1 ts   	E!E<&EAE#E#c                  $    t         gt               S N)libdevice_dirr-        r   library_dirsr3   ,   s    +LN++r2   c                   $     e Zd Z fdZd Z xZS )	CudaUtilsc                 d    t        | d      st        t        |   |       | _        | j                  S )Ninstance)hasattrsuperr5   __new__r7   )cls	__class__s    r   r:   zCudaUtils.__new__8   s*    sJ' C8=CL||r2   c                 `   t        t        t        j                  j	                  t
        d            j                         dt               t        t              }|j                  | _
        |j                  | _        |j                  | _        |j                  | _        |j                  | _        y )Nzdriver.c
cuda_utilssrcnamer3   include_dirs	libraries)r   r   r   r   r   r    	read_textr3   rB   rC   load_binaryget_device_propertiescuOccupancyMaxActiveClustersset_printf_fifo_sizefill_tma_descriptor)selfmods     r   __init__zCudaUtils.__init__=   s    %RWW\\':67AAC%%
 ??%(%>%>",/,L,L)$'$<$<!#&#:#: r2   )__name__
__module____qualname__r:   rL   __classcell__r<   s   @r   r5   r5   6   s    
;r2   r5   c                     | d   dk(  ry| j                  d      ryi dddd	d
ddddddddddddddddddddddddddd|    S )Nr   *CUdeviceptr
tensordescCUtensorMapi1int32_ti8int8_ti16int16_ti32i64int64_tu1uint32_tu8uint8_tu16uint16_tu32u64uint64_tfp16doublebf16fp32f32fp64	nvTmaDesc)
startswith)tys    r   	ty_to_cpprr   Q   s    	!u|	}}\"ih 	y 	y	
 	y 	j 	i 	z 	z 	z 	 	 	 	x 	  	]!" 	#
 
r2   re   ra   rh   )ri   rk   rl   rm   rn   	pack_fp16	pack_bf16	pack_fp32	pack_fp64iiiKKppOOOOOc                    fd}fdfdfd ||j                               }t        |      D ci c]  \  }}||
 }}}dj                  |j                         D cg c]
  } |       c}      }t        |z   }	g }
|j                         D ]  } ||
        t        |
      D ci c]  \  }}||
 }}}t	        |      dkD  r)ddj                  d |j                         D              z   nd}g }|j                         D ]P  \  }}|d	k(  r|t        v r|j                  t        |    d
|        2|j                  t        |       d
|        R dj                  |      }g }|j                         D ]u  \  }}|d   dk(  r|j                  d| d       $|t        v r|j                  d| d       B|dk(  r|j                  d|        \|d	k7  sb|j                  d|        w t        t	        |            }d}|j                         D cg c]  \  }}|d   dk(  rd| d| d| d| d	 }}}|j                         D cg c]  \  }}|dk(  rd| d| d| d }}}|j                         D cg c])  \  }}|t        v rt        |    d| dt        |    d| d+ }}}|j                         D cg c]  \  }}|d	k7  sd|  }}}|j                  d        d!t	        |      dkD  rd|z   nd d"dj                  |       d#|j                  |j                         D cg c]  \  }} |       d| d$ c}}       d%|	 d&| d'|j                  |       d|j                  |       d|j                  |       d(t	        |      dkD  rddj                  |      z   nd d)}|S c c}}w c c}w c c}}w c c}}w c c}}w c c}}w c c}}w c c}}w )*Nc                 ~   g }d}| D ]  }t        |t              r|j                  d      r
r
|   nd }|dz  }t        j                  d|      }|j                  d      }|j                  d      }|j                  d      dz   }|9|j                  d|z          t        d|z        D ]  }	|j                  d        n|j                  d	       t        |      D ]  }	|j                  d
        t        |      D ]  }	|j                  d        |j                  |       " 
r|t        
      k(  sJ |S )Nr   rU      ztensordesc<([^[>]*)\[([^]]*)\]   ,rS   r^   ro   r]   )

isinstancer"   rp   rematchgroupcountappendrangelen)	signatureoutputtensordesc_idxsigmetar   dtypeshapendim_tensordesc_metas             r   _expand_signaturez(make_launcher.<locals>._expand_signature~   s9     	#C#s#|(D:I~6t!#!CSIAA{{3'!+<MM#+. #1t8_ -e,- MM+.t )AMM%()t )AMM%() c"7	#: #nO8L&LLLr2   c                 j    t        | t              r| D ]  } ||        y |j                  |        y r/   )r}   tupler   )r   r   x_flatten_signatures      r   r   z)make_launcher.<locals>._flatten_signature   s4    c5! ."1f-. MM#r2   c                     t        | t              r!dj                  t        |             }d| dS | d   dk(  ry| dv ryt	        |       S )Nr|   []r   rS   z	PyObject*	constexprro   )r}   r   r   maprr   )rq   val_extracted_types     r   r   z&make_launcher.<locals>._extracted_type   sT    b% ((334Cse1:a5C<++}r2   c                     t        | t              r!dj                  t        |             }d| dS | d   dk(  ry| dv ry| j	                  d      ryd	d
ddddddddd
t        |          S )N ()r   rS   Or   rU   dlbhiLBHIK)
rj   longrZ   r\   rX   r_   rc   re   ra   rh   )r}   r   r   r   rp   rr   )rq   r   	format_ofs     r   r   z make_launcher.<locals>.format_of   s    b% ''#i,-Cse1:a5C<++==&
 B- 	r2   r   r   z, c              3   ,   K   | ]  \  }}d |   yw)z&_argNr1   )r   r   rq   s      r   r   z make_launcher.<locals>.<genexpr>   s      LB5 Ls   r   z argrS   ptr_infoz.dev_ptr_arg_storagero   z*tma_ptrz
  zDevicePtrInfo ptr_infoz = getPointer(_argz); if (!ptr_infoz.valid) return NULL;zCUtensorMap* tma_ptrz = getTmaDesc(_argz); if (!tma_ptrz) return NULL;z _argz_storage = z(_argz);z&argz&global_scratchaR  
#include "cuda.h"
#include <stdbool.h>
#include <Python.h>
#include <dlfcn.h>

static inline void gpuAssert(CUresult code, const char *file, int line)
{
   if (code != CUDA_SUCCESS)
   {
      const char* prefix = "Triton Error [CUDA]: ";
      const char* str;
      cuGetErrorString(code, &str);
      char err[1024] = {0};
      strcat(err, prefix);
      strcat(err, str);
      PyGILState_STATE gil_state;
      gil_state = PyGILState_Ensure();
      PyErr_SetString(PyExc_RuntimeError, err);
      PyGILState_Release(gil_state);
   }
}

#define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }

typedef CUresult (*cuLaunchKernelEx_t)(const CUlaunchConfig* config, CUfunction f, void** kernelParams, void** extra);

static cuLaunchKernelEx_t getLaunchKernelExHandle() {
  // Open the shared library
  void* handle = dlopen("libcuda.so.1", RTLD_LAZY);
  if (!handle) {
    PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so.1");
    return NULL;
  }
  // Clear any existing error
  dlerror();
  cuLaunchKernelEx_t cuLaunchKernelExHandle = (cuLaunchKernelEx_t)dlsym(handle, "cuLaunchKernelEx");
  // Check for errors
  const char *dlsym_error = dlerror();
  if (dlsym_error) {
    PyErr_SetString(PyExc_RuntimeError, "Failed to retrieve cuLaunchKernelEx from libcuda.so.1");
    return NULL;
  }
  return cuLaunchKernelExHandle;
}

static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int launch_cooperative_grid, int launch_pdl, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, CUstream stream, CUfunction function, CUdeviceptr global_scratchz) {
  void *params[] = { a,   };
  if (gridX*gridY*gridZ > 0) {
    // 4 attributes that we can currently pass maxmimum
    CUlaunchAttribute launchAttr[4];
    static cuLaunchKernelEx_t cuLaunchKernelExHandle = NULL;
    if (cuLaunchKernelExHandle == NULL) {
      cuLaunchKernelExHandle = getLaunchKernelExHandle();
    }
    CUlaunchConfig config;
    config.gridDimX = gridX;
    config.gridDimY = gridY;
    config.gridDimZ = gridZ;

    if (num_ctas != 1) {
      config.gridDimX *= clusterDimX;
      config.gridDimY *= clusterDimY;
      config.gridDimZ *= clusterDimZ;
    }

    config.blockDimX = 32 * num_warps;
    config.blockDimY = 1;
    config.blockDimZ = 1;
    config.sharedMemBytes = shared_memory;
    config.hStream = stream;
    config.attrs = launchAttr;
    int num_attrs = 0;

    if (launch_pdl != 0) {
      CUlaunchAttribute pdlAttr = { .id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION, .value = 1};
      launchAttr[num_attrs] = pdlAttr;
      ++num_attrs;
    }

    if (launch_cooperative_grid != 0) {
      CUlaunchAttribute coopAttr = { .id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE, .value = 1};
      launchAttr[num_attrs] = coopAttr;
      ++num_attrs;
    }

    if (num_ctas != 1) {
      CUlaunchAttribute clusterAttr = {};
      clusterAttr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
      clusterAttr.value.clusterDim.x = clusterDimX;
      clusterAttr.value.clusterDim.y = clusterDimY;
      clusterAttr.value.clusterDim.z = clusterDimZ;
      launchAttr[num_attrs] = clusterAttr;
      ++num_attrs;

      CUlaunchAttribute clusterSchedulingAttr = {};
      clusterSchedulingAttr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
      clusterSchedulingAttr.value.clusterSchedulingPolicyPreference = CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
      launchAttr[num_attrs] = clusterSchedulingAttr;
      ++num_attrs;
    }

    config.numAttrs = num_attrs;

    CUDA_CHECK(cuLaunchKernelExHandle(&config, function, params, 0));
  }
}

typedef struct _DevicePtrInfo {
    CUdeviceptr dev_ptr;
    bool valid;
} DevicePtrInfo;

static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {
  DevicePtrInfo ptr_info;
  ptr_info.dev_ptr = 0;
  ptr_info.valid = true;
  if (PyLong_Check(obj)) {
    ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(obj);
    return ptr_info;
  }
  if (obj == Py_None) {
    // valid nullptr
    return ptr_info;
  }
  PyObject *ptr = PyObject_GetAttrString(obj, "data_ptr");
  if(ptr){
    PyObject *empty_tuple = PyTuple_New(0);
    PyObject *ret = PyObject_Call(ptr, empty_tuple, NULL);
    Py_DECREF(empty_tuple);
    Py_DECREF(ptr);
    if (!PyLong_Check(ret)) {
      PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
      ptr_info.valid = false;
      return ptr_info;
    }
    ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(ret);
    if(!ptr_info.dev_ptr)
      return ptr_info;
    uint64_t dev_ptr;
    int status = cuPointerGetAttribute(&dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr);
    if (status == CUDA_ERROR_INVALID_VALUE) {
        PyErr_Format(PyExc_ValueError,
                     "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx);
        ptr_info.valid = false;
    } else if (status != CUDA_SUCCESS) {
        CUDA_CHECK(status);  // Catch any other cuda API errors
        ptr_info.valid = false;
    }
    ptr_info.dev_ptr = dev_ptr;
    Py_DECREF(ret);  // Thanks ChatGPT!
    return ptr_info;
  }
  PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
  ptr_info.valid = false;
  return ptr_info;
}

static inline CUtensorMap* getTmaDesc(PyObject *obj) {
  if (sizeof(CUtensorMap*) != 8) {
    PyErr_SetString(PyExc_SystemError, "getTmaDesc() requires 64-bit compilation");
    return NULL;
  }

  PyObject *method_handle = PyObject_GetAttrString(obj, "tma_desc_cpu_ptr");
  if (!method_handle) {
    PyErr_SetString(PyExc_TypeError, "tma_desc_cpu_ptr() method does not exist");
    return NULL;
  }

  PyObject *empty_tuple = PyTuple_New(0);
  if (!empty_tuple) {
    Py_DECREF(method_handle);
    PyErr_SetString(PyExc_SystemError, "Internal Python error!");
    return NULL;
  }
  PyObject *method_ret = PyObject_Call(method_handle, empty_tuple, NULL);
  Py_DECREF(empty_tuple);
  Py_DECREF(method_handle);
  if (!method_ret) {
    PyErr_SetString(PyExc_SystemError, "Internal Python error!");
    return NULL;
  }

  if (!PyLong_Check(method_ret)) {
    PyErr_SetString(PyExc_TypeError, "tma_desc_cpu_ptr() must return 64-bit int");
    Py_DECREF(method_ret);
    return NULL;
  }

  uint64_t ptr_as_uint = PyLong_AsUnsignedLongLong(method_ret);
  Py_DECREF(method_ret);
  if (!ptr_as_uint) {
    PyErr_SetString(PyExc_ValueError, "received NULL ptr from tma_desc_cpu_ptr()");
    return NULL;
  }
  if (ptr_as_uint % 64 != 0) {
    PyErr_SetString(PyExc_ValueError, "tma_desc_cpu_ptr() must be 64-byte aligned");
    return NULL;
  }

  return (CUtensorMap*)(ptr_as_uint);
}

static void ensureCudaContext() {
  CUcontext pctx;
  CUDA_CHECK(cuCtxGetCurrent(&pctx));
  if (!pctx) {
    // Ensure device context.
    CUdevice device;
    CUDA_CHECK(cuDeviceGet(&device, 0));
    CUDA_CHECK(cuDevicePrimaryCtxRetain(&pctx, device));
    CUDA_CHECK(cuCtxSetCurrent(pctx));
  }
}

static uint16_t pack_fp16(double f) {
    uint16_t result;
    // from https://github.com/python/pythoncapi-compat
#if 0x030600B1 <= PY_VERSION_HEX && PY_VERSION_HEX <= 0x030B00A1 && !defined(PYPY_VERSION)
    _PyFloat_Pack2(f, (unsigned char*)&result, 1);
#else
    PyFloat_Pack2(f, (unsigned char*)&result, 1);
#endif
    return result;
}

static uint16_t pack_bf16(double f) {
    float f32 = (float)f;
    uint32_t u32 = *(uint32_t*)&f32;
    return (uint16_t)(u32 >> 16);
}

static uint32_t pack_fp32(double f) {
    float f32 = (float)f;
    return *(uint32_t*)&f32;
}

static uint64_t pack_fp64(double f) {
    return *(uint64_t*)&f;
}

static PyObject* launch(PyObject* self, PyObject* args) {
  // ensure cuda context is valid before calling any CUDA APIs, e.g. before getPointer calls cuPointerGetAttributes
  ensureCudaContext();

  int gridX, gridY, gridZ;
  uint64_t _stream;
  uint64_t _function;
  int launch_cooperative_grid;
  int launch_pdl;
  PyObject *launch_enter_hook = NULL;
  PyObject *launch_exit_hook = NULL;
  PyObject *kernel_metadata = NULL;
  PyObject *launch_metadata = NULL;
  PyObject *global_scratch_obj = NULL;
  ;z
  if(!PyArg_ParseTuple(args, "a7  ", &gridX, &gridY, &gridZ,
                                           &_stream, &_function, &launch_cooperative_grid, &launch_pdl, &global_scratch_obj,
                                           &kernel_metadata, &launch_metadata,
                                           &launch_enter_hook, &launch_exit_hooka  )) {
    return NULL;
  }

  int num_warps, num_ctas, shared_memory, clusterDimX, clusterDimY, clusterDimZ;
  if (!PyArg_ParseTuple(kernel_metadata, "iiiiii", &num_warps, &num_ctas, &shared_memory, &clusterDimX, &clusterDimY, &clusterDimZ)) {
    PyErr_SetString(PyExc_TypeError, "kernel_metadata must be a tuple");
    return NULL;
  }

  // extract launch metadata
  if (launch_enter_hook != Py_None){
    PyObject* args = Py_BuildValue("(O)", launch_metadata);
    PyObject* ret = PyObject_CallObject(launch_enter_hook, args);
    Py_DECREF(args);
    if (!ret)
      return NULL;
    Py_DECREF(ret);
  }

  CUdeviceptr global_scratch = 0;
  if (global_scratch_obj != Py_None) {
    DevicePtrInfo global_scratch_info = getPointer(global_scratch_obj, -1);
    if (!global_scratch_info.valid) {
      return NULL;
    }
    global_scratch = global_scratch_info.dev_ptr;
  }

  // raise exception asap
  z
  Py_BEGIN_ALLOW_THREADS;
  _launch(gridX, gridY, gridZ, num_warps, num_ctas, launch_cooperative_grid, launch_pdl, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (CUstream)_stream, (CUfunction)_function, global_scratchaC  );
  Py_END_ALLOW_THREADS;
  if (PyErr_Occurred()) {
    return NULL;
  }

  if(launch_exit_hook != Py_None){
    PyObject* args = Py_BuildValue("(O)", launch_metadata);
    PyObject* ret = PyObject_CallObject(launch_exit_hook, args);
    Py_DECREF(args);
    if (!ret)
      return NULL;
    Py_DECREF(ret);
  }

  Py_RETURN_NONE;
}

static PyMethodDef ModuleMethods[] = {
  {"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"},
  {NULL, NULL, 0, NULL} // sentinel
};

static struct PyModuleDef ModuleDef = {
  PyModuleDef_HEAD_INIT,
  "__triton_launcher",
  NULL, //documentation
  -1, //size
  ModuleMethods
};

PyMODINIT_FUNC PyInit___triton_launcher(void) {
  PyObject *m = PyModule_Create(&ModuleDef);
  if(m == NULL) {
    return NULL;
  }
  PyModule_AddFunctions(m, ModuleMethods);
  return m;
}
)values	enumerater   _BASE_ARGS_FORMATr   itemsFLOAT_STORAGE_TYPEr   rr   r   FLOAT_PACK_FUNCTION)	constantsr   r   r   expand_signaturer   srq   args_formatformatflat_signaturer   	args_listarg_decl_list	arg_declsinternal_args_listparamsnewline	ptr_decls	tma_declsfloat_storage_declsr@   r   r   r   s     `                   @@@r   make_launcherr   |   s	   $L. ))9)9);<"+,<"=>$!QA>I>''93C3C3EFR9R=FGK,FN! 03/0"+N";<$!QA<I<PST]P^abPbtyy L)//:K LLLhjI M" <2##  $6r$:#;4s!CD  IbM?$qc!:;< 		-(I" 	22a5C<%%8&<=%%%%QCx&89;%%n5;%%QCj1	2 3y>"F G __&Ara5C< !#5aS1#=MaSPdeI  foetetev\a\]_a qc!3A3oaSWI  __&Ar## b!
"%s+6I"6M5NeTUSVVXY 
 '0oo&7MUQ2;LQCjMFM
MM#$.F\ [^  _h  [i  lm  [m  GK  NW  GW  su  Fv vyy() Q*b <<	@QRuq"OB'(aS2RST U  &x (Q R[P[ \< <<	 
<<	 
<<#$% &H qt  uG  qH  KL  qL  IM  PT  PY  PY  Zl  Pm  Im  RT  HU 'UM	MC\
 JE ?F =8

 ND Ss5   O*O7O)!O O".O(O.#O.O4c                       e Zd ZdZd Zd Zy)TmaDescKernelParam   c                 f    dd l }|j                  | j                  |j                  d      | _        y )Nr   cpur   device)torchemptyTMA_DESC_SIZEuint8descrJ   r   s     r   rL   zTmaDescKernelParam.__init__V  s%    KK 2 2%++eKT	r2   c                 6    | j                   j                         S r/   )r   data_ptr)rJ   s    r   tma_desc_cpu_ptrz#TmaDescKernelParam.tma_desc_cpu_ptr[  s    yy!!##r2   N)rM   rN   rO   r   rL   r   r1   r2   r   r   r   S  s    MU
$r2   r   c              #   $   K   | ]  }||f 
 y wr/   r1   )r   r   s     r   r   r   `  s     :1A:s      
      	   c           
      6   |=| j                   g| j                  | j                  | j                  | j                  S |d   }|d   }|d   }|d   }|d   }| j                   j                         }| j                  }| j                  }	|	d   dk(  sJ t	               }
|
g||	}|rt        |      }|dxx   dz  cc<   t        j                  j                  j                  j                  j                  |
j                         |||t        |   |||	       |S )	Nswizzle	elem_size	elem_type
block_size
fp4_paddedr   rz   r{   )baser   stridesr   r   listtritonruntimedriveractiveutilsrI   r   TMA_DTYPE_DEVICE_TO_HOST)argmetadatar   r   r   r   r   r   r   r   r   results               r   make_tensordesc_argr   f  s'    M399Ms{{MSYYMMMy!G%I%I,'J,'Jxx  "HIIEkkG2;!D%U%W%FUb	Q	
NN  &&:: +	 Mr2   c                 6     ddl m ddlm  fd}|S )Nr   )TensorDescriptorc                  D   | d t        t               }| t        t              d  }d}g }t        |      D ]N  \  }}t        |	f      r*r|   nd }|dz  }|j	                  t        ||             >|j                  |       P r|t              k(  sJ  
g || S )Nr   rz   )r   r   r   r}   extendr   r   )args	meta_argsraw_kernel_argsr   
final_argsr   r   r   GluonTensorDescriptorr   launcherr   s           r   innerz%wrap_handle_tensordesc.<locals>.inner  s    0#/01	s#4567
0 	'FAs# 02GHI:I~6t!#!!"5c4"@A!!#&	' #nO8L&LLL00Z00r2   )triton.tools.tensor_descriptorr   'triton.experimental.gluon.nvidia.hopper)r   r   r   r   r   s   `` @@r   wrap_handle_tensordescr     s    ?a1 Lr2   c                       e Zd Zd Zd Zy)CudaLauncherc                    t        d      rj                  n	t               }fd}|j                         D ci c]  \  }} ||      | }}}j                  j                         D ci c]  \  }}||
 }}}t        |dd       }t        |||      t        dt               t        t              }	t        d |j                         D              }
t        j                  t        j                   |j"                  d      | _        |
rt'        |	j(                  |      n|	j(                  | _        |j*                  | _        |j,                  | _        |j.                  | _        |j0                  | _        y c c}}w c c}}w )Nr   c                 t    t        | t              r&j                  j                  j	                  |       fS | S r/   )r}   r"   fn	arg_namesindex)r   r@   s    r   <lambda>z'CudaLauncher.__init__.<locals>.<lambda>  s-    Z3=OSVV--33A69 UV r2   r   __triton_launcherr?   c              3   b   K   | ]'  }t        |t              xr |j                  d        ) yw)rU   N)r}   r"   rp   )r   r   s     r   r   z(CudaLauncher.__init__.<locals>.<genexpr>  s)     !v\_*S#"6"W3>>,;W"W!vs   -/rz   )r8   r   dictr   r   getattrr   r   r3   rB   rC   r#   r   	functoolsreduceoperatormulcluster_dimsnum_ctasr   launchglobal_scratch_sizeglobal_scratch_alignlaunch_cooperative_grid
launch_pdl)rJ   r@   r   r   arg_idxidxvaluer   r   rK   has_tensor_desc_args    `         r   rL   zCudaLauncher.__init__  s@   %,S+%>CMMDF	V;D??;LMZS%WS\5(M	M25--2E2E2GHJCS%ZH	H!(,=tDIy/B%$%%
 "!vclcscscu!vv!((x7L7LaPM`,SZZIfifpfp#+#?#? $,$A$A!'/'G'G$"--% NHs   E;/Fc           
         | j                   dkD  rF||z  |z  }|| j                  z  | j                   z  }t        j                  || j                  |      }	nd }	 | j
                  |||||| j                  | j                  |	g|  y Nr   )r  r  r   
_allocatorr  r  r  r  )
rJ   gridXgridYgridZstreamfunctionr   	grid_size
alloc_sizeglobal_scratchs
             r   __call__zCudaLauncher.__call__  s    ##a'-I"T]]2T5M5MMJ(33J@Y@Y[abN!NE5%4;W;WY]YhYh"	+%)	+r2   N)rM   rN   rO   rL   r$  r1   r2   r   r   r     s    ..+r2   r   c                   R     e Zd Z fdZd Zd Zd Zed        Zd Z	d Z
d Z xZS )	
CudaDriverc                 V    t               | _        t        | _        t        |           y r/   )r5   r   r   launcher_clsr9   rL   )rJ   r<   s    r   rL   zCudaDriver.__init__  s    [
(r2   c                 ~    | j                         }| j                  |      }|d   dz  |d   z   }d}t        d||      S )Nr   r   rz       r   )get_current_deviceget_device_capabilityr   )rJ   r   
capability	warp_sizes       r   get_current_targetzCudaDriver.get_current_target  sK    ((*//7
]R'*Q-7
	Y77r2   c                 J    dd l }|j                  d| j                               S )Nr   r   )r   r   r+  r   s     r   get_active_torch_devicez"CudaDriver.get_active_torch_device  s    ||FD$;$;$=>>r2   c                 "    dd l }|j                  S r  )r   r   r   s     r   get_device_interfacezCudaDriver.get_device_interface  s    zzr2   c                      	 dd l } | j                  j                         xr | j                  j                  d u S # t
        $ r Y yw xY w)Nr   F)r   r   is_availableversionhipImportError)r   s    r   	is_activezCudaDriver.is_active  sC    	::**,L%--2C2Ct2KL 		s   7: 	AAc                     ddl m} |S )Nr   )do_bench)triton.testingr;  )rJ   r;  s     r   get_benchmarkerzCudaDriver.get_benchmarker  s
    +r2   c                 b    dd l }d}|j                  t        |dz        |j                  d      S )Nr   i      r   r   )r   r   int)rJ   r   
cache_sizes      r   get_empty_cache_for_benchmarkz(CudaDriver.get_empty_cache_for_benchmark  s.    
 '
{{3zQ/uyy{PPr2   c                 $    |j                          y r/   )zero_)rJ   caches     r   clear_cachezCudaDriver.clear_cache  s    r2   )rM   rN   rO   rL   r/  r1  r3  staticmethodr9  r=  rB  rF  rP   rQ   s   @r   r&  r&    s;    
8?  Qr2   r&  )+r
  r  r   r   r   r~   pathlibr   r   triton.runtime.buildr   triton.runtimer   triton.backends.compilerr   triton.backends.driverr   r   r    realpath__file__r   rB   r0   rC   	lru_cacher-   r3   objectr5   rr   r   r   r   r   r   r  r   r   r   r   r   r&  r1   r2   r   <module>rQ     sr     	   	   8 & . ,
''//"''**84
5Wi01We,H	  . , ,; ;6
4     # Tn	$ 	$  :b	::         %P,!+6 !+H, ,r2   