a
    öDf"                     @   sz  d dl Z ddlmZmZmZmZmZ ddlmZ ddl	m
Z
mZ e je je jgZdd Zd	d
 Zdd Zeeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddge  g de
edddeddd ieejejejejejejejejejejd
dd ZG d!d" d"e jjZejZdS )#    N   )Configautotunecdiv
heuristicsjit)language   )early_config_pruneestimate_matmul_timec                 C   sR   | |u r| S | t v sJ |t v s$J t D ]$}| |u r<|  S ||u r(|   S q(d S N)_ordered_datatypes)abd r   ^/nfs/NAS7/SABIOD/METHODE/ermites/ermites_venv/lib/python3.9/site-packages/triton/ops/matmul.pyget_higher_dtype
   s    r   c                    s    fddS )Nc                    s   |     S r   )Zzero_)nargsnamer   r   <lambda>       zinit_to_zero.<locals>.<lambda>r   r   r   r   r   init_to_zero   s    r   c                  C   s   g } dD ]~}dD ]t}dD ]j}dD ]`}|dkr0dnd}|  t|||dd	||d
 dD ](}|  t||||d	||tdd qVq qqq| S )N)r               )       )r   @   )r   r          r    r   r   r	   BLOCK_MBLOCK_NBLOCK_KSPLIT_K
num_stages	num_warps)r   r      r   C)r)   r*   Zpre_hook)appendr   r   )configsr)   Zblock_mZblock_kZblock_nr*   Zsplit_kr   r   r   get_configs_io_bound   s$    
r/   r!   r"   r   r#   r   r+   r(   r    r   r   )MNK
   )r
   Z
perf_modelZtop_k)r.   keyZprune_configs_byEVEN_Kc                 C   s   | d | d | d   dkS )Nr2   r&   r'   r   r   )argsr   r   r   r   M   r   r   )
dot_out_dtype
allow_tf32fp8_fast_accumr$   r%   r&   GROUP_Mr'   r5   AB_DTYPEc           +      C   s
  t d}t d}t ||}t ||}|| }|| }t|||  |}|| ||  }|| | }|| t d| }|| t d| } t t || ||}!t t | | ||}"|| t d| }#| |!d d d f | |#d d d f |   } ||#d d d f | |"d d d f |	   }t j||f|d}$tdt ||| D ]}%|rvt 	| }&t 	|}'nb||%||   }(t jd|j
jd})t j	| |#d d d f |(k |)d}&t j	||#d d d f |(k |)d}'|r|&|j
j}&|'|j
j}'|rt j|&|'|$||d}$n|$t j|&|'||d7 }$| || | 7 } ||| | 7 }qV|$|j
j}$|| t d| }|| t d| } ||d d d f |
 | d d d f |   }||k d d d f | |k d d d f @ }*|dkrt j||$|*d nt j||$|*d d S )Nr   r	   )dtype)r	   r	   )maskother)Z	out_dtyper8   )r=   )tlZ
program_idr   minZarangeZmax_contiguousZmultiple_ofzerosrangeloadr<   Z
element_tytodotstoreZ
atomic_add)+ABr,   r0   r1   r2   Z	stride_amZ	stride_akZ	stride_bkZ	stride_bnZ	stride_cmZ	stride_cnr7   r8   r9   r$   r%   r&   r:   r'   r5   r;   pidZpid_zZgrid_mZgrid_nwidthZgroup_idZ
group_sizeZpid_mZpid_nZrmZrnZramZrbnZrkacckr   r   Zk_remainingZ_0r=   r   r   r   _kernel.   sR    -

,,
  ,(
rM   c                   @   s.   e Zd ZeZi Zedd ZedddZdS )_matmulc                    s*  | j }| ddkr*| ddkr*|  } |ddkrN|ddkrN| }| jd |jd ksjJ d| j\ }|j\}| jtjtjtjfv s|jtjtjtjfv rt	j
}n2| jt	jfv s|jt	jfv rt	j}nt| j|j}t	j f||d}	|d u r(|t	j
t	jt	jfv r tj}ntj}nJt|t	js>J d|t	j
krRtj
}n |t	jt	jfv rltj}ntj}d}
| jtjtjfv r|jtjtjfv rd}
| jt	jfv r|jt	jfv rd}
 fdd	}t| | ||	 || d| d|d|d|	d|	d|||d
|
d |	S )Nr   r	   zincompatible dimensions)devicer<   z#dot_out_dtype must be a torch.dtypeTFc                    s$   t  | d t | d  | d fS )Nr$   r%   r'   )r   )ZMETAr0   r1   r   r   r      r   z_matmul._call.<locals>.<lambda>r+   )r7   r8   r9   r:   r;   )rO   Zstride
contiguousshaper<   r?   Z
float8e4nvZfloat8e4b15Zfloat8e5torchfloat16Zint8int32r   emptyfloat32bfloat16
isinstancerM   )r   r   r7   r8   r9   rO   r2   _Zc_dtypecZab_dtypeZgridr   rP   r   _call   sV    


( 	z_matmul._callNTc                 C   s   t j|||||dS )N)r7   r8   r9   )rN   r\   )ctxr   r   r7   r8   r9   r   r   r   forward   s    z_matmul.forward)NTT)	__name__
__module____qualname__rM   kernelZ_locksstaticmethodr\   r^   r   r   r   r   rN      s   
3rN   )rS    r   r   r   r   r   r   r?   Zmatmul_perf_modelr
   r   rT   rX   rW   r   r   r   r/   Z	constexprrM   ZautogradZFunctionrN   applymatmulr   r   r   r   <module>   s^   >>