a
    öDf*
                 
   @  sr  d dl mZ d dlmZ d dlmZmZmZmZm	Z	 ddl
mZ ddlmZ ddlmZ e	d	ZG d
d deZddddddZddddddZddddddZdddddddZdddddddZddddd"d#d$d%Zddddd&d'd(Zddddd&d)d*Zddddd&d+d,Zddddd&d-d.Zddddd&d/d0Zdddddd1d2d3Zddddd&d4d5Z dddd"d&d6d7Z!ddddd&d8d9Z"ddddd&d:d;Z#ddddd&d<d=Z$ddddd&d>d?Z%ddddd&d@dAZ&dddBdCdDZ'ddddd&dEdFZ(ddddd&dGdHZ)ddddd&dIdJZ*dddKdLdMZ+ddddNdOdPZ,ddddNdQdRZ-ddSdTdUdVZ.ddddd&dWdXZ/ddddd&dYdZZ0ddddd&d[d\Z1ddddd&d]d^Z2ddddd&d_d`Z3ddddd&dadbZ4dddddcdddeZ5dfddddgdhdiZ6ddfdddjdkdlZ7ddfdddmdndoZ8ddfdddmdpdqZ9dddddrdsdtZ:ddddddudvdwZ;ddddNdxdyZ<ddfdddzd{d|Z=ddddd#d}d~Z>dddddddZ?dddddddZ@dd ZAdd ZBdd ZCdd ZDdd ZEdd ZFdd ZGdd ZHdd ZIdddddddddd	ddZJdd ZKdd ZLddddddddddZMddddddddddZNdddddddddZOddddddddddZPddddddddddZQddddddddddZRddddddddddZSddddddddddZTddddddddddZUddddddddddZVddddZWddddZXdddddddddddZYddddddddZZdddddŜddǄZ[dddddŜddɄZ\dd	d˜dd̈́Z]dddddΜddЄZ^e]ddgdӍddddԜddքZ_e]ddgdӍddddԜdd؄Z`e]ddgdӍddddԜddڄZae]ddgdӍddddԜdd܄Zbe]ddgdӍddddԜddބZce]ddgdӍddddԜddZdddddԜddZeddfddddZfddfddddZgddfddddZhdddddZidddddddZjdddddddddZkdd ZldddZmddddddZnddddddZodS (       )annotationswraps)ListOptionalSequenceTupleTypeVar   )iris_hip   )coreTc                      s   e Zd Z fddZ  ZS )IncompatibleTypeErrorImplc                   s@   || _ || _d| j   d | j  | _tt| | j d S )Nzinvalid operands of type  and )type_atype_b__repr__messagesuperr   __init__)selfr   r   	__class__ e/nfs/NAS7/SABIOD/METHODE/ermites/ermites_venv/lib/python3.9/site-packages/triton/language/semantic.pyr      s    z"IncompatibleTypeErrorImpl.__init__)__name__
__module____qualname__r   __classcell__r   r   r   r   r      s   r   intz
ir.builderz	tl.tensor)axisbuilderreturnc                 C  s*   | dvrt d|  t|| tjS )Nr   r   r
   z+program_id axis must be 0, 1, or 2 but got )
ValueErrortltensorZcreate_get_program_idint32r#   r$   r   r   r   
program_id   s    r,   c                 C  s*   | dvrt d|  t|| tjS )Nr&   z-num_programs axis must be 0, 1, or 2 but got )r'   r(   r)   Zcreate_get_num_programsr*   r+   r   r   r   num_programs!   s    r-   ztl.dtype)a_tyb_tyr%   c                 C  sx   | j }|j }| j}|j}||kr0||kr,| S |S |tjjjkrN||krJ| S |S |tjjjkrl||krh|S | S dstJ d S NF)int_bitwidthint_signednessr(   dtypeZ
SIGNEDNESSZUNSIGNED)r.   r/   Za_rankZb_rankZa_snZb_snr   r   r   integer_promote_impl,   s    r4   bool)r.   r/   
div_or_modr%   c                 C  s   |   s|  rtjS |  s&| r,tjS |  s<| rL|rFtjS tjS |  s\| r|rftjS |  r|| r|tjS tjS | 	 r|	 sdsJ |r| j
|j
krtd|   d |  d t| |S )NFzCannot use /, #, or % with r   x because they have different signedness;this is unlikely to result in a useful answer. Cast them to the same signedness.)Zis_fp64r(   float64is_fp32float32is_fp16float16is_bf16bfloat16is_intr2   r'   r   r4   )r.   r/   r6   r   r   r   computation_type_impl<   s*    r@   None)r   r   allow_ptr_ar%   c                 C  sF   |   rB|st| ||  r0| |kr0t| || rBt| |d S N)is_ptrr   is_floating)r   r   rB   r   r   r   check_ptr_type_impld   s    

rF   FTzTuple[tl.tensor, tl.tensor])lhsrhsr$   r%   c           
      C  sx   t | ||\} }| jj}|jj}t||| t||| |rp| sp| spt|||}	t| |	|} t||	|}| |fS rC   )broadcast_impl_valuetypescalarrF   rD   r@   cast)
rG   rH   r$   Zallow_lhs_ptrZallow_rhs_ptrZarithmetic_checkr6   Z
lhs_sca_tyZ
rhs_sca_ty
ret_sca_tyr   r   r   binary_op_type_checking_implp   s    rN   )inputotherr$   r%   c                 C  s   t | ||dd\} }| jj}|jj}| r<| r<td| rf| sf||  } }| jj}|jj}| rt|| j|j| jS |	 rt|
| j|j| jS | rt|| j|j| jS dsJ d S )NTzcannot add pointers togetherF)rN   rJ   rK   rD   r'   r(   r)   create_addptrhandlerE   Zcreate_faddr?   Z
create_addrO   rP   r$   input_scalar_tyother_scalar_tyr   r   r   add   s     
rV   c                 C  s   t | ||dd\} }| jj}| rDt|| jt||j| jS |	 rft|
| j|j| jS | rt|| j|j| jS dsJ d S )NTF)rN   rJ   rK   rD   r(   r)   rQ   rR   minusrE   Zcreate_fsubr?   Z
create_subrO   rP   r$   	scalar_tyr   r   r   sub   s     rZ   c                 C  sh   t | ||\} }| jj}| r:t|| j|j| jS | r\t|	| j|j| jS dsdJ d S r0   )
rN   rJ   rK   rE   r(   r)   Zcreate_fmulrR   r?   Z
create_mulrX   r   r   r   mul   s    r[   c                 C  s   t | ||dddd\} }| jj}|jj}| rF| rFt|||}n| rd| rdt| ||} nn| r| rt| tj|} t|tj|}n@| r| r|j|jkrt|||}qt| ||} ndsJ t	|
| j|j| jS NFT)rN   rJ   rK   rE   r?   rL   r(   r:   Zfp_mantissa_widthr)   create_fdivrR   rS   r   r   r   truediv   s     r^   c                 C  s   t | ||dddd\} }| jj}|jj}| r| rt||}t| ||} t|||}| r|t|	| j
|j
| jS t|| j
|j
| jS dsJ d S r\   )rN   rJ   rK   r?   r4   rL   is_int_signedr(   r)   Zcreate_sdivrR   Zcreate_udiv)rO   rP   r$   rT   rU   ret_tyr   r   r   floordiv   s    
ra   )rO   rP   ieee_roundingr$   r%   c                 C  s^   | j j}|j j}| r | s(tdt| ||dddd\} }|| j|j}t|| j S )Nz4both operands of fdiv must have floating scalar typeFT)	rJ   rK   rE   r'   rN   r]   rR   r(   r)   )rO   rP   rb   r$   rT   rU   retr   r   r   fdiv   s    rd   c              	   C  s   t | ||dddd\} }| jj}|jj}| rXt| ttt| |d|||||}|S | r|j	|j	krt
d|  d |  d | rt|| j|j| jS t|| j|j| jS dsJ d S )NFTzCannot mod z by r7   )rN   rJ   rK   rE   rZ   r[   floorrd   r?   r2   r'   r   r_   r(   r)   Zcreate_sremrR   Zcreate_urem)rO   rP   r$   rY   rU   rc   r   r   r   mod   s    $ rf   c                 C  sz   t | ||ddd\} }| jj}|jj}| r6| s@t||t||}||kr^t| ||} ||krrt|||}| |fS r0   )rN   rJ   rK   r?   r   r4   rL   )rO   rP   r$   input_sca_tyZother_sca_tyrM   r   r   r   bitwise_op_type_checking_impl  s    

rh   c                 C  s*   t | ||\} }t|| j|j| jS rC   )rh   r(   r)   Z
create_andrR   rJ   rO   rP   r$   r   r   r   and_  s    rj   c                 C  s*   t | ||\} }t|| j|j| jS rC   )rh   r(   r)   Z	create_orrR   rJ   ri   r   r   r   or_  s    rk   c                 C  s*   t | ||\} }t|| j|j| jS rC   )rh   r(   r)   Z
create_xorrR   rJ   ri   r   r   r   xor_  s    rl   c                 C  sD   | j  st| td|} |j  s8t|td|}t| ||S Nint1)rJ   is_int1bitcastr(   r3   rj   ri   r   r   r   logical_and   s
    

rq   c                 C  sD   | j  st| td|} |j  s8t|td|}t| ||S rm   )rJ   ro   rp   r(   r3   rk   ri   r   r   r   
logical_or(  s
    

rr   rO   r$   c                 C  s&   | j  st| td|} t| |S rm   )rJ   ro   rp   r(   r3   invertrs   r   r   r   not_0  s    
ru   c                 C  s*   t | ||\} }t|| j|j| jS rC   )rh   r(   r)   Zcreate_lshrrR   rJ   ri   r   r   r   lshr6  s    rv   c                 C  s*   t | ||\} }t|| j|j| jS rC   )rh   r(   r)   Zcreate_ashrrR   rJ   ri   r   r   r   ashr;  s    rw   c                 C  s*   t | ||\} }t|| j|j| jS rC   )rh   r(   r)   Z
create_shlrR   rJ   ri   r   r   r   shl@  s    rx   )rO   r%   c                 C  s   | S rC   r   )rO   r   r   r   plusJ  s    ry   )rO   r$   r%   c                 C  sH   | j j}| r$td|  d t||||}t	|| |S )Nz$wrong type argument to unary minus ())
rJ   rK   rD   r'   r   r(   r)   get_null_valueto_irrZ   )rO   r$   rg   _0r   r   r   rW   N  s
    rW   c                 C  sP   | j j}| s| r,td|  d t||	||}t
| ||S )Nz%wrong type argument to unary invert (rz   )rJ   rK   rD   rE   r'   r   r(   r)   Zget_all_ones_valuer|   rl   )rO   r$   rg   Z_1r   r   r   rt   V  s
    rt   ztl.block_type)vr%   c                 C  s&   | j  stjS | j j}ttj|S rC   )rJ   is_blockr(   rn   shape
block_type)r~   r   r   r   r   
_bool_likea  s    
r   c                 C  s   t | ||\} }| jj}| r<t|| j|jt| S |	 r|
 rht|| j|jt| S t|| j|jt| S dsJ d S r0   )rN   rJ   rK   rE   r(   r)   Zcreate_fcmpOGTrR   r   r?   r_   Zcreate_icmpSGTZcreate_icmpUGTrX   r   r   r   greater_thanh  s    r   c                 C  s   t | ||\} }| jj}| r<t|| j|jt| S |	 r|
 rht|| j|jt| S t|| j|jt| S dsJ d S r0   )rN   rJ   rK   rE   r(   r)   Zcreate_fcmpOGErR   r   r?   r_   Zcreate_icmpSGEZcreate_icmpUGErX   r   r   r   greater_equalw  s    r   c                 C  s   t | ||\} }| jj}| r<t|| j|jt| S |	 r|
 rht|| j|jt| S t|| j|jt| S dsJ d S r0   )rN   rJ   rK   rE   r(   r)   Zcreate_fcmpOLTrR   r   r?   r_   Zcreate_icmpSLTZcreate_icmpULTrX   r   r   r   	less_than  s    r   c                 C  s   t | ||\} }| jj}| r<t|| j|jt| S |	 r|
 rht|| j|jt| S t|| j|jt| S dsJ d S r0   )rN   rJ   rK   rE   r(   r)   Zcreate_fcmpOLErR   r   r?   r_   Zcreate_icmpSLEZcreate_icmpULErX   r   r   r   
less_equal  s    r   c                 C  sl   t | ||\} }| jj}| r<t|| j|jt| S |	 r`t|
| j|jt| S dshJ d S r0   )rN   rJ   rK   rE   r(   r)   Zcreate_fcmpOEQrR   r   r?   Zcreate_icmpEQrX   r   r   r   equal  s    r   c                 C  sl   t | ||\} }| jj}| r<t|| j|jt| S |	 r`t|
| j|jt| S dshJ d S r0   )rN   rJ   rK   rE   r(   r)   Zcreate_fcmpUNErR   r   r?   Zcreate_icmpNErX   r   r   r   	not_equal  s    r   )startendr$   r%   c                 C  s   t | trt |tstdt| d? }t|d? }|s<|rDtd|| krTtd||  g}ttj|}t|| ||S )Nz/arange's arguments must be of type tl.constexpr    zarange must fit in int32z=arange's end argument must be greater than the start argument)	
isinstancer"   r'   r5   r(   r   r*   r)   Zcreate_make_range)r   r   r$   Zis_start_int64Zis_end_int64r   r`   r   r   r   arange  s    
r   z	List[int])r   r3   r$   r%   c                 C  s   t |tjr.|jjdks J dt|||}nP|d u r>td|dkrX|||}nt	|d|j
 }||}t||}t|| |S )Nr   zonly accepts size-1 tensorz2dtype must be specified when value is not a tensorr   get_)r   r(   r)   numelvaluerL   r'   r{   r|   getattrnamesplat)r   r   r3   r$   Zget_value_fnr   r   r   full  s    r   )r   r   r$   r%   c                 C  sF   | j  rJ dt|dkr"| S t| j|}t|| j||S )NzCannot splat a block tensorr   )	rJ   r   lenr(   r   r3   r)   create_splatrR   )r   r   r$   r`   r   r   r   r     s
    r   )rO   	dst_shaper$   r%   c                 C  sR   d}|D ]}||9 }q| j j|kr*tdt| j j|}t|| j|d|S )Nr   z$cannot view block of different shapeT)	rJ   r   r'   r(   r   rK   r)   create_reshaperR   )rO   r   r$   r   sr`   r   r   r   view  s    
r   c                 C  s(   t | jj|}t || j|d|S r0   )r(   r   rJ   rK   r)   r   rR   )rO   r   r$   r`   r   r   r   reshape  s    r   )rO   r#   r$   r%   c                 C  sZ   dd | j D }||d | j s4t| ||dS t| jj|}t|	| j
||S )Nc                 S  s   g | ]}t |qS r   )r(   Z_constexpr_to_value).0xr   r   r   
<listcomp>      zexpand_dims.<locals>.<listcomp>r   )r   r$   )r   insertrJ   r   r   r(   r   rK   r)   create_expand_dimsrR   )rO   r#   r$   r   r`   r   r   r   expand_dims  s    
r   )rG   rH   can_reorderr$   r%   c                 C  sX   |sJ dt | jdksJ t| jj| jd |jd  g}t|| j|j|S )Nz;current implementation of `cat` always may reorder elementsr   r   )	r   r   r(   r   rJ   rK   r)   Z
create_catrR   )rG   rH   r   r$   ret_typer   r   r   cat	  s    "r   c                 C  sJ   t | jdkrtdt| jj| jd | jd g}t|| j	|S )Nr
   z!Only 2D tensors can be transposedr   r   )
r   r   r'   r(   r   rJ   rK   r)   Zcreate_transrR   )rO   r$   r   r   r   r   trans  s     r   )rO   r   r$   r%   c                 C  s   | j  s.t| j |}t|| j||S | j  }t|t|kr\t	d| d| ||krh| S t
|D ]F\}}|| |krp|dkrpt	d||  d| d| d| d| 
qpt| j j|}t|| j||S )Nz!Cannot broadcast, rank mismatch: z, r   z3Cannot broadcast, the expanded size of the tensor (z ) must match the existing size (z) at non-singleton dimension : )rJ   r   r(   r   r)   r   rR   get_block_shapesr   r'   	enumeraterK   create_broadcast)rO   r   r$   r`   Z	src_shapeiitemr   r   r   broadcast_impl_shape  s*    


r   c              	   C  sp  | j }|j }| rJ| sJt|j|j}t||j|	 |}n| s| rt|j|j}t|| j|	 |} n| rh| rh|	 }|	 }t
|t
|k rtt
|t
|D ]8}t|| jdt|jdg| } | j }|	 }qn`t
|t
|k rltt
|t
|D ]:}t||jdt|jdg| }|j }|	 }q0t
|t
|ksJ g }t|D ]|\}	}
||	 }|
dkr|| nT|dkr||
 n>|
|kr||
 n(tdt|	 d t|
 d t| q||kr:t|j|}t|| j||} ||krht|j|}t||j||}| |fS )Nr   r   z?Cannot make_shape_compatible: incompatible dimensions at index r   r   )rJ   r   r(   r   rK   r   r)   r   rR   r   r   ranger   r   appendr'   strr   )rG   rH   r$   Zlhs_tyZrhs_tyZ	lhs_shapeZ	rhs_shapedim	ret_shaper   leftrightr`   r   r   r   rI   )  sj    





rI   )rO   dst_tyr$   r%   c                 C  s   | j }| r"t|j| j  }||kr.| S |j}|j}| sJ| rVt| ||S |j}|j}||krt	dt
| d t
| t|| j|||S )Nz!Cannot bitcast data-type of size z to data-type of size )rJ   r   r(   r   rK   r   rD   rL   primitive_bitwidthr'   r   r)   create_bitcastrR   r|   )rO   r   r$   src_ty
src_sca_ty
dst_sca_tyZsrc_bitsZdst_bitsr   r   r   rp   e  s     rp   c                 C  s  | j }t|tjr|j}| r4t|j| j  }||kr@| S |j}|j}|	 s\|	 rl|j
jslJ d| r|| s| r| rt|| j|||S | r| r| r| stt| tj|||S | o| o|j|jk}|rt|| j|||S | o:| o:|j|jk }|r^t|| j|||S | r| r|j|jks|j|jkr| o|  }| r| j|}	t| |	| j}
t!| |
|S t|"| j||||S |# r| r| r@| j|}	t| |	| j}
t!| |
|S | rft|$| j|||S t|%| j|||S | r|# r| s| st|&| j|||S t|'| j|||S |( rR| rR|j}|dkr"t|)| j|||S |dkrRt!t| tj*|t|+dtj*|S | r|( rt|,| j|||S |( r|( rt|-| j|||S dsJ d|  d| d S )Nz4fp8e4nv data type is not supported on CUDA arch < 89@   r   r   Fzcannot cast z to ).rJ   r   r(   	constexprr   r   r   rK   r   
is_fp8e4nvoptionsallow_fp8e4nvis_fp8rE   r)   Zcreate_fp_to_fprR   r|   r;   r9   r=   rL   r:   r   Zcreate_fp_truncZcreate_fp_extr?   r1   r2   r_   is_boolr3   r{   r   create_int_castZis_standard_floatingZcreate_fp_to_siZcreate_fp_to_uiZcreate_ui_to_fpZcreate_si_to_fprD   Zcreate_ptr_to_intint64	get_int64Zcreate_int_to_ptrr   )rO   r   r$   r   r   r   Ztruncate_fpZext_fpZsign_extendtyr}   Zbitwidthr   r   r   rL   x  s    









&rL   c                 C  sD   t jj}| r@| dkrt jj}n"| dkr0t jj}ntd|  d|S )Nz.ca.cgCache modifier  not supported)r   CACHE_MODIFIERNONECACGr'   cache_modifiercacher   r   r   _str_to_load_cache_modifier  s    

r   c                 C  sh   t jj}| rd| dkrt jj}nF| dkr0t jj}n4| dkrBt jj}n"| dkrTt jj}ntd|  d|S )Nz.wbr   z.csz.wtr   r   )r   r   r   ZWBr   CSZWTr'   r   r   r   r   _str_to_store_cache_modifier  s    



r   c                 C  sD   t jj}| r@| dkrt jj}n"| dkr0t jj}ntd|  d|S )NZ
evict_lastZevict_firstzEviction policy r   )r   ZEVICTION_POLICYNORMALZ
EVICT_LASTZEVICT_FIRSTr'   )eviction_policyevictionr   r   r   _str_to_eviction_policy  s    

r   c                 C  s@   d }| r<| dkrt jj}n"| dkr,t jj}ntd|  d|S )NzeronanzPadding option r   )r   PADDING_OPTIONZPAD_ZEROPAD_NANr'   )padding_optionpaddingr   r   r   _str_to_padding_option  s    

r   c                 C  sh   t jj}| rd| dkrt jj}nF| dkr0t jj}n4| dkrBt jj}n"| dkrTt jj}ntd|  d|S )NacquirereleaseZacq_relZrelaxedMemory semantic r   )r   ZMEM_SEMANTICZACQUIRE_RELEASEZACQUIREZRELEASEZRELAXEDr'   )Z
sem_optionsemr   r   r   _str_to_sem  s    



r   c                 C  sV   t jj}| rR| dkrt jj}n4| dkr0t jj}n"| dkrBt jj}ntd|  d|S )NZgpuZctasysr   r   )r   ZMEM_SYNC_SCOPEZGPUZCTAZSYSTEMr'   )Zscope_optionscoper   r   r   _str_to_scope  s    


r   c                 C  s   | rt | ds| g} dd | D } | D ],}t|trNd|  krLt|k s&n J q&t| dksdJ t| tt| ksJ dt| S t S )N__iter__c                 S  s"   g | ]}t |tjr|jn|qS r   r   r(   r   r   r   elemr   r   r   r   /  r   z0_canonicalize_boundary_check.<locals>.<listcomp>r   z'Duplicate dimension in `boundary_check`)hasattrr   r"   r   setsortedtuple)boundary_checkblock_shaper   r   r   r   _canonicalize_boundary_check+  s    
*r   c	              
   C  s|   |s|rt d| jjj}	|	tjks,J d|	 rH|tjjkrHt d| jj}
t	||

 }t|| j||||||
S )NK`mask` and `other` arguments cannot be specified for loading block pointers3`tl.int1` should be rewrited in `tl.make_block_ptr`z@Padding option `nan` is not supported for integer block pointers)r'   rJ   
element_tyr(   rn   r?   r   r   r   r   r   r)   Zcreate_tensor_pointer_loadrR   )ptrmaskrP   r   r   r   r   is_volatiler$   elt_tyr   r   r   r   _load_block_pointer8  s    
r   c	              
   C  sn  | j j s"td| j   d|s2|r2td|s:|rBtd| j  sx|rb|j  rbtd|rx|j  rxtd| j  r|rt|| j  |}|rt|| j  |}| j j}	|	j}
|
t	j
krt	j}
t	|
|	j}	t| |	|} |rt||
|}| j  r| j  }t	|
|}n|
}|s>t	|| j||||S t	|| j|j|rZ|jnd ||||S d S )NUnsupported ptr type z in `tl.load`z)`other` cannot be provided without `mask`z`padding_option` or `boundary_check` argument is not supported for loading a tensor ofpointers or loading a scalar. Because the compiler does not know the boundary; please use block pointers (defined by `make_block_ptr`) insteadEMask argument cannot be block type if pointer argument is not a blockzFOther argument cannot be block type if pointer argument is not a block)rJ   rK   rD   r'   r   r   r   r   r   r(   rn   int8pointer_typeaddress_spacerL   r   r)   Zcreate_loadrR   Zcreate_masked_load)r   r   rP   r   r   r   r   r   r$   ptr_tyr   r   r   r   r   r   _load_legacyN  sH    



r   zOptional[tl.tensor]r   )	r   r   rP   r   r   r   r   r$   r%   c	              
   C  sb   t |}	t|}
t|}| j rF| jj rFt| |||||	|
||	S t| |||||	|
||	S d S rC   )	r   r   r   rJ   rD   r   r   r   r   )r   r   rP   r   r   r   r   r   r$   r   r   r   r   r   r   load  s    r   c           	   	   C  s   |rt d| jj }|j s.t|||}|j s@J d||j ksjJ d| d|j  d| jjj|jjksJ d| jjj d|jj d| jjj}|tjksJ dt||}t	|
| j|j|||tjS )	Nr   z-Value argument must be block type or a scalarzBlock shape(z) and value shape(z
) mismatchzBlock element type(z) and value element type(r   )r'   rJ   r   r   r   r   r(   rn   r   r)   Zcreate_tensor_pointer_storerR   void)	r   valr   r   r   r   r$   r   r   r   r   r   _store_block_pointer  s     
2

r  c           	   	   C  s0  | j j s"td| j   d|r.td| j  s`|j  rJtd|r`|j  r`td| j  rt|| j  |}|rt|| j  |}| j j}|j}|t	j
krt	j}t	||j}t| ||} t|||}|st	|| j|j||t	jS |j j stdt	|| j|j|j||t	jS )Nr   z in `tl.store`z`boundary_check` argument is not supported for storing a tensor of pointers or storing a scalar. Because the compiler does not know the boundary; please use block pointers (defined by `make_block_ptr`) insteadzFValue argument cannot be block type if pointer argument is not a blockr   z"Mask must have boolean scalar type)rJ   rK   rD   r'   r   r   r   r   r   r(   rn   r   r   r   rL   r)   Zcreate_storerR   r  r   Zcreate_masked_store)	r   r  r   r   r   r   r$   r   r   r   r   r   _store_legacy  s2    



r  )r   r  r   r   r   r$   r%   c           	      C  sR   t |}t|}| j r:| jj r:t| ||||||S t| ||||||S d S rC   )r   r   rJ   rD   r   r   r  r  )	r   r  r   r   r   r   r$   r   r   r   r   r   store  s
    r  )r   cmpr  r   r   r$   r%   c              	   C  sN   t |}t|}| jjj}|jdvr,tdt|	| j
|j
|j
|||jS )N)   r   r   z9atomic_cas only supports elements with width {16, 32, 64})r   r   rJ   rK   r   r   r'   r(   r)   Zcreate_atomic_casrR   )r   r  r  r   r   r$   r   r   r   r   
atomic_cas  s    

r  z&Tuple[tl.tensor, tl.tensor, tl.tensor])r   r  r   opr$   r%   c                 C  s   | j j std| j   | j jj}|tju rJ|dkrJtd| d |tjtj	tj
tjfv rztd| d t| | j  r|rt|| j  |}|rt|| j  |}t|| j jj|}|s|d}tj}| j  r
||| j  }ttj| j  }t||}| ||fS )Nz)Pointer argument of store instruction is rV   Zatomic_z does not support fp16z does not support T)rJ   rK   rD   r'   r   r   r(   r<   rn   r   int16r>   r   r   r   r   rL   Zget_int1r   r   r)   )r   r  r   r	  r$   r   Zmask_irZmask_tyr   r   r   atom_red_typechecking_impl  s*    


r  )r   r  r   r   r   r$   r%   c                 C  s  t | ||d|\} }}t|}t|}|jj}| r| rft|	t
jj| j|j|j|||jS t|	t
jj| j|j|j|||jS |tjtjhvrtd| |tjkrtjntj}tg d||}t|||}	t| t|d|}
t|||}t|||}t|	t
jj|
j|	jt|||j|||	j}t|	t
jj|
j|	jt|||j|||	j}t||||}t|||S )Nmaxz#atomic_max not supported for dtype         r   )r  r   r   rJ   rK   r?   r_   r(   r)   create_atomic_rmwr   	ATOMIC_OPMAXrR   UMAXr:   r8   	TypeErrorr*   r   rp   r   r   r   rj   UMINwherer   r  r   r   r   r$   sca_tyityper   Zi_valZi_ptrposnegZpos_retZneg_retrc   r   r   r   
atomic_max  sD      r  c                 C  s  t | ||d|\} }}t|}t|}|jj}| r| rft|	t
jj| j|j|j|||jS t|	t
jj| j|j|j|||jS |tjtjhvrtd| |tjkrtjntj}tg d||}t|||}	t| t|d|}
t|||}t|||}t|	t
jj|
j|	jt|||j|||	j}t|	t
jj|
j|	jt|||j|||	j}t||||}t|||S )Nminz#atomic_min not supported for dtype r  r   )r  r   r   rJ   rK   r?   r_   r(   r)   r  r   r  ZMINrR   r  r:   r8   r  r*   r   rp   r   r   r   rj   r  r  r  r   r   r   
atomic_min6  sD      r  c              
   C  sj   t | ||d|\} }}t|}t|}|jj}| r>tjjntjj	}t
||| j|j|j|||jS )NrV   )r  r   r   rJ   rK   rE   r   r  ZFADDZADDr(   r)   r  rR   )r   r  r   r   r   r$   r  r	  r   r   r   
atomic_addZ  s    r  c              
   C  sN   t | ||d|\} }}t|}t|}t|tjj| j	|j	|j	|||j
S )Nand)r  r   r   r(   r)   r  r   r  ANDrR   rJ   r   r  r   r   r   r$   r   r   r   
atomic_andc  s     r!  c              
   C  sN   t | ||d|\} }}t|}t|}t|tjj| j	|j	|j	|||j
S )Nor)r  r   r   r(   r)   r  r   r  ORrR   rJ   r   r   r   r   	atomic_ork  s     r$  c              
   C  sN   t | ||d|\} }}t|}t|}t|tjj| j	|j	|j	|||j
S )Nxor)r  r   r   r(   r)   r  r   r  ZXORrR   rJ   r   r   r   r   
atomic_xors  s     r&  c              
   C  sN   t | ||d|\} }}t|}t|}t|tjj| j	|j	|j	|||j
S )NZxchg)r  r   r   r(   r)   r  r   r  ZXCHGrR   rJ   r   r   r   r   atomic_xchg{  s     r'  )r%   c                   C  s   t  s
dS dS r\   r   r   r   r   r   gpu_has_mfma  s    r(  c                 C  s   t  s
dS dS r\   )r(  )MNK
allow_tf32ret_scalar_tyr   r   r   mfma_supported  s    r.  )rG   rH   accr,  max_num_imprecise_acc	out_dtyper$   r%   c              
   C  s  dd }| j  r|j  s J || j|j|j t| jdksRJ d| j dt|jdksrJ d|j d| jd j|jd jksJ d| j d	|j d
| jd j d|jd j d	| jd jdkr| jd jdkr|jd jdksJ d| j d|j d| j j r^| j jt	j
ks2J d| jd jdksLJ d|d}t	j}	nd| rrtdnP| j j s| j j r|d}t	j}	n"| r|dn|d}|}	| j jd }
|j jd }t rt|
|| j jd ||	s| j j rt	jn|	}t| ||} t|||}|t	jkrP||d|
|g}n||d|
|g}t	||
|g}t	|| j|j|||}t||	|S t rLt|
|| j jd ||	rL|	jdk rL| j j rt	j}||d|
|g}nt	j}||d|
|g}t	||
|g}t	|| j|j|||}t||	|S t	|	|
|g}|d u rx|||
|g}n|j}|j |ksJ d}| j r|j r|jj}|d u rd}t	|| j|j||||S )Nc                 S  s  |j sT|  s| rJ d|  r2| r2d S | |ksPJ d|  d| dnh|  sd| rlJ d|  s|| rJ d|  s| r| |ksJ d|  d| d	|  s|  sJ d
|  d	n|  s| r0|  s
|  s
J d|  d	| s| sJ d| d	n| 	 sf| 
 sf|  sf|  sfJ d|  |	 s|
 s| s| sJ d| | |ksJ d|  d| dd S )Nz1Dot op does not support fp8e4nv on CUDA arch < 90zFirst input (z) and second input (z) must have the same dtype!z3Dot op does not support fp8e4b15 on CUDA arch >= 90z5Dot op does not support fp8e4b15x4 on CUDA arch >= 90z0Both operands must be same type. First operand (z) and second operand (rz   z:Both operands must be either int8 or uint8. Operand type (z/Only supports fp8e4nv or fp8e5. First operand (z0Only supports fp8e4nv or fp8e5. Second operand (zUnsupported dtype )r   r   r   Zis_fp8e4b15Zis_fp8e4b15x4r?   Zis_int8Zis_uint8Zis_fp8e5r;   r=   r9   ro   )Z	lhs_dtypeZ	rhs_dtyper   r   r   r   assert_dtypes_valid  s@    "


**z dot.<locals>.assert_dtypes_validr
   zFirst input shape (z) is not two dimensional!zSecond input shape (r   r   z) and second input shape z= are not compatible for matmul (second index of first shape (z0) must be equal to first index of second shape (rz   r  z&All values in both first input shape (z) and second input shape (z) must be >= 16!zonly int8 supported!r   zsmall blocks not supported!zhout_dtype=bfloat16 is unsupported. Please use out_dtype=float32/float16 and cast with `.to(tl.bfloat16)`i   @) rJ   r   r3   r   r   r   r   rK   r?   r(   r   	get_int32r*   r=   r'   r9   Zget_fp32r:   r;   Zget_fp16r   r.  rL   r<   r   r   r)   Z
create_dotrR   r   r   Zmax_num_imprecise_acc_default)rG   rH   r/  r,  r0  r1  r$   r2  r}   r-  r)  r*  Zret_cast_scalar_tyr`   rc   Zret_dot_scalar_tyZ
acc_handler   r   r   dot  s      
0 


"

r4  )	conditionr   yr$   r%   c                 C  s   t | tj|} | j rHt| ||\} }t|||\}}t| ||\} }t|||dd\}}| j svt| ||\} }|j}t|| j	|j	|j	|S )NT)
rL   r(   rn   rJ   r   rI   rN   r)   Zcreate_selectrR   )r5  r   r6  r$   _r`   r   r   r   r     s    

r  zSequence[tl.tensor]zTuple[tl.tensor, ...])inputsr#   r$   r%   c           	        s    d u rNg }t tD ](}| jjg}|t| || qt|d d jj} fddt	|D D ]}|jj|kstJ qtfdd|
dd D  |   tfddt tD S )	Nr   c                   s   g | ]\}}| kr|qS r   r   )r   r   r   )r#   r   r   r     r   zreduction.<locals>.<listcomp>c                   s"    rt | }n|}t | |S rC   r(   r   r)   r   rY   Zres_ty)r   r   r   wrap_tensor!  s    zreduction.<locals>.wrap_tensorc                 S  s   g | ]
}|j qS r   rR   r   tr   r   r   r   )  r   c                 3  s&   | ]} | | jjV  qd S rC   Z
get_resultrJ   rK   r   r   )r8  	reduce_opr;  r   r   	<genexpr>-  r   zreduction.<locals>.<genexpr>)r   r   r   r   r   r   r   rJ   r   r   Zcreate_reduceverify)	r8  r#   region_builder_fnr$   Z
new_inputsr   Z	new_shaper   r>  r   )r#   r8  rA  r   r;  r   	reduction  s     rE  c                   st   t  dkrtd d jjfdd|dd  D ||   t fdd	tt  D S )
Nr   z7Current implementation only support single tensor inputr   c                   s   t | }t | |S rC   r9  r:  )r   r   r   r;  ;  s    z%associative_scan.<locals>.wrap_tensorc                 S  s   g | ]
}|j qS r   r<  r=  r   r   r   r   ?  r   z$associative_scan.<locals>.<listcomp>c                 3  s&   | ]} | | jjV  qd S rC   r?  r@  )r8  scan_opr;  r   r   rB  C  r   z#associative_scan.<locals>.<genexpr>)r   r'   rJ   r   Zcreate_scanrC  r   r   )r8  r#   rD  r$   r   )r8  rF  r   r;  r   associative_scan5  s    rG  z	List[str])dtypesr%   c                   s    fdd}|S )al  
    We're following libdevice's convention to check accepted data types for math functions.
    It is not a good practice to support all data types as accelerators/GPUs don't support
    many float16 and bfloat16 math operations.
    We should let the users know that they are using and invoke explicit cast to convert
    the data type to the supported one.
    c                   s   t   fdd}|S )Nc                    s^   t | t |  }dd |D D ],}|jjj vr"td  d|jjj q"| i |S )Nc                 S  s   g | ]}t |tjr|qS r   )r   r(   r)   )r   ar   r   r   r   Z  r   z@_check_dtype.<locals>.wrapper.<locals>.check.<locals>.<listcomp>zExpected dtype z	 but got )listvaluesrJ   rK   r   r'   )argskwargsall_argsarg)rH  fnr   r   checkV  s
    z,_check_dtype.<locals>.wrapper.<locals>.checkr   )rP  rQ  rH  )rP  r   wrapperT  s    z_check_dtype.<locals>.wrapperr   )rH  rS  r   rR  r   _check_dtypeK  s    	rT  )r   r6  r$   r%   c                 C  s,   t | ||\} }ddlm} |j| ||dS Nr   )math)Z_builder)rN    rV  Zmulhi)r   r6  r$   rV  r   r   r   umulhid  s    rX  Zfp32Zfp64rR  )r   r$   r%   c                 C  s   ddl m} |j| |dS rU  )rW  rV  re   )r   r$   rV  r   r   r   re   k  s    re   c                 C  s   t || j| jS rC   )r(   r)   Z
create_exprR   rJ   r   r$   r   r   r   expr  s    rZ  c                 C  s   t || j| jS rC   )r(   r)   Z
create_logrR   rJ   rY  r   r   r   logw  s    r[  c                 C  s   t || j| jS rC   )r(   r)   Z
create_cosrR   rJ   rY  r   r   r   cos|  s    r\  c                 C  s   t || j| jS rC   )r(   r)   Z
create_sinrR   rJ   rY  r   r   r   sin  s    r]  c                 C  s   t || j| jS rC   )r(   r)   Zcreate_sqrtrR   rJ   rY  r   r   r   sqrt  s    r^  c                 C  sd   | j }| r$t|| j| jS | rBt|| j| jS |	 rN| S ds`J d| d S )NFzUnexpected dtype )
r3   rE   r(   r)   Zcreate_fabsrR   rJ   r_   Zcreate_iabsZis_int_unsigned)r   r$   r3   r   r   r   abs  s    r_  )r   rK  r%   c                 C  s@   t dt| jt|kr td| jdt|| j  | S )Nr   zAShape of input to multiple_of does not match the length of valuesztt.divisibility)	r  r   r   r'   rR   set_attrr   	make_attrget_contextr   rK  r   r   r   multiple_of  s    rd  c                 C  s:   t | jt |krtd| jdt|| j  | S )NzDShape of input to max_contiguous does not match the length of valuesztt.contiguityr   r   r'   rR   r`  r   ra  rb  rc  r   r   r   max_contiguous  s    rf  c                 C  s:   t | jt |krtd| jdt|| j  | S )NzCShape of input to max_constancy does not match the length of valuesztt.constancyre  rc  r   r   r   max_constancy  s    rg  )r$   r%   c                 C  s   t |  t jS rC   )r(   r)   Zcreate_barrierr  )r$   r   r   r   debug_barrier  s    rh  zList[tl.tensor])prefixrL  r$   r%   c                 C  s   |  ds|r| d7 } |  ds4|r4| d d d } t| dkrR| dsRd|  } g }|D ]}||j qZt|| |tjS )N r   r
   )	endswithr   
startswithr   rR   r(   r)   Zcreate_printr  )ri  rL  r$   new_argsrO  r   r   r   device_print  s    ro  )condmsg	file_namelinenor$   r%   c              	   C  sP   | j }| s2t|jd}t|| jd|} t|| j||||tj	S )N)r   )
rJ   r   r(   r   rK   r)   r   rR   Zcreate_assertr  )rp  rq  rr  	func_namers  r$   Zcond_tyr   r   r   device_assert  s
    ru  c                 C  s   t |trt|}t |tjr<|r0| |jS | |jS t |tjr|jjdks\J d|j	
 snJ d|j	tjkr|r| |j|  |j	 S |j	tjkr| |j|  |j	 S |jS dsJ dt| d S )Nr   z*Expected a scalar in shape/strides/offsetsz8Expected an integer scalar type in shape/strides/offsetsFz3Unsupported element type in shape/strides/offsets: )r   r"   r(   r   r   r   r3  r)   r   r3   r?   r   r   rR   Zget_int64_tyr_   r*   Zget_int32_tyrJ   )r$   r   require_i64r   r   r   _convert_elem_to_ir_value  s    

rw  c                   s,   t |dr fdd|D S t |gS )Nr   c                   s   g | ]}t  |qS r   )rw  r   r$   rv  r   r   r     r   z)_convert_to_ir_values.<locals>.<listcomp>)r   rw  )r$   	list_likerv  r   rx  r   _convert_to_ir_values  s    
rz  )baser$   r%   c              	     s:  t ||}t ||}t ||dd}| j r8| jj r@td| jjtjkrht| t	tj
| jj|} t dsx g dd  D  tdd  D sJ dt|ds|g}d	d |D }t|ttt|ksJ d
t fdd||||fD sJ d|| j||| |}t|t	t| jj S )NFrv  zMExpected `base` to be a pointer type (but not a block pointer type or others)r   c                 S  s"   g | ]}t |tjr|jn|qS r   r   r   r   r   r   r     r   z"make_block_ptr.<locals>.<listcomp>c                 S  s.   g | ]&}t |to(d |  ko$dk n  qS )i   l        )r   r"   r   r   r   r   r     r   zGExpected a list of constant integers (`int32_t` range) in `block_shape`c                 S  s"   g | ]}t |tjr|jn|qS r   r   r   r   r   r   r     r   z<Expected a permutation of (0, 1, ..., len(order)-1) in orderc                   s   g | ]}t  t |kqS r   )r   )r   ry  r   r   r   r     r   zBExpected shape/strides/offsets/block_shape to have the same length)rz  rJ   rD   r   r   r'   r(   rn   rL   r   r   r   r   allr   rJ  r   r   Zcreate_make_block_ptrrR   r)   r   )r{  r   stridesoffsetsr   orderr$   rR   r   r}  r   make_block_ptr  s,    



 "r  c                 C  s&   t ||dd}t|| j|| jS )NFr|  )rz  r(   r)   Zcreate_advancerR   rJ   )r{  r  r$   r   r   r   advance  s    r  N)FFTF)T)p
__future__r   	functoolsr   typingr   r   r   r   r	   Z_C.libtriton.tritonr   Zcommon.buildr   rW  r   r(   r   	Exceptionr   r,   r-   r4   r@   rF   rN   rV   rZ   r[   r^   ra   rd   rf   rh   rj   rk   rl   rq   rr   ru   rv   rw   rx   ry   rW   rt   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rI   rp   rL   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  r  r!  r$  r&  r'  r(  r.  r4  r  rE  rG  rT  rX  re   rZ  r[  r\  r]  r^  r_  rd  rf  rg  rh  ro  ru  rw  rz  r  r  r   r   r   r   <module>   s   (   


<a:,	$$	j"
'