a
    Df                  
   @   sx  d dl mZ d dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d
dlmZ d dlmZ d dlmZmZmZ d dlm Z m!Z! e
 Z"e"j#Z#e"j$Z%e"j&Z&dd Z'e%e(eddd Z)e%e(eddd Z*e%e(eddd Z+e%e(eddd Z,e%e(eddd Z-e%e d d!d" Z.e%e d#d$d% Z/e%e d&d'd( Z0e#ej1j2ej3d)d* Z4d a5d+d, Z6e#ej7j8ej9ej:d-d. Z;e#ej7j8ej<ej:e#ej7j8ej=ej:d/d0 Z>e#ej?j8ej9ej:d1d2 Z@e#ej?j8ej<ej:e#ej?j8ej=ej:d3d4 ZAe#ejBd5d6 ZCe#ejDd7d8 ZEe#ejFd9d: ZGe#ejHd;d< ZIe#ejHejJd=d> ZKe#ejLejJejJejJejJejJe#ejLejJejJejMejJejJe#ejLejJejJejNejJejJe#ejLejJejJejOejJejJd?d@ ZPe#ejQejJejJejRdAdB ZSe#ejTejJejJe#ejTejJejMe#ejTejJejNe#ejTejJejOdCdD ZUe#ejVejJejJe#ejVejJejMe#ejVejJejNe#ejVejJejOdEdF ZWe#ejXdGdH ZYe#ejZdIdJ Z[e#ej\ej:dKdL Z]e#ej^ej:ej:ej:dMdN Z_dOdP Z`eejaejbdQdR ZceejbejadSdT ZddUdV ZeeejaejfdWdX Zgeejfejaeej9ejadYdZ Zhd[d\ Zieiejjjkd] eiejld] eiejmd] eiejjjnd^ eiejod^ eiejpd^ eiejjjqd_ eiejrd_ eiejsd_ e#ejjjtejad`da Zue#ejvejadbdc Zwe#ejjjxejaddde Zye#ezejadfdg Z{e#ejjj|ejaejaejadhdi Z}e#ej~ejaejae#ejejaejadjdk ZdlZdmdn Ze#ejjjejaejaedo e#ejejaejaedo e#ejjjejaejaedp e#ejejaejaedp e#ejjjejaejaedq e#ejejaejaedq e#ejjjejaejaedr e#ejejaejaedr e#ejjjejaejaeds e#ejejaejaeds e#ejjjejaejaedt e#ejejaejaedt dudv Zeejjjdwdr eejjjdxdt ejdyejdziZe#ejeje#ejejd{d| Ze#ejejd}d~ Ze#ejejdd Ze#ejej:dd Ze#ejejJe#ejejdd Ze#ejejMe#ejejdd Ze#ejej:ej:ej:dd Ze#eejNejNdd Ze#eejOejNe#eejNejOe#eejOejOdd Ze#eejNejNdd Ze#eejOejNe#eejNejOe#eejOejOdd Ze#eejNe#eejOdd Ze#eejNejfe#eejOejfdd Zdd Zejd Zdej Ze#ejejNee e#ejejOee e#ejejNee e#ejejOee dd Zdd Ze#ejjlej3ejej:e#ejjlej3ej=ej:e#ejjlej3ej<ej:edd Ze#ejjoej3ejej:e#ejjoej3ej=ej:e#ejjoej3ej<ej:edd Ze#ejjej3ejej:e#ejjej3ej=ej:e#ejjej3ej<ej:edd Ze#ejjej3ejej:e#ejjej3ej=ej:e#ejjej3ej<ej:edd Zdd Zeejjd eejjd eejjd e#ejjej3ejej:e#ejjej3ej=ej:e#ejjej3ej<ej:edd Ze#ejjej3ejej:e#ejjej3ej<ej:e#ejjej3ej=ej:edd Ze#ejjej3ejej:e#ejjej3ej<ej:e#ejjej3ej=ej:edd Ze#ejjej3ejej:e#ejjej3ej<ej:e#ejjej3ej=ej:edd Ze#ejjej3ejej:e#ejjej3ej<ej:e#ejjej3ej=ej:edd Ze#ejjej3ej:ej:dd Ze#ejjej3ejej:ej:e#ejjej3ej<ej:ej:e#ejjej3ej=ej:ej:dd Ze#ejej̓dd ZdddZe&e!dd Zeeѡ e# dS )    )reduceN)ir)Registry
lower_cast)parse_dtype)models)typescgutils)ufunc_db)register_ufuncs   )nvvm)cuda)	nvvmutilsstubserrors)dim3CUDADispatcherc                 C   sB   t | d| }t | d| }t | d| }t| |||fS )Nz%s.xz%s.yz%s.z)r   	call_sregr	   Zpack_struct)builderprefixxyz r   `/nfs/NAS7/SABIOD/METHODE/ermites/ermites_venv/lib/python3.9/site-packages/numba/cuda/cudaimpl.pyinitialize_dim3   s    r   Z	threadIdxc                 C   s
   t |dS )Ntidr   contextr   sigargsr   r   r   cuda_threadIdx    s    r#   ZblockDimc                 C   s
   t |dS )NZntidr   r   r   r   r   cuda_blockDim%   s    r$   ZblockIdxc                 C   s
   t |dS )NZctaidr   r   r   r   r   cuda_blockIdx*   s    r%   ZgridDimc                 C   s
   t |dS )NZnctaidr   r   r   r   r   cuda_gridDim/   s    r&   laneidc                 C   s   t |dS )Nr'   )r   r   r   r   r   r   cuda_laneid4   s    r(   r   c                 C   s   | |dS Nr   extract_valuer   r   r   r   dim3_x9   s    r,   r   c                 C   s   | |dS )Nr   r*   r   r   r   r   dim3_y>   s    r-   r   c                 C   s   | |dS )N   r*   r   r   r   r   dim3_zC   s    r/   c                 C   s   |d S r)   r   r   r   r   r   cuda_const_array_likeJ   s    r0   c                 C   s   t d7 a d| t S )zDue to bug with NVVM invalid internalizing of shared memory in the
    PTX output.  We can't mark shared memory to be internal. We have to
    ensure unique name is generated for shared memory symbol.
    r   z{0}_{1})_unique_smem_idformatnamer   r   r   _get_unique_smem_idT   s    r5   c              	   C   s8   |j d j}t|j d }t| ||f|tdtjddS )Nr   r   _cudapy_smemTshapedtypesymbol_name	addrspacecan_dynsized)r"   literal_valuer   _generic_arrayr5   r   ADDRSPACE_SHAREDr    r   r!   r"   lengthr9   r   r   r   cuda_shared_array_integer^   s    rB   c              	   C   s>   dd |j d D }t|j d }t| |||tdtjddS )Nc                 S   s   g | ]
}|j qS r   r=   .0sr   r   r   
<listcomp>k       z+cuda_shared_array_tuple.<locals>.<listcomp>r   r   r6   Tr7   )r"   r   r>   r5   r   r?   r    r   r!   r"   r8   r9   r   r   r   cuda_shared_array_tupleh   s    
rJ   c              	   C   s4   |j d j}t|j d }t| ||f|dtjddS )Nr   r   _cudapy_lmemFr7   )r"   r=   r   r>   r   ADDRSPACE_LOCALr@   r   r   r   cuda_local_array_integers   s    rM   c              	   C   s:   dd |j d D }t|j d }t| |||dtjddS )Nc                 S   s   g | ]
}|j qS r   rC   rD   r   r   r   rG      rH   z(ptx_lmem_alloc_array.<locals>.<listcomp>r   r   rK   Fr7   )r"   r   r>   r   rL   rI   r   r   r   ptx_lmem_alloc_array}   s    
rN   c                 C   sD   |rJ d}|j }tt d}t|||}||d |  S )Nzllvm.nvvm.membar.ctar   moduler   FunctionTypeVoidTyper	   get_or_insert_functioncallget_dummy_valuer    r   r!   r"   fnamelmodfntysyncr   r   r   ptx_threadfence_block   s    r[   c                 C   sD   |rJ d}|j }tt d}t|||}||d |  S )Nzllvm.nvvm.membar.sysr   rO   rV   r   r   r   ptx_threadfence_system   s    r\   c                 C   sD   |rJ d}|j }tt d}t|||}||d |  S )Nzllvm.nvvm.membar.glr   rO   rV   r   r   r   ptx_threadfence_device   s    r]   c                 C   s*   |  tjd}ttj}t| |||gS )Nl    )get_constantr   int32noneptx_syncwarp_mask)r    r   r!   r"   maskZmask_sigr   r   r   ptx_syncwarp   s    rc   c                 C   sD   d}|j }tt tdf}t|||}||| |  S )Nzllvm.nvvm.bar.warp.sync    )	rP   r   rQ   rR   IntTyper	   rS   rT   rU   rV   r   r   r   ra      s    ra   c              
   C   s  |\}}}}}|j d }	|	tjv r6||t|	j}d}
|j}tt	tdtdftdtdtdtdtdf}t
|||
}|	jdkr|||||||f}|	tjkr||d}||d}||t }t
|||f}n||td}||| tjd}||td}|||||||f}|||||||f}||d}||d}||d}||td}||td}||| tjd}|||}|	tjkr||t }t
|||f}|S )a  
    The NVVM intrinsic for shfl only supports i32, but the cuda intrinsic
    function supports both 32 and 64 bit ints and floats, so for feature parity,
    i64, f32, and f64 are implemented. Floats by way of bitcasting the float to
    an int, then shuffling, then bitcasting back. And 64-bit values by packing
    them into 2 32bit values, shuffling thoose, and then packing back together.
    r.   zllvm.nvvm.shfl.sync.i32rd   r   r   @   )r"   r   real_domainbitcastr   re   bitwidthrP   rQ   LiteralStructTyper	   rS   rT   float32r+   	FloatTypeZmake_anonymous_structtruncZlshrr^   i8zextZshlor_float64
DoubleType)r    r   r!   r"   rb   modevalueindexclampZ
value_typerW   rX   rY   funcretrvpredZfvZvalue1Z
value_lshrZvalue2Zret1Zret2Zrv1Zrv2Zrv1_64Zrv2_64Zrv_shlr   r   r   ptx_shfl_sync_i32   sH    



r{   c                 C   s^   d}|j }tttdtdftdtdtdf}t|||}|||S )Nzllvm.nvvm.vote.syncrd   r   )rP   r   rQ   rj   re   r	   rS   rT   )r    r   r!   r"   rW   rX   rY   rw   r   r   r   ptx_vote_sync   s    r|   c                 C   s   |\}}|j d j}|j d tjv r6||t|}d|}|j}t	tdtdt|f}	t
||	|}
||
||fS )Nr   zllvm.nvvm.match.any.sync.i{}rd   )r"   ri   r   rg   rh   r   re   r2   rP   rQ   r	   rS   rT   r    r   r!   r"   rb   rt   widthrW   rX   rY   rw   r   r   r   ptx_match_any_sync   s    
"r   c                 C   s   |\}}|j d j}|j d tjv r6||t|}d|}|j}t	t
tdtdftdt|f}	t||	|}
||
||fS )Nr   zllvm.nvvm.match.all.sync.i{}rd   )r"   ri   r   rg   rh   r   re   r2   rP   rQ   rj   r	   rS   rT   r}   r   r   r   ptx_match_all_sync  s    
r   c                 C   s,   t jt t dg dddd}||g S )Nrd   zactivemask.b32 $0;=rTZside_effectr   	InlineAsmrQ   re   rT   r    r   r!   r"   
activemaskr   r   r   ptx_activemask  s    r   c                 C   s,   t jt t dg dddd}||g S )Nrd   zmov.u32 $0, %lanemask_lt;r   Tr   r   r   r   r   r   ptx_lanemask_lt$  s
    r   c                 C   s   | |d S r)   )Zctpopr   r   r   r   ptx_popc,  s    r   c                 C   s
   |j | S N)fmar   r   r   r   ptx_fma1  s    r   c                 C   sB   ddd}z
||  W S  t y<   d|  d}t|Y n0 d S )N)Zf32f)Zf64d)rd   rf   z$Conversion between float16 and float unsupportedKeyErrorr   ZCudaLoweringErrorri   typemapmsgr   r   r   float16_float_ty_constraint6  s    

r   c           	      C   sd   |j |j kr|S t|j \}}t| |tdg}t|d| dd| d}|||gS )N   zcvt..f16 $0, $1;=,h)ri   r   r   rQ   get_value_typere   r   rT   	r    r   fromtytotyvalty
constraintrY   asmr   r   r   float16_to_float_cast@  s    r   c           	      C   sb   |j |j kr|S t|j \}}ttd| |g}t|d| dd| }|||gS )Nr   cvt.rn.f16. $0, $1;=h,)ri   r   r   rQ   re   r   r   rT   r   r   r   r   float_to_float16_castL  s    r   c                 C   sF   ddddd}z
||  W S  t y@   d|  d}t|Y n0 d S )Nchrl)   r   rd   rf   z"Conversion between float16 and intr   r   r   r   r   r   float16_int_constraintX  s    
r   c           
      C   sf   |j }t|}|jrdnd}t| |tdg}t|d| | dd| d}	||	|gS )NrF   ur   zcvt.rni.r   r   r   )	ri   r   signedr   rQ   r   re   r   rT   
r    r   r   r   r   ri   r   Z
signednessrY   r   r   r   r   float16_to_integer_castb  s    
r   c           
      C   sd   |j }t|}|jrdnd}ttd| |g}t|d| | dd| }	||	|gS )NrF   r   r   r   r   r   )	ri   r   r   r   rQ   re   r   r   rT   r   r   r   r   integer_to_float16_casto  s    
r   c                    s    t | tjtj fdd}d S )Nc                    sB   t t dt dt dg}t |  dd}|||S )Nr   z.f16 $0,$1,$2;=h,h,hr   rQ   re   r   rT   r    r   r!   r"   rY   r   opr   r   ptx_fp16_binary  s
    z*lower_fp16_binary.<locals>.ptx_fp16_binarylowerr   float16)fnr   r   r   r   r   lower_fp16_binary~  s    r   addsubmulc                 C   s4   t t dt dg}t |dd}|||S )Nr   zneg.f16 $0, $1;=h,hr   r   r   r   r   ptx_fp16_hneg  s    r   c                 C   s   t | |||S r   )r   r   r   r   r   operator_hneg  s    r   c                 C   s4   t t dt dg}t |dd}|||S )Nr   zabs.f16 $0, $1;r   r   r   r   r   r   ptx_fp16_habs  s    r   c                 C   s   t | |||S r   )r   r   r   r   r   operator_habs  s    r   c                 C   sH   t dt dt dg}t t d|}t |dd}|||S )Nr   zfma.rn.f16 $0,$1,$2,$3;z=h,h,h,h)r   re   rQ   r   rT   )r    r   r!   r"   ZargtysrY   r   r   r   r   ptx_hfma  s    r   c                 C   s   dd }|  ||||S )Nc                 S   s   t j| |S r   )r   fp16Zhdiv)r   r   r   r   r   fp16_div  s    zfp16_div_impl.<locals>.fp16_divZcompile_internal)r    r   r!   r"   r   r   r   r   fp16_div_impl  s    r   z{{
          .reg .pred __$$f16_cmp_tmp;
          setp.{op}.f16 __$$f16_cmp_tmp, $1, $2;
          selp.u16 $0, 1, 0, __$$f16_cmp_tmp;
        }}c                    s    fdd}|S )Nc           	         sr   t t dt dt dg}t |tj dd}|||}| tj	d}|
|t d}|d||S )Nr   r   r   r   z!=)r   rQ   re   r   	_fp16_cmpr2   rT   r^   r   int16rh   Zicmp_unsigned)	r    r   r!   r"   rY   r   resultZzeroZ
int_resultr   r   r   ptx_fp16_comparison  s    "z*_gen_fp16_cmp.<locals>.ptx_fp16_comparisonr   )r   r   r   r   r   _gen_fp16_cmp  s    r   eqnegegtleltc                    s    t | tjtj fdd}d S )Nc                    s(   t  | |||}|||d |d S )Nr   r   )r   select)r    r   r!   r"   choicer   r   r   ptx_fp16_minmax  s    z*lower_fp16_minmax.<locals>.ptx_fp16_minmaxr   )r   rW   r   r   r   r   r   lower_fp16_minmax  s    r   maxminZ
__nv_cbrtfZ	__nv_cbrtc           
      C   sF   |j }t| }| |}|j}t||g}t|||}	||	|S r   )	return_type
cbrt_funcsr   rP   r   rQ   r	   rS   rT   )
r    r   r!   r"   r   rW   ZftyrX   rY   r   r   r   r   ptx_cbrt  s    
r   c              	   C   s2   t |jttdtdfd}|||S )Nrd   Z	__nv_brevr	   rS   rP   r   rQ   re   rT   r    r   r!   r"   r   r   r   r   ptx_brev_u4  s    r   c              	   C   s2   t |jttdtdfd}|||S )Nrf   Z__nv_brevllr   r   r   r   r   ptx_brev_u8	  s    r   c                 C   s   | |d | tjdS r)   )Zctlzr^   r   booleanr   r   r   r   ptx_clz  s    r   c              	   C   s2   t |jttdtdfd}|||S )Nrd   Z__nv_ffsr   r   r   r   r   
ptx_ffs_32  s    r   c              	   C   s2   t |jttdtdfd}|||S )Nrd   rf   Z
__nv_ffsllr   r   r   r   r   
ptx_ffs_64&  s    r   c                 C   s   |\}}}| |||S r   )r   )r    r   r!   r"   testabr   r   r   ptx_selp0  s    
r   c              	   C   s4   t |jtt t t fd}|||S )NZ
__nv_fmaxfr	   rS   rP   r   rQ   rl   rT   r   r   r   r   
ptx_max_f46  s    r   c              
   C   sh   t |jtt t t fd}||| ||d |jd t	j
| ||d |jd t	j
gS )NZ	__nv_fmaxr   r   r	   rS   rP   r   rQ   rr   rT   castr"   r   doubler   r   r   r   
ptx_max_f8A  s    r   c              	   C   s4   t |jtt t t fd}|||S )NZ
__nv_fminfr   r   r   r   r   
ptx_min_f4R  s    r   c              
   C   sh   t |jtt t t fd}||| ||d |jd t	j
| ||d |jd t	j
gS )NZ	__nv_fminr   r   r   r   r   r   r   
ptx_min_f8]  s    r   c              	   C   sJ   t |jttdt fd}||| ||d |j	d t
jgS )Nrf   Z__nv_llrintr   )r	   rS   rP   r   rQ   re   rr   rT   r   r"   r   r   r   r   r   r   	ptx_roundn  s    r   c                 C   s   dd }|  ||||S )Nc                 S   s   t | st | r| S |dkrb|dkr:d|d  }d}nd| }d}| | | }t |rt| S nd|  }| | }t|}t || dkrdt|d  }|dkr|| | }n||9 }|S )Nr      g      $@gMDg      ?g      ?g       @)mathisinfisnanroundfabs)r   ndigitsZpow1Zpow2r   r   r   r   r   round_ndigits  s(    

z$round_to_impl.<locals>.round_ndigitsr   )r    r   r!   r"   r   r   r   r   round_to_impl  s    !r   c                    s    fdd}|S )Nc                    s$   |j \}| | }|||d S r)   )r"   r^   Zfmul)r    r   r!   r"   ZargtyZfactorconstr   r   impl  s    zgen_deg_rad.<locals>.implr   )r   r   r   r   r   gen_deg_rad  s    r   g     f@c                    s   |t jv r t j|dd}|g}ntj |t|d} fddt||D }|j}||krltd||f |j	t|krtd|j	t|f ||fS )z4
    Convert integer indices into tuple of intp
    r   )r9   count)r   c                    s"   g | ]\}}  ||tjqS r   )r   r   intp)rE   tir   r    r   r   rG     s   z&_normalize_indices.<locals>.<listcomp>zexpect %s but got %sz#indexing %d-D array with %d-D index)
r   Zinteger_domainUniTupler	   Zunpack_tuplelenzipr9   	TypeErrorndim)r    r   indtyindsarytyvaltyindicesr9   r   r  r   _normalize_indices  s    
r  c                    s    fdd}|S )Nc                    sj   |j \}}}|\}}}	|j}
t| |||||\}}| || ||}tj| ||||dd} | ||
||	S )NTZ
wraparound)r"   r9   r  
make_arrayr	   get_item_pointer)r    r   r!   r"   r  r
  r  aryr  r   r9   r  laryptrdispatch_fnr   r   imp  s    

z_atomic_dispatcher.<locals>.impr   )r  r  r   r  r   _atomic_dispatcher  s    r  c                 C   s`   |t jkr&|j}|t|||fS |t jkrL|j}|t|||fS |d||dS d S )Nr   	monotonic)	r   rk   rP   rT   r   Zdeclare_atomic_add_float32rq   Zdeclare_atomic_add_float64
atomic_rmwr    r   r9   r  r   rX   r   r   r   ptx_atomic_add_tuple  s    

r  c                 C   s`   |t jkr&|j}|t|||fS |t jkrL|j}|t|||fS |d||dS d S )Nr   r  )	r   rk   rP   rT   r   Zdeclare_atomic_sub_float32rq   Zdeclare_atomic_sub_float64r  r  r   r   r   ptx_atomic_sub  s    

r  c                 C   sP   |t jjv r<|j}|j}ttd| }|||||fS td| dd S )NZdeclare_atomic_inc_intzUnimplemented atomic inc with  array	r   cudadeclZunsigned_int_numba_typesri   rP   getattrr   rT   r  r    r   r9   r  r   ZbwrX   r   r   r   r   ptx_atomic_inc  s    r$  c                 C   sP   |t jjv r<|j}|j}ttd| }|||||fS td| dd S )NZdeclare_atomic_dec_intzUnimplemented atomic dec with r  r   r#  r   r   r   ptx_atomic_dec  s    r%  c                    s@   t  fdd}tjtjtjfD ]}t| tj|tj| q d S )Nc                    s6   |t jjv r| ||dS td  d| dd S )Nr  zUnimplemented atomic z with r  r   r!  integer_numba_typesr  r  r    r   r9   r  r   r   r   r   impl_ptx_atomic  s    z+ptx_atomic_bitwise.<locals>.impl_ptx_atomic)r  r   r  r  Tupler   ArrayAny)Zstubr   r)  r   r   r   r   ptx_atomic_bitwise  s    r-  andorxorc                 C   s0   |t jjv r|d||dS td| dd S )NZxchgr  zUnimplemented atomic exch with r  r&  r(  r   r   r   ptx_atomic_exch/  s    r1  c                 C   s   |j }|tjkr&|t|||fS |tjkrF|t|||fS |tjtj	fv rh|j
d||ddS |tjtjfv r|j
d||ddS td| d S Nr   r  ZorderingZumaxz&Unimplemented atomic max with %s array)rP   r   rq   rT   r   Zdeclare_atomic_max_float64rk   Zdeclare_atomic_max_float32r_   int64r  uint32uint64r  r  r   r   r   ptx_atomic_max:  s    

r7  c                 C   s   |j }|tjkr&|t|||fS |tjkrF|t|||fS |tjtj	fv rh|j
d||ddS |tjtjfv r|j
d||ddS td| d S Nr   r  r3  Zuminz&Unimplemented atomic min with %s array)rP   r   rq   rT   r   Zdeclare_atomic_min_float64rk   Zdeclare_atomic_min_float32r_   r4  r  r5  r6  r  r  r   r   r   ptx_atomic_minN  s    

r9  c                 C   s   |j }|tjkr&|t|||fS |tjkrF|t|||fS |tjtj	fv rh|j
d||ddS |tjtjfv r|j
d||ddS td| d S r2  )rP   r   rq   rT   r   Zdeclare_atomic_nanmax_float64rk   Zdeclare_atomic_nanmax_float32r_   r4  r  r5  r6  r  r  r   r   r   ptx_atomic_nanmaxb  s    

r:  c                 C   s   |j }|tjkr&|t|||fS |tjkrF|t|||fS |tjtj	fv rh|j
d||ddS |tjtjfv r|j
d||ddS td| d S r8  )rP   r   rq   rT   r   Zdeclare_atomic_nanmin_float64rk   Zdeclare_atomic_nanmin_float32r_   r4  r  r5  r6  r  r  r   r   r   ptx_atomic_nanminv  s    

r;  c                 C   sT   | |jd tj|jd |jd }|d | tjd|d |d f}t| |||S )Nr   r   r.   )r   r"   r   r  r^   ptx_atomic_casr   r   r   r   ptx_atomic_compare_and_swap  s    $"r=  c                 C   s   |j \}}}}|\}}	}
}t| |||	||\}}| || ||}tj| ||||dd}|jtjjv r|j	}|jj
}t|||||
|S td|j d S )NTr  z&Unimplemented atomic cas with %s array)r"   r  r  r	   r  r9   r   r!  r'  rP   ri   r   Zatomic_cmpxchgr  )r    r   r!   r"   r  r
  Zoldtyr  r  r  oldr   r  r  r  rX   ri   r   r   r   r<    s    r<  c                 C   s@   t jt t  t dgdddd}|d }|||g d S )Nrd   znanosleep.u32 $0;r   Tr   r   )r   r   rQ   rR   re   rT   )r    r   r!   r"   	nanosleepnsr   r   r   ptx_nanosleep  s
    rA  Fc               	      sf  t tj|d}|dko$|o$t|dk}|dkr:|s:td j| }	t|tjtj	fpjt|	t
jpj|tjk}
|tjvr|
std|  |}t||}|tjkrtj|||d}nh|j}t||||} |}d|d  > |_|rd|_nt|tj|_||t t!dd}t"#t$ j%} |}|&|}|}g }t't(|D ]\}}|)| ||9 }qXd	d
 t(|D } fdd
|D }|rtj*t+t!dg dddd}|,|-|g t!d} .tj/|}|0||g}n fdd
|D }t|}tj1||dd} 2| |} j3||4||j5j6|| .tj/|d d |7 S )Nr   r   zarray length <= 0zunsupported type: %sr3   Zexternalr   Zgenericc                 S   s   g | ]}|qS r   r   rD   r   r   r   rG     rH   z"_generic_array.<locals>.<listcomp>c                    s   g | ]}  tj|qS r   r^   r   r  rD   r    r   r   rG     rH   rd   zmov.u32 $0, %dynamic_smem_size;r   Tr   rf   c                    s   g | ]}  tj|qS r   rB  rD   rC  r   r   rG     rH   C)r9   r	  Zlayout)datar8   stridesitemsizeZmeminfo)8r   operatorr   r  
ValueErrorZdata_model_manager
isinstancer   ZRecordBooleanr   ZStructModelr   Znumber_domainr  Zget_data_typer   	ArrayTyper   rL   r	   Zalloca_oncerP   Zadd_global_variableZget_abi_sizeof
bit_lengthalignlinkageConstant	UndefinedZinitializerZaddrspacecastZPointerTypere   llZcreate_target_dataZNVVMZdata_layoutZget_abi_size	enumeratereversedappendr   rQ   ro   rT   r^   r  Zudivr+  r  Zpopulate_arrayrh   rE  typeZ	_getvalue) r    r   r8   r9   r:   r;   r<   Z	elemcountZdynamic_smemZ
data_modelZother_supported_typeZlldtypeZlarytyZdataptrrX   ZgvmemrN  Z
targetdatarG  Z
laststrideZrstridesr  ZlastsizerF  ZkstridesZget_dynshared_sizeZdynsmem_sizeZ	kitemsizeZkshaper	  r  r  r   rC  r   r>     sx    








r>   c                 C   s   |   S r   )rU   )r    r   r   Zpyvalr   r   r   cuda_dispatcher_const  s    rW  )F)	functoolsr   rH  r   Zllvmliter   Zllvmlite.bindingZbindingrR  Znumba.core.imputilsr   r   Znumba.core.typing.npydeclr   Znumba.core.datamodelr   Z
numba.corer   r	   Znumba.npr
   Znumba.np.npyimplr   Zcudadrvr   Znumbar   Z
numba.cudar   r   r   Znumba.cuda.typesr   r   registryr   Zlower_getattrZ
lower_attrZlower_constantr   Moduler#   r$   r%   r&   r(   r,   r-   r/   r   Z
array_liker+  r0   r1   r5   ZsharedarrayZIntegerLiteralr,  rB   r*  r  rJ   localrM   rN   Zthreadfence_blockr[   Zthreadfence_systemr\   Zthreadfencer]   Zsyncwarprc   i4ra   Zshfl_sync_intrinsicrn   Zf4Zf8r{   Zvote_sync_intrinsicr   r|   Zmatch_any_syncr   Zmatch_all_syncr   r   r   Zlanemask_ltr   Zpopcr   r   r   r   r   ZFloatr   r   r   ZIntegerr   r   r   r   Zhaddr   iaddZhsubr   isubZhmulr   imulZhnegr   negr   Zhabsr   absr   Zhfmar   truedivitruedivr   r   r   Zheqr   hner   Zhger   Zhgtr   hler   Zhltr   r   ZhmaxZhminrk   rq   r   Zcbrtr   ZbrevZu4r   u8r   Zclzr   Zffsr   r   Zselpr   r   r   r   r   r   r   r   r   r   r   piZ_deg2radZ_rad2degradiansdegreesr  r  Zatomicr  r  r  incr$  decr%  r-  and_rp   r0  Zexchr1  r7  r9  Znanmaxr:  Znanminr;  Zcompare_and_swapr=  Zcasr<  r?  r5  rA  r>   rW  Z
get_ufuncsr   r   r   r   <module>   s  










		
		







	.










	
















%





 
d
