a
    DfI                     @   s   d dl mZ d dlmZmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d	d
 Zedd Zedd Zedd Zeeeddddd Zedd Zdd Zedd Zedd Zedd Zd S )!    )ir)cudatypes)cgutils)RequireLiteralValue)	signature)overload_attribute)	nvvmutils)	intrinsicc                 C   sB   | j }|dkrtj}n |dv r.ttj|}ntdt|tjS )N   )      zargument can only be 1, 2, 3)Zliteral_valuer   int64UniTuple
ValueErrorr   int32)ndimvalrestype r   b/nfs/NAS7/SABIOD/METHODE/ermites/ermites_venv/lib/python3.9/site-packages/numba/cuda/intrinsics.py_type_grid_function   s    r   c                 C   s,   t |tjst|t|}dd }||fS )a  grid(ndim)

    Return the absolute position of the current thread in the entire grid of
    blocks.  *ndim* should correspond to the number of dimensions declared when
    instantiating the kernel. If *ndim* is 1, a single integer is returned.
    If *ndim* is 2 or 3, a tuple of the given number of integers is returned.

    Computation of the first integer is as follows::

        cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x

    and is similar for the other two indices, but using the ``y`` and ``z``
    attributes.
    c                 S   sJ   |j }|tjkrtj|ddS t|tjrFtj||jd}t	||S d S )Nr   )dim)
return_typer   r   r	   Zget_global_id
isinstancer   countr   
pack_array)contextbuildersigargsr   idsr   r   r   codegen1   s    
zgrid.<locals>.codegenr   r   ZIntegerLiteralr   r   	typingctxr   r   r"   r   r   r   grid   s
    r&   c                    s8   t |tjst|t|}dd   fdd}||fS )a  gridsize(ndim)

    Return the absolute size (or shape) in threads of the entire grid of
    blocks. *ndim* should correspond to the number of dimensions declared when
    instantiating the kernel. If *ndim* is 1, a single integer is returned.
    If *ndim* is 2 or 3, a tuple of the given number of integers is returned.

    Computation of the first integer is as follows::

        cuda.blockDim.x * cuda.gridDim.x

    and is similar for the other two indices, but using the ``y`` and ``z``
    attributes.
    c                 S   sJ   t d}t| d| }t| d| }| | ||| ||S )N@   zntid.znctaid.)r   IntTyper	   	call_sregmulZsext)r   r   Zi64ZntidZnctaidr   r   r   _nthreads_for_dimR   s    
z#gridsize.<locals>._nthreads_for_dimc                    sx   |j } |d}|tjkr|S t|tjrt |d}|jdkrNt|||fS |jdkrt |d}t||||fS d S )Nxyr   r   z)r   r   r   r   r   r   r   r   )r   r   r   r    r   ZnxnyZnzr+   r   r   r"   X   s    





zgridsize.<locals>.codegenr#   r$   r   r0   r   gridsize<   s    r1   c                 C   s   t tj}dd }||fS )Nc                 S   s   t |dS )Nwarpsize)r	   r)   )r   r   r   r    r   r   r   r"   n   s    z_warpsize.<locals>.codegen)r   r   r   r%   r   r"   r   r   r   	_warpsizej   s    
r4   r2   r   )targetc                 C   s   dd }|S )z_
    The size of a warp. All architectures implemented to date have a warp size
    of 32.
    c                 S   s   t  S )N)r4   )modr   r   r   getz   s    zcuda_warpsize.<locals>.getr   )r6   r7   r   r   r   cuda_warpsizet   s    r8   c                 C   s   t tj}dd }||fS )a  
    Synchronize all threads in the same thread block.  This function implements
    the same pattern as barriers in traditional multi-threaded programming: this
    function waits until all threads in the block call it, at which point it
    returns control to all its callers.
    c                 S   s<   d}|j }tt d}t|||}||d |  S )Nzllvm.nvvm.barrier0r   )moduler   FunctionTypeZVoidTyper   get_or_insert_functioncallZget_dummy_value)r   r   r   r    fnameZlmodfntysyncr   r   r   r"      s    zsyncthreads.<locals>.codegen)r   r   noner3   r   r   r   syncthreads   s    
rA   c                    s2   t |tjsd S ttjtj} fdd}||fS )Nc                    s6   t t dt df}t|j| }|||S )N    )r   r:   r(   r   r;   r9   r<   )r   r   r   r    r>   r?   r=   r   r   r"      s    z'_syncthreads_predicate.<locals>.codegen)r   r   ZIntegerr   i4)r%   	predicater=   r   r"   r   rC   r   _syncthreads_predicate   s
    rF   c                 C   s   d}t | ||S )z
    syncthreads_count(predicate)

    An extension to numba.cuda.syncthreads where the return value is a count
    of the threads where predicate is true.
    zllvm.nvvm.barrier0.popcrF   r%   rE   r=   r   r   r   syncthreads_count   s    rI   c                 C   s   d}t | ||S )z
    syncthreads_and(predicate)

    An extension to numba.cuda.syncthreads where 1 is returned if predicate is
    true for all threads or 0 otherwise.
    zllvm.nvvm.barrier0.andrG   rH   r   r   r   syncthreads_and   s    rJ   c                 C   s   d}t | ||S )z
    syncthreads_or(predicate)

    An extension to numba.cuda.syncthreads where 1 is returned if predicate is
    true for any thread or 0 otherwise.
    zllvm.nvvm.barrier0.orrG   rH   r   r   r   syncthreads_or   s    rK   N)Zllvmliter   Znumbar   r   Z
numba.corer   Znumba.core.errorsr   Znumba.core.typingr   Znumba.core.extendingr   Z
numba.cudar	   Znumba.cuda.extendingr
   r   r&   r1   r4   Moduler8   rA   rF   rI   rJ   rK   r   r   r   r   <module>   s2   
 
-
	



