a
    <Df                     @   s  d Z ddlZddlmZmZ ddlmZ ddlm	Z	 ddl
mZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZmZmZmZ ddl m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+ ddl,m-Z- g dZ.e.g d Z/e.e/ ddg Z0ddlm1Z1 dBddZ2dCd#d$Z3dDd&d'Z4dEd(d)Z5dFd*d+Z6dGd.d/Z7dHd0d1Z8d2d3 Z9d4d5 Z:d6d7 Z;d8d9 Z<d:d; Z=ddd ddd!d"d%d<eddd=d,dd-dddfd>d?Z>G d@dA dAeeZ?dS )Iz\
HDBSCAN: Hierarchical Density-Based Spatial Clustering
         of Applications with Noise
    N)BaseEstimatorClusterMixin)pairwise_distances)issparse)KDTreeBallTree)Memory)warn)check_array)	cpu_count)csgraph   )single_linkagemst_linkage_coremst_linkage_core_vectorlabel)condense_treecompute_stabilityget_clustersoutlier_scores)mutual_reachabilitysparse_mutual_reachability)KDTreeBoruvkaAlgorithmBallTreeBoruvkaAlgorithm)DistanceMetric)CondensedTreeSingleLinkageTreeMinimumSpanningTree)PredictionData)		euclideanl2	minkowskipZ	manhattanZ	cityblockl1Z	chebyshevZinfinity)Z
braycurtisZcanberraZdiceZhammingZ	haversineZjaccardZmahalanobisZrogerstanimotoZ
russellraoZ
seuclideanZsokalmichenerZsokalsneathcosinearccos)isclose
   eomF        c                 C   s:   t ||}t|}	t||	|||||\}
}}|
||||fS )z\Converts a pretrained tree and cluster size into a
    set of labels and probabilities.
    )r   r   r   )Xsingle_linkage_treemin_cluster_sizecluster_selection_methodallow_single_clustermatch_reference_implementationcluster_selection_epsilonmax_cluster_sizeZcondensed_treeZstability_dictlabelsZprobabilitiesZstabilities r3   ]/nfs/NAS7/SABIOD/METHODE/ermites/ermites_venv/lib/python3.9/site-packages/hdbscan/hdbscan_.py_tree_to_labelsA   s    


r5            ?r!      c                 K   s  |dkrt | ||d}nD|dkr6t | fddi|}n&|dkrH|  }nt | fd|i|}t|rt|||||||fi |S t|||}	t|	}
t|
jd 	 rt
dt |rR|
 }t|d	d  d	D ]|\}}tt|	t|d	  |d d
 }t||
d |d df t}|||d	 k }t|d
ksBJ |d
 |d
< qnd }|
t|
jd d d f }
t|
}||fS )Nr!   )metricr"   r%   r9   r$   precomputedr8   zThe minimum spanning tree contains edge weights with value infinity. Potentially, you are missing too many distances in the initial distance matrix for the given neighborhood size.r   r   )r   copyr   _hdbscan_sparse_distance_matrixr   r   npisinfTanyr	   UserWarning	enumeratewherer&   intZintersect1dastypelenargsortr   )r*   min_samplesalphar9   r"   	leaf_sizegen_min_span_treekwargsZdistance_matrixmutual_reachability_min_spanning_treeresult_min_span_treeindexrow
candidatesr+   r3   r3   r4   _hdbscan_generic]   sP    

$rS   (   c                 K   s   t | sJ tj| ddddkr(td|  }|dd}	t|||	|d}
tj|
ddddkrptd||f t|
}| }|| }t	
||f j}|t	|jd	 d d f d
 }t|}|r||fS |d fS d S )NF)ZdirectedZreturn_labelsr   zSparse distance matrix has multiple connected components!
That is, there exist groups of points that are completely disjoint -- there are no distance relations connecting them
Run hdbscan on each component.max_distr)   )Z
min_pointsrU   rI   zThere exists points with less than %s neighbors. Ensure your distance matrix has non zeros values for at least `min_sample`=%s neighbors for each points (i.e. K-nn graph), or specify a `max_dist` to use when distances are missing.r8   r   )r   r   Zconnected_components
ValueErrortolilgetr   Zminimum_spanning_treeZnonzeror=   vstackr?   rG   r   )r*   rH   rI   r9   r"   rJ   rK   rL   Z
lil_matrixrU   rM   Zsparse_min_spanning_treeZnonzerosZnonzero_valsrN   r+   r3   r3   r4   r<      s<    

 r<   c                 K   s   | j tjkr| tj} | jd s4tj| tjdd} t| f||d|}tj	|fi |}	|j
| |d dddd d d d	f jdd
}
t| |
|	|}|t|jd d d f }t|}|r||fS |d fS d S NZC_CONTIGUOUSC)dtypeorderr9   rJ   r   T)kZdualtreeZbreadth_firstr   )r]   r8   )r\   r=   float64rE   flagsarraydoubler   r   
get_metricqueryr;   r   rG   r?   r   r*   rH   rI   r9   r"   rJ   rK   rL   treeZdist_metricZcore_distancesrN   r+   r3   r3   r4   _hdbscan_prims_kdtree   s*    


ri   c                 K   s   | j tjkr| tj} | jd s4tj| tjdd} t| f||d|}tj	|fi |}	|j
| |d dddd d d d	f jdd
}
t| |
|	|}|t|jd d d f }t|}|r||fS |d fS d S rZ   )r\   r=   ra   rE   rb   rc   rd   r   r   re   rf   r;   r   rG   r?   r   rg   r3   r3   r4   _hdbscan_prims_balltree  s*    


rj   T   c	                 K   s   |dk rd}|dk r(t t d | d}| jtjkr@| tj} t| f||d|	}
t|
|f||d ||d|	}| }t	|j
d }||d d f }t|}|r||fS |d fS d S N   r   r^   )r9   rJ   approx_min_span_treeZn_jobsr8   )maxr   r\   r=   ra   rE   r   r   spanning_treerG   r?   r   )r*   rH   rI   r9   r"   rJ   rn   rK   core_dist_n_jobsrL   rh   algrN   Z	row_orderr+   r3   r3   r4   _hdbscan_boruvka_kdtree>  s2    	rs   c	                 K   s   |dk rd}|dk r(t t d | d}| jtjkr@| tj} t| f||d|	}
t|
|f||d ||d|	}| }|t	|j
d d d f }t|}|r||fS |d fS d S rl   )ro   r   r\   r=   ra   rE   r   r   rp   rG   r?   r   )r*   rH   rI   r9   r"   rJ   rn   rK   rq   rL   rh   rr   rN   r+   r3   r3   r4   _hdbscan_boruvka_balltreej  s0    	rt   c                 C   s"   |   }d|t|< t| dS )zaPerform check_array(X) after removing infinite values (numpy.inf) from the given distance matrix.r   N)r;   r=   r>   r
   )r*   tmpr3   r3   r4   !check_precomputed_distance_matrix  s    rv   c                 C   s   t |}t |}t| D ]>\}\}}}}	||k r:|| }n|| }|| |||	f| |< qg }
| d d }|D ]}|
||ddf qltj|
dtjfdtjfdtfdtjfgd}t|| } | S )a  
    Takes an internal condensed_tree structure and adds back in a set of points
    that were initially detected as non-finite and returns that new tree.
    These points will all be split off from the maximal node at lambda zero and
    considered noise points.

    Parameters
    ----------
    tree: condensed_tree
    internal_to_raw: dict
        a mapping from internal integer index to the raw integer index
    finite_index: ndarray
        Boolean array of which entries in the raw data were finite
    r   r   parentchild
lambda_val
child_size)r\   )rF   rB   appendr=   rc   Zintpfloat)rh   internal_to_rawoutliersfinite_countoutlier_countirw   rx   ry   rz   Zoutlier_listrootoutlieroutlier_treer3   r3   r4   remap_condensed_tree  s*    
	r   c                 C   s  t |}t |}t| D ]d\}\}}}}	||k rB|| | |df< n|| | |df< ||k rl|| | |df< q|| | |df< qtt |df}
| | jd d  dd  }| | jd d  d }t|D ]2\}}||d tj|d f|
|< |d7 }|d7 }qt| |
g} | S )a  
    Takes an internal single_linkage_tree structure and adds back in a set of points
    that were initially detected as non-finite and returns that new tree.
    These points will all be merged into the final node at np.inf distance and
    considered noise points.

    Parameters
    ----------
    tree: single_linkage_tree
    internal_to_raw: dict
        a mapping from internal integer index to the raw integer index
    finite_index: ndarray
        Boolean array of which entries in the raw data were finite
    r   r   rk   r8   rm   )rF   rB   r=   zerosshapero   infrY   )rh   r}   r~   r   r   r   leftrightdistancesizer   Zlast_cluster_idZlast_cluster_sizer   r3   r3   r4   remap_single_linkage_tree  s$    
r   c                 C   s2   t | rtt|  jS tt| S dS )zLReturns true only if all the values of a ndarray or sparse matrix are finiteN)r   r=   ZalltrueisfiniteZtocoodata)matrixr3   r3   r4   	is_finite  s    r   c                 C   sR   t | r(tdd t|  jD }n&tt| jdd| j	d kd }|S )zQReturns the indices of the purely finite rows of a sparse matrix or dense ndarrayc                 S   s$   g | ]\}}t t |r|qS r3   )r=   allr   ).0r   rQ   r3   r3   r4   
<listcomp>      z*get_finite_row_indices.<locals>.<listcomp>r   axisr   )
r   r=   rc   rB   rW   r   rC   r   sumr   )r   Zrow_indicesr3   r3   r4   get_finite_row_indices  s    &r   bestverbosec              
   K   sN  |du r|}t t|t jr0t t|t js8td|dksH|dkrPtd|dkr`tdt t|t jrzt|}t|tus|dk rtdt|tr|dkrtd	|dk rtd
|dkr|du rtd|dk rtd|r|d }|d }d}|dvrtd|dks(t| r8t	| ddd} nt
|  t|
trXt|
dd}
| jd }t|d |}|dkr~d}|	dkr&|dkrt| r|	dkrtd|	dkr|
t| ||||||fi |\}}q2|	dkr&|tvrtd|
t| ||||||fi |\}}n|	dkrj|tvrBtd|
t| ||||||fi |\}}n|	dkr|tvrtd|
t| ||||||||f	i |\}}np|	dkr|tvrtd| jd | d krtd! |
t| ||||||||f	i |\}}ntd"|	 nt| s:|tvrb|
t| ||||||fi |\}}n|tv r| jd d#kr|
t| ||||||fi |\}}n*|
t| ||||||||f	i |\}}nb| jd d#kr|
t| ||||||fi |\}}n*|
t| ||||||||f	i |\}}t| ||||||||f S )$a  Perform HDBSCAN clustering from a vector array or distance matrix.

    Parameters
    ----------
    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or             array of shape (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``.

    min_cluster_size : int, optional (default=5)
        The minimum number of samples in a group for that group to be
        considered a cluster; groupings smaller than this size will be left
        as noise.

    min_samples : int, optional (default=None)
        The number of samples in a neighborhood for a point
        to be considered as a core point. This includes the point itself.
        defaults to the min_cluster_size.

    cluster_selection_epsilon: float, optional (default=0.0)
        A distance threshold. Clusters below this value will be merged.
        See [3]_ for more information. Note that this should not be used
        if we want to predict the cluster labels for new points in future
        (e.g. using approximate_predict), as the approximate_predict function
        is not aware of this argument.

    alpha : float, optional (default=1.0)
        A distance scaling parameter as used in robust single linkage.
        See [2]_ for more information.

    max_cluster_size : int, optional (default=0)
        A limit to the size of clusters returned by the eom algorithm.
        Has no effect when using leaf clustering (where clusters are
        usually small regardless) and can also be overridden in rare
        cases by a high value for cluster_selection_epsilon. Note that
        this should not be used if we want to predict the cluster labels
        for new points in future (e.g. using approximate_predict), as
        the approximate_predict function is not aware of this argument.

    metric : string or callable, optional (default='minkowski')
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by metrics.pairwise.pairwise_distances for its
        metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square.

    p : int, optional (default=2)
        p value to use if using the minkowski metric.

    leaf_size : int, optional (default=40)
        Leaf size for trees responsible for fast nearest
        neighbour queries.

    algorithm : string, optional (default='best')
        Exactly which algorithm to use; hdbscan has variants specialised
        for different characteristics of the data. By default this is set
        to ``best`` which chooses the "best" algorithm given the nature of
        the data. You can force other options if you believe you know
        better. Options are:
            * ``best``
            * ``generic``
            * ``prims_kdtree``
            * ``prims_balltree``
            * ``boruvka_kdtree``
            * ``boruvka_balltree``

    memory : instance of joblib.Memory or string, optional
        Used to cache the output of the computation of the tree.
        By default, no caching is done. If a string is given, it is the
        path to the caching directory.

    approx_min_span_tree : bool, optional (default=True)
        Whether to accept an only approximate minimum spanning tree.
        For some algorithms this can provide a significant speedup, but
        the resulting clustering may be of marginally lower quality.
        If you are willing to sacrifice speed for correctness you may want
        to explore this; in general this should be left at the default True.

    gen_min_span_tree : bool, optional (default=False)
        Whether to generate the minimum spanning tree for later analysis.

    core_dist_n_jobs : int, optional (default=4)
        Number of parallel jobs to run in core distance computations (if
        supported by the specific algorithm). For ``core_dist_n_jobs``
        below -1, (n_cpus + 1 + core_dist_n_jobs) are used.

    cluster_selection_method : string, optional (default='eom')
        The method used to select clusters from the condensed tree. The
        standard approach for HDBSCAN* is to use an Excess of Mass algorithm
        to find the most persistent clusters. Alternatively you can instead
        select the clusters at the leaves of the tree -- this provides the
        most fine grained and homogeneous clusters. Options are:
            * ``eom``
            * ``leaf``

    allow_single_cluster : bool, optional (default=False)
        By default HDBSCAN* will not produce a single cluster, setting this
        to t=True will override this and allow single cluster results in
        the case that you feel this is a valid result for your dataset.
        (default False)

    match_reference_implementation : bool, optional (default=False)
        There exist some interpretational differences between this
        HDBSCAN* implementation and the original authors reference
        implementation in Java. This can result in very minor differences
        in clustering results. Setting this flag to True will, at a some
        performance cost, ensure that the clustering results match the
        reference implementation.

    **kwargs : optional
        Arguments passed to the distance metric

    Returns
    -------
    labels : ndarray, shape (n_samples, )
        Cluster labels for each point.  Noisy samples are given the label -1.

    probabilities : ndarray, shape (n_samples, )
        Cluster membership strengths for each point. Noisy samples are assigned
        0.

    cluster_persistence : array, shape  (n_clusters, )
        A score of how persistent each cluster is. A score of 1.0 represents
        a perfectly stable cluster that persists over all distance scales,
        while a score of 0.0 represents a perfectly ephemeral cluster. These
        scores can be guage the relative coherence of the clusters output
        by the algorithm.

    condensed_tree : record array
        The condensed cluster hierarchy used to generate clusters.

    single_linkage_tree : ndarray, shape (n_samples - 1, 4)
        The single linkage tree produced during clustering in scipy
        hierarchical clustering format
        (see http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html).

    min_spanning_tree : ndarray, shape (n_samples - 1, 3)
        The minimum spanning as an edgelist. If gen_min_span_tree was False
        this will be None.

    References
    ----------

    .. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April).
       Density-based clustering based on hierarchical density estimates.
       In Pacific-Asia Conference on Knowledge Discovery and Data Mining
       (pp. 160-172). Springer Berlin Heidelberg.

    .. [2] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the
       cluster tree. In Advances in Neural Information Processing Systems
       (pp. 343-351).

    .. [3] Malzer, C., & Baum, M. (2019). A Hybrid Approach To Hierarchical 
	   Density-based Cluster Selection. arxiv preprint 1911.02282.
    Nz2Min samples and min cluster size must be integers!r   z:Min samples and Min cluster size must be positive integersr   z)Min cluster size must be greater than oner)   z9Epsilon must be a float value greater than or equal to 0!z4Alpha must be a positive float value greater than 0!z!Leaf size must be greater than 0!r!   z/Minkowski metric given but no p value supplied!z6Minkowski metric with negative p value is not defined!F)r(   ZleafzEInvalid Cluster Selection Method: %s
Should be one of: "eom", "leaf"
r:   csraccept_sparseZforce_all_finiter   r   Zgenericz6Sparse data matrices only support algorithm 'generic'.Zprims_kdtreez.Cannot use Prim's with KDTree for this metric!Zprims_balltreez0Cannot use Prim's with BallTree for this metric!Zboruvka_kdtreez/Cannot use Boruvka with KDTree for this metric!Zboruvka_balltreez1Cannot use Boruvka with BallTree for this metric!i>  zA large dataset size and small leaf_size may induce excessive memory usage. If you are running out of memory consider increasing the ``leaf_size`` parameter.z#Unknown algorithm type %s specified<   )r=   Z
issubdtypetypeintegerrV   r|   
isinstance	TypeErrorr   r
   rv   strr   r   mincacherS   KDTREE_VALID_METRICSri   BALLTREE_VALID_METRICSrj   rs   r	   rt   FAST_METRICSr5   )r*   r,   rH   rI   r0   r1   r9   r"   rJ   	algorithmmemoryrn   rK   rq   r-   r.   r/   rL   r   r+   rO   r3   r3   r4   hdbscan  s    1

















r   c                   @   s   e Zd ZdZddddddddd	eddd
dddddddfddZd+ddZd,ddZdd Zdd Z	dd Z
d-ddZedd Zedd  Zed!d" Zed#d$ Zed%d& Zed'd( Zed)d* ZdS ).HDBSCANaf%  Perform HDBSCAN clustering from vector array or distance matrix.

    HDBSCAN - Hierarchical Density-Based Spatial Clustering of Applications
    with Noise. Performs DBSCAN over varying epsilon values and integrates
    the result to find a clustering that gives the best stability over epsilon.
    This allows HDBSCAN to find clusters of varying densities (unlike DBSCAN),
    and be more robust to parameter selection.

    Parameters
    ----------
    min_cluster_size : int, optional (default=5)
        The minimum size of clusters; single linkage splits that contain
        fewer points than this will be considered points "falling out" of a
        cluster rather than a cluster splitting into two new clusters.

    min_samples : int, optional (default=None)
        The number of samples in a neighbourhood for a point to be
        considered a core point.

    metric : string, or callable, optional (default='euclidean')
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by metrics.pairwise.pairwise_distances for its
        metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square.

    p : int, optional (default=None)
        p value to use if using the minkowski metric.

    alpha : float, optional (default=1.0)
        A distance scaling parameter as used in robust single linkage.
        See [3]_ for more information.

    cluster_selection_epsilon: float, optional (default=0.0)
                A distance threshold. Clusters below this value will be merged.
        See [5]_ for more information.

    algorithm : string, optional (default='best')
        Exactly which algorithm to use; hdbscan has variants specialised
        for different characteristics of the data. By default this is set
        to ``best`` which chooses the "best" algorithm given the nature of
        the data. You can force other options if you believe you know
        better. Options are:
            * ``best``
            * ``generic``
            * ``prims_kdtree``
            * ``prims_balltree``
            * ``boruvka_kdtree``
            * ``boruvka_balltree``

    leaf_size: int, optional (default=40)
        If using a space tree algorithm (kdtree, or balltree) the number
        of points ina leaf node of the tree. This does not alter the
        resulting clustering, but may have an effect on the runtime
        of the algorithm.

    memory : Instance of joblib.Memory or string (optional)
        Used to cache the output of the computation of the tree.
        By default, no caching is done. If a string is given, it is the
        path to the caching directory.

    approx_min_span_tree : bool, optional (default=True)
        Whether to accept an only approximate minimum spanning tree.
        For some algorithms this can provide a significant speedup, but
        the resulting clustering may be of marginally lower quality.
        If you are willing to sacrifice speed for correctness you may want
        to explore this; in general this should be left at the default True.

    gen_min_span_tree: bool, optional (default=False)
        Whether to generate the minimum spanning tree with regard
        to mutual reachability distance for later analysis.

    core_dist_n_jobs : int, optional (default=4)
        Number of parallel jobs to run in core distance computations (if
        supported by the specific algorithm). For ``core_dist_n_jobs``
        below -1, (n_cpus + 1 + core_dist_n_jobs) are used.

    cluster_selection_method : string, optional (default='eom')
        The method used to select clusters from the condensed tree. The
        standard approach for HDBSCAN* is to use an Excess of Mass algorithm
        to find the most persistent clusters. Alternatively you can instead
        select the clusters at the leaves of the tree -- this provides the
        most fine grained and homogeneous clusters. Options are:
            * ``eom``
            * ``leaf``

    allow_single_cluster : bool, optional (default=False)
        By default HDBSCAN* will not produce a single cluster, setting this
        to True will override this and allow single cluster results in
        the case that you feel this is a valid result for your dataset.

    prediction_data : boolean, optional
        Whether to generate extra cached data for predicting labels or
        membership vectors for new unseen points later. If you wish to
        persist the clustering object for later re-use you probably want
        to set this to True.
        (default False)

    match_reference_implementation : bool, optional (default=False)
        There exist some interpretational differences between this
        HDBSCAN* implementation and the original authors reference
        implementation in Java. This can result in very minor differences
        in clustering results. Setting this flag to True will, at a some
        performance cost, ensure that the clustering results match the
        reference implementation.

    **kwargs : optional
        Arguments passed to the distance metric

    Attributes
    ----------
    labels_ : ndarray, shape (n_samples, )
        Cluster labels for each point in the dataset given to fit().
        Noisy samples are given the label -1.

    probabilities_ : ndarray, shape (n_samples, )
        The strength with which each sample is a member of its assigned
        cluster. Noise points have probability zero; points in clusters
        have values assigned proportional to the degree that they
        persist as part of the cluster.

    cluster_persistence_ : ndarray, shape (n_clusters, )
        A score of how persistent each cluster is. A score of 1.0 represents
        a perfectly stable cluster that persists over all distance scales,
        while a score of 0.0 represents a perfectly ephemeral cluster. These
        scores can be guage the relative coherence of the clusters output
        by the algorithm.

    condensed_tree_ : CondensedTree object
        The condensed tree produced by HDBSCAN. The object has methods
        for converting to pandas, networkx, and plotting.

    single_linkage_tree_ : SingleLinkageTree object
        The single linkage tree produced by HDBSCAN. The object has methods
        for converting to pandas, networkx, and plotting.

    minimum_spanning_tree_ : MinimumSpanningTree object
        The minimum spanning tree of the mutual reachability graph generated
        by HDBSCAN. Note that this is not generated by default and will only
        be available if `gen_min_span_tree` was set to True on object creation.
        Even then in some optimized cases a tre may not be generated.

    outlier_scores_ : ndarray, shape (n_samples, )
        Outlier scores for clustered points; the larger the score the more
        outlier-like the point. Useful as an outlier detection technique.
        Based on the GLOSH algorithm by Campello, Moulavi, Zimek and Sander.

    prediction_data_ : PredictionData object
        Cached data used for predicting the cluster labels of new or
        unseen points. Necessary only if you are using functions from
        ``hdbscan.prediction`` (see
        :func:`~hdbscan.prediction.approximate_predict`,
        :func:`~hdbscan.prediction.membership_vector`,
        and :func:`~hdbscan.prediction.all_points_membership_vectors`).

    exemplars_ : list
        A list of exemplar points for clusters. Since HDBSCAN supports
        arbitrary shapes for clusters we cannot provide a single cluster
        exemplar per cluster. Instead a list is returned with each element
        of the list being a numpy array of exemplar points for a cluster --
        these points are the "most representative" points of the cluster.

    relative_validity_ : float
        A fast approximation of the Density Based Cluster Validity (DBCV)
        score [4]. The only differece, and the speed, comes from the fact
        that this relative_validity_ is computed using the mutual-
        reachability minimum spanning tree, i.e. minimum_spanning_tree_,
        instead of the all-points minimum spanning tree used in the
        reference. This score might not be an objective measure of the
        goodness of clusterering. It may only be used to compare results
        across different choices of hyper-parameters, therefore is only a
        relative score.

    References
    ----------

    .. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April).
       Density-based clustering based on hierarchical density estimates.
       In Pacific-Asia Conference on Knowledge Discovery and Data Mining
       (pp. 160-172). Springer Berlin Heidelberg.

    .. [2] Campello, R. J., Moulavi, D., Zimek, A., & Sander, J. (2015).
       Hierarchical density estimates for data clustering, visualization,
       and outlier detection. ACM Transactions on Knowledge Discovery
       from Data (TKDD), 10(1), 5.

    .. [3] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the
       cluster tree. In Advances in Neural Information Processing Systems
       (pp. 343-351).

    .. [4] Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and
       Sander, J., 2014. Density-Based Clustering Validation. In SDM
       (pp. 839-847).

    .. [5] Malzer, C., & Baum, M. (2019). A Hybrid Approach To Hierarchical
           Density-based Cluster Selection. arxiv preprint 1911.02282.

    r6   Nr)   r   r   r7   r   rT   r   TFrk   r(   c                 K   s   || _ || _|| _|| _|| _|| _|| _|| _|	| _|
| _	|| _
|| _|| _|| _|| _|| _|| _|| _d | _d | _d | _d | _d | _d | _d | _d S )N)r,   rH   rI   r1   r0   r9   r"   r   rJ   r   rn   rK   rq   r-   r.   r/   prediction_data_metric_kwargs_condensed_tree_single_linkage_tree_min_spanning_tree	_raw_data_outlier_scores_prediction_data_relative_validity)selfr,   rH   r0   r1   r9   rI   r"   r   rJ   r   rn   rK   rq   r-   r.   r   r/   rL   r3   r3   r4   __init__L  s2    zHDBSCAN.__init__c           
      C   sz  | j dkrt|ddd}|| _t|| _| j r|t|}|| }dd ttt||D }t	t
t|jd t
| }q|}n&t|rt|dd}|}nt| |}|  }|d	d
 || j t|fi |\| _| _| _| _| _| _| j dkrf| jsft| j||| _t| j||| _t|jd d}| j||< || _t|jd }	| j|	|< |	| _| jrv|   | S )a  Perform HDBSCAN clustering from features or distance matrix.

        Parameters
        ----------
        X : array or sparse (CSR) matrix of shape (n_samples, n_features), or                 array of shape (n_samples, n_samples)
            A feature array, or array of distances between samples if
            ``metric='precomputed'``.

        Returns
        -------
        self : object
            Returns self
        r:   r   Fr   c                 S   s   i | ]\}}||qS r3   r3   )r   xyr3   r3   r4   
<dictcomp>  s   zHDBSCAN.fit.<locals>.<dictcomp>r   )r   r   Nr`   ) r9   r
   r   r   Z_all_finiter   ziprangerF   listsetr   r   rv   
get_paramspopupdater   r   labels_probabilities_Zcluster_persistence_r   r   r   r   r   r=   fullr   r   generate_prediction_data)
r   r*   r   Zfinite_indexZ
clean_datar}   r~   rL   Z
new_labelsZnew_probabilitiesr3   r3   r4   fit}  sX    

 	

zHDBSCAN.fitc                 C   s   |  | | jS )a  Performs clustering on X and returns cluster labels.

        Parameters
        ----------
        X : array or sparse (CSR) matrix of shape (n_samples, n_features), or                 array of shape (n_samples, n_samples)
            A feature array, or array of distances between samples if
            ``metric='precomputed'``.

        Returns
        -------
        y : ndarray, shape (n_samples, )
            cluster labels
        )r   r   )r   r*   r   r3   r3   r4   fit_predict  s    
zHDBSCAN.fit_predictc                 C   s|   | j tv rp| jp| j}| j tv r&d}n$| j tv r6d}ntd| j  dS t| j	| j
|f|| j d| j| _ntd dS )z
        Create data that caches intermediate results used for predicting
        the label of new/unseen points. This data is only useful if
        you are intending to use functions from ``hdbscan.prediction``.
        ZkdtreeZballtreez,Metric {} not supported for prediction data!N)	tree_typer9   z~Cannot generate prediction data for non-vectorspace inputs -- access to the source data ratherthan mere distances is required!)r9   r   rH   r,   r   r   r	   formatr   r   condensed_tree_r   r   )r   rH   r   r3   r3   r4   r     s*    



	z HDBSCAN.generate_prediction_datac                 C   sP   t | dstd|dkr"td| j|k}| j| }| j| }tj||ddS )aO  Provide an approximate representative point for a given cluster.
        Note that this technique assumes a euclidean metric for speed of
        computation. For more general metrics use the ``weighted_cluster_medoid``
        method which is slower, but can work with the metric the model trained
        with.

        Parameters
        ----------
        cluster_id: int
            The id of the cluster to compute a centroid for.

        Returns
        -------
        centroid: array of shape (n_features,)
            A representative centroid for cluster ``cluster_id``.
        r   Model has not been fit to datar`   MCannot calculate weighted centroid for -1 cluster since it is a noise clusterr   )weightsr   )hasattrAttributeErrorrV   r   r   r   r=   Zaverage)r   
cluster_idmaskcluster_datacluster_membership_strengthsr3   r3   r4   weighted_cluster_centroid  s    



z!HDBSCAN.weighted_cluster_centroidc                 C   sz   t | dstd|dkr"td| j|k}| j| }| j| }t|fd| ji| j}|| }t	
|jdd}|| S )a/  Provide an approximate representative point for a given cluster.
        Note that this technique can be very slow and memory intensive for
        large clusters. For faster results use the ``weighted_cluster_centroid``
        method which is faster, but assumes a euclidean metric.

        Parameters
        ----------
        cluster_id: int
            The id of the cluster to compute a medoid for.

        Returns
        -------
        centroid: array of shape (n_features,)
            A representative medoid for cluster ``cluster_id``.
        r   r   r`   r   r9   r   r   )r   r   rV   r   r   r   r   r9   r   r=   Zargminr   )r   r   r   r   r   Zdist_matZmedoid_indexr3   r3   r4   weighted_cluster_medoid  s&    



zHDBSCAN.weighted_cluster_medoidc                 C   s   | j j||dS )a  Return clustering that would be equivalent to running DBSCAN* for a particular cut_distance (or epsilon)
        DBSCAN* can be thought of as DBSCAN without the border points.  As such these results may differ slightly
        from sklearns implementation of dbscan in the non-core points.

        This can also be thought of as a flat clustering derived from constant height cut through the single
        linkage tree.

        This represents the result of selecting a cut value for robust single linkage
        clustering. The `min_cluster_size` allows the flat clustering to declare noise
        points (and cluster smaller than `min_cluster_size`).

        Parameters
        ----------

        cut_distance : float
            The mutual reachability distance cut value to use to generate a flat clustering.

        min_cluster_size : int, optional
            Clusters smaller than this value with be called 'noise' and remain unclustered
            in the resulting flat clustering.

        Returns
        -------

        labels : array [n_samples]
            An array of cluster labels, one per datapoint. Unclustered points are assigned
            the label -1.
        )cut_distancer,   )single_linkage_tree_r   )r   r   r,   r3   r3   r4   dbscan_clusteringC  s    zHDBSCAN.dbscan_clusteringc                 C   s   | j d u rtdn| j S d S )Nz No prediction data was generated)r   r   r   r3   r3   r4   prediction_data_e  s    

zHDBSCAN.prediction_data_c                 C   s8   | j d ur| j S | jd ur,t| j| _ | j S tdd S Nz7No condensed tree was generated; try running fit first.)r   r   r   r   r   r3   r3   r4   outlier_scores_l  s    

zHDBSCAN.outlier_scores_c                 C   s(   | j d urt| j | j| jS tdd S r   )r   r   r-   r.   r   r   r3   r3   r4   r   y  s    
zHDBSCAN.condensed_tree_c                 C   s    | j d urt| j S tdd S )Nz<No single linkage tree was generated; try running fit first.)r   r   r   r   r3   r3   r4   r     s
    

zHDBSCAN.single_linkage_tree_c                 C   s<   | j d ur0| jd ur"t| j | jS td d S ntdd S )NzNo raw data is available; this may be due to using a precomputed metric matrix. No minimum spanning tree will be provided without raw data.zNo minimum spanning tree was generated.This may be due to optimized algorithm variations that skip explicit generation of the spanning tree.)r   r   r   r	   r   r   r3   r3   r4   minimum_spanning_tree_  s    

zHDBSCAN.minimum_spanning_tree_c                 C   s8   | j d ur| j jS | jtv r,|   | j jS tdd S )NzCurrently exemplars require the use of vector input datawith a suitable metric. This will likely change in the future, but for now no exemplars can be provided)r   Z	exemplarsr9   r   r   r   r   r3   r3   r4   
exemplars_  s    

zHDBSCAN.exemplars_c                    s  | j d ur| j S | js"tdd S | j}t|d }|d }|dd  |t t}t| tj	}d}t
|tj	 d}| j }| D ]}	|t|	d d  }
|t|	d d  }|	d d }t||}|
dkr|dkrqn |
dks|dkrt||}q|
|kr.t| |
  |
< qt||
 |
< t|| |< q|tj	krd|n|}||dkrx|n| }|ttj	k<  fd	d
t|D tfdd
t|D }|| _ | j S )NzMinimum spanning tree not present. Either HDBSCAN object was created with gen_min_span_tree=False or the tree was not generated in spite of it owing to internal optimization criteria.r   r   r8   fromtor   r`   c                    s.   g | ]&}|  |  t |  |  qS r3   )ro   r   r   )DSCDSPC_wrtr3   r4   r     s   z.HDBSCAN.relative_validity_.<locals>.<listcomp>c                    s    g | ]}|  |   qS r3   r3   r   )V_indexcluster_sizetotalr3   r4   r     r   )r   rK   r   r   r=   Zbincountr   rF   r   r   Zonesr   Z	to_pandasZiterrowsrD   ro   r   rC   r   )r   r2   sizesZ
noise_sizeZnum_clustersZmin_outlier_sepZcorrection_constZmax_distanceZmst_dfZedgelabel1label2lengthZ
correctionZscorer3   )r   r   r   r   r   r4   relative_validity_  s\    





	zHDBSCAN.relative_validity_)N)N)r6   )__name__
__module____qualname____doc__r   r   r   r   r   r   r   r   propertyr   r   r   r   r   r   r   r3   r3   r3   r4   r     sP    K

1
O
  %
"





r   )r'   r(   FFr)   r   )r6   r7   r!   r8   NF)r6   r7   r!   r8   rT   F)r6   r7   r!   r8   rT   F)r6   r7   r!   r8   rT   F)r6   r7   r!   r8   rT   TFrk   )r6   r7   r!   r8   rT   TFrk   )@r   numpyr=   Zsklearn.baser   r   Zsklearn.metricsr   Zscipy.sparser   Zsklearn.neighborsr   r   Zjoblibr   warningsr	   Zsklearn.utilsr
   Zjoblib.parallelr   r   Z_hdbscan_linkager   r   r   r   Z_hdbscan_treer   r   r   r   Z_hdbscan_reachabilityr   r   Z_hdbscan_boruvkar   r   Zdist_metricsr   Zplotsr   r   r   Z
predictionr   r   r   r   r&   r5   rS   r<   ri   rj   rs   rt   rv   r   r   r   r   r   r   r3   r3   r3   r4   <module>   s         
      
N      
F      
,      
)        
.        
++'

   