
    >[gK              	          d Z ddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZm Z  ddl!m"Z"m#Z#  edd      \  Z$Z% ee$e%d      \  Z$Z% e       jM                  e$      Z$g dZ'dh ejP                         D  ch c]
  \  } }|d    c}} z  Z)dQdZ*ejV                  jY                  de      d        Z-d Z.ejV                  jY                  dg e#e"      d        Z/d Z0ejV                  jY                  de'      ejV                  jY                  d e      d!               Z1d" Z2ejV                  jY                  d#d$      d%        Z3d& Z4d' Z5d( Z6d) Z7ejV                  jY                  d*d+d,g      d-        Z8ejV                  jY                  d.e#      d/        Z9ejV                  jY                  d0e'      d1        Z:d2 Z;d3 Z<ejV                  jY                  d4d d5i ejz                  d6ej|                  gej|                  d6gg      fd d5id6d7gd7d6ggfi d6d7gd8d9ggfg      d:        Z?ejV                  jY                  d.e#      d;        Z@ejV                  jY                  d.e#      d<        ZAd= ZBd> ZCd? ZDejV                  jY                  d@dAdBg      ejV                  jY                  dCddDg      dE               ZEdF ZFejV                  jY                  dGdHdIg      dJ        ZGejV                  jY                  dKdLdMg      dN        ZHejV                  jY                  dOd+d,g      dP        ZIyc c}} w )RzF
Tests for HDBSCAN clustering algorithm
Based on the DBSCAN test code
    N)stats)distance)HDBSCAN)CONDENSED_dtype_condense_tree_do_labelling)_OUTLIER_ENCODING)
make_blobs)fowlkes_mallows_score)_VALID_METRICSeuclidean_distances)BallTreeKDTree)StandardScaler)shuffle)assert_allcloseassert_array_equal)CSC_CONTAINERSCSR_CONTAINERS   
   )	n_samplesrandom_state   )r   )kd_tree	ball_treebruteautolabelc                 r    t        t        |       t        z
        }|dk(  sJ t        | t              |kD  sJ y )N   )lensetOUTLIER_SETr   y)labels	threshold
n_clusterss      ]/var/www/html/bid-api/venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_hdbscan.pycheck_label_qualityr+   )   s6    S[;./J?? +i777    outlier_typec                    t         j                  t         j                  d|    }d d d|    }t        |    d   }t        |    d   }t        j                         }|dg|d<   ||g|d<   t               j                  |      }|j                  |k(  j                         \  }t        |ddg        ||j                  |      j                         \  }t        |ddg       t        t        dd            t        t        d	d
            z   }	t               j                  ||	         }
t        |
j                  |j                  |	          y)O
    Tests if np.inf and np.nan data are each treated as special outliers.
    )infinitemissingc                     | |k(  S N xr&   s     r*   <lambda>z#test_outlier_data.<locals>.<lambda>9   s    ar,   c                 ,    t        j                  |       S r3   )npisnanr5   s     r*   r7   z#test_outlier_data.<locals>.<lambda>:   s    r,   r    prob   r         r   N)r9   infnanr	   Xcopyr   fitlabels_nonzeror   probabilities_listrange)r-   outlier
prob_checkr    r;   	X_outliermodelmissing_labels_idxmissing_probs_idxclean_indicesclean_models              r*   test_outlier_datarQ   /   s8    FF66 G
 (+ J l+G4E\*62DIQ<IaLW%IaLIMM)$E"]]e3<<>)Aq62&u';';TBKKM(1a&1q!%U1c](;;M)--	- 89K{**EMM-,HIr,   c                     t        t              } | j                         }t        dd      j	                  |       }t        | |       t        |       d}t        j                  t        |      5  t        dd      j	                  t               ddd       d}d| d	<   d
| d<   t        j                  t        |      5  t        d      j	                  |        ddd       y# 1 sw Y   UxY w# 1 sw Y   yxY w)zy
    Tests that HDBSCAN works with precomputed distance matrices, and throws the
    appropriate errors when needed.
    precomputedT)metricrB   z*The precomputed distance matrix.*has shapematchNz'The precomputed distance matrix.*valuesr   )r   r<   r<   )r<   r   rT   )
r   rA   rB   r   fit_predictr   r+   pytestraises
ValueError)D
D_originalr'   msgs       r*   test_hdbscan_distance_matrixr_   O   s    
 	AAJM5AA!DFAz"
7C	z	-}40<<Q? 
. 5CAdGAdG	z	-}%11!4 
.	- 
.	- 
.	-s   0!C% C1%C.1C:sparse_constructorc                 `   t        j                  t        j                  t                    }|t	        j
                  |      z  }t        j                  |j                         d      }d|||k\  <    | |      }|j                          t        d      j                  |      }t        |       y)zA
    Tests that HDBSCAN works with sparse distance matrices.
    2           rS   rW   N)r   
squareformpdistrA   r9   maxr   scoreatpercentileflatteneliminate_zerosr   rX   r+   )r`   r\   r(   r'   s       r*   #test_hdbscan_sparse_distance_matrixrj   g   s    
 	HNN1-.ANA''		R8IAa9n1AM*66q9Fr,   c                  T    t               j                  t              } t        |        y)z
    Tests that HDBSCAN works with feature array, including an arbitrary
    goodness of fit check. Note that the check is a simple heuristic.
    N)r   rX   rA   r+   r'   s    r*   test_hdbscan_feature_arrayrm   y   s     
 Y""1%F r,   algorT   c                    t        |       j                  t              }t        |       | dv ryt        t
        d}dt        j                  t        j                  d         idt        j                  t        j                  d         iddidt        j                  t        j                  d         d	d
j                  |d      }t        | ||      }|||    j                  vr8t        j                  t              5  |j                  t               ddd       y|dk(  r8t        j                   t"              5  |j                  t               ddd       y|j                  t               y# 1 sw Y   yxY w# 1 sw Y   yxY w)z
    Tests that HDBSCAN works with the expected combinations of algorithms and
    metrics, or raises the expected errors.
    )	algorithm)r   r   N)r   r   Vr<   p   )rr   w)mahalanobis
seuclidean	minkowski
wminkowski)rp   rT   metric_paramsrx   )r   rX   rA   r+   r   r   r9   eyeshapeonesgetvalid_metricsrY   rZ   r[   rC   warnsFutureWarning)rn   rT   r'   ALGOS_TREESry   hdbs         r*   test_hdbscan_algorithmsr      s9    t$003F    K
 RVVAGGAJ/0BGGAGGAJ/01XBGGAGGAJ$78	
 
c&$  #C [&444]]:&GGAJ '&	<	\\-(GGAJ )( 	
 '& )(s   E3>E?3E<?Fc                  z    t               j                  t              } | j                  d      }t	        |d       y)z
    Tests that HDBSCAN can generate a sufficiently accurate dbscan clustering.
    This test is more of a sanity check than a rigorous evaluation.
    333333?gq=
ףp?)r(   N)r   rC   rA   dbscan_clusteringr+   )	clustererr'   s     r*   test_dbscan_clusteringr      s0    
 	a I((-F $/r,   cut_distance)皙?      ?r<   c                    t         d   d   }t         d   d   }t        j                         }t        j                  dg|d<   dt        j
                  g|d<   t        j                  t        j
                  g|d<   t               j                  |      }|j                  |       }t        j                  ||k(        }t        |ddg       t        j                  ||k(        }t        |dg       t        t        t        d	            t        ||z         z
        }t               j                  ||         }	|	j                  |       }
t        |
||          y
)r/   r1   r    r0   r<   r   rs   r=   )r   r   N)r	   rA   rB   r9   r?   r@   r   rC   r   flatnonzeror   rG   r$   rH   )r   missing_labelinfinite_labelrK   rL   r'   rM   infinite_labels_idx	clean_idxrP   clean_labelss              r*   #test_dbscan_clustering_outlier_datar      s&   
 &i09M&z27;NIFFA;IaLrvv;IaLFFBFF#IaLIMM)$E$$,$?F-(?@)Aq62..>)AB*QC0Ss_s+=@S+S'TTUI)--	) 45K00l0KL|VI%67r,   c                      t        ddt        j                  t        j                  d         i      j                  t              } t        |        y)z4
    Tests that HDBSCAN using `BallTree` works.
    rv   rq   r<   )rT   ry   N)r   r9   r|   rA   r{   rX   r+   rl   s    r*   !test_hdbscan_best_balltree_metricr      s?     C1D+Ek!n  r,   c                      t        t        t              dz
        j                  t              } t	        |       j                  t              sJ y)z
    Tests that HDBSCAN correctly does not generate a valid cluster when the
    `min_cluster_size` is too large for the data.
    r<   min_cluster_sizeN)r   r#   rA   rX   r$   issubsetr%   rl   s    r*   test_hdbscan_no_clustersr      s9    
 c!fqj1==a@Fv;,,,r,   c                  ,   t        dt        t              d      D ]r  } t        |       j	                  t              }|D cg c]
  }|dk7  s	| }}t        |      dk7  sFt        j                  t        j                  |            | k\  rrJ  yc c}w )zb
    Test that the smallest non-noise cluster has at least `min_cluster_size`
    many points
    rs   r<   r   r   r   N)rH   r#   rA   r   rX   r9   minbincount)r   r'   r    true_labelss       r*   test_hdbscan_min_cluster_sizer      s}    
 "!SVQ/*:;GGJ*0@&ERKu&@{q 66"++k237GGGG	 0@s   
BBc                  x    t         j                  } t        |       j                  t              }t        |       y)zA
    Tests that HDBSCAN works when passed a callable metric.
    rW   N)r   	euclideanr   rX   rA   r+   )rT   r'   s     r*   test_hdbscan_callable_metricr      s,     FF#//2Fr,   treer   r   c                     t        d|       }d}t        j                  t        |      5  |j	                  t
               ddd       y# 1 sw Y   yxY w)z
    Tests that HDBSCAN correctly raises an error when passing precomputed data
    while requesting a tree-based algorithm.
    rS   rT   rp   z%precomputed is not a valid metric forrU   N)r   rY   rZ   r[   rC   rA   )r   r   r^   s      r*   "test_hdbscan_precomputed_non_bruter      s:     $
7C
1C	z	-
 
.	-	-s   A

Acsr_containerc                 0   t               j                  t              j                  }t	        |        | t              }|j                         }t               j                  |      j                  }t        ||       t        j                  dft        j                  dffD ]  \  }}t        j                         }||d<   t               j                  |      j                  }t	        |       |d   t        |   d   k(  sJ |j                         }||d<   t               j                  |      j                  }t        ||        d}t        j                  t        |      5  t        dd	
      j                  |       ddd       y# 1 sw Y   yxY w)z
    Tests that HDBSCAN works correctly when passing sparse feature data.
    Evaluates correctness by comparing against the same data passed as a dense
    array.
    r0   r1   r   r   r   r    z4Sparse data matrices only support algorithm `brute`.rU   r   r   r   N)r   rC   rA   rD   r+   rB   r   r9   r?   r@   r	   rY   rZ   r[   )	r   dense_labels	_X_sparseX_sparsesparse_labelsoutlier_valr-   X_denser^   s	            r*   test_hdbscan_sparser   
  s<    9==#++L%a I~~HIMM(+33M|]3 (*vvz&:RVVY<O%P!\&&(#y}}W-55L)A"3L"A'"JJJJ>>#$	h/77<7 &Q AC	z	-{k:>>xH 
.	-	-s   &FFrp   c                    ddg}t        dd|d      \  }}t        d      j                  |      }t        ||j                  |j
                        D ]$  \  }}}t        ||d	d
       t        ||d	d
       & t        | dt        j                  d         j                  t              }|j                  j                  d   dk(  sJ |j
                  j                  d   dk(  sJ y)zj
    Tests that HDBSCAN centers are calculated and stored properly, and are
    accurate to the data.
    )rc   rc   )      @r   i  r   r   )r   r   centerscluster_stdboth)store_centersr<   g?)rtolatol)rp   r   r   N)	r
   r   rC   zip
centroids_medoids_r   rA   r{   )rp   r   H_r   centercentroidmedoids           r*   test_hdbscan_centersr   -  s     :&G1gSVWDAq

'
+
+A
.C$'$N &qt<QT: %O
 6AGGAJ	c!f  >>"a'''<<a A%%%r,   c                     t         j                  j                  d      } | j                  dd      }t	        dddd      j                  |      }t        j                  |d	      \  }}t        |      dk(  sJ ||d
k(     dkD  sJ t	        ddddd      j                  |      }t        j                  |d	      \  }}t        |      dk(  sJ ||d
k(     dk(  sJ y)zS
    Tests that HDBSCAN single-cluster selection with epsilon works correctly.
    r      rs   r=   rc   eomT)r   cluster_selection_epsiloncluster_selection_methodallow_single_cluster)return_countsr      g
ףp=
?r   )r   r   r   r   rp   N)r9   randomRandomStaterandr   rX   uniquer#   )rngno_structurer'   unique_labelscountss        r*   .test_hdbscan_allow_single_cluster_with_epsilonr   C  s     ))


"C88C#L"%!&!	
 k,  IIfDAM6}""" -2%&+++ "&!&! k,  IIfDAM6}"""-2%&!+++r,   c                      ddgddgddgddgg} t        d| g dd      \  }}t               j                  |      j                  }t	        t        |            t        d	|v       z
  }|d
k(  sJ t        ||      dkD   y)z
    Validate that HDBSCAN can properly cluster this difficult synthetic
    dataset. Note that DBSCAN fails on this (see HDBSCAN plotting
    example)
    g333333g333333?r"   i  )皙?gffffff?皙?r   r   )r   r   r   r   r      Gz?N)r
   r   rC   rD   r#   r$   intr   )r   rA   r&   r'   r)   s        r*   test_hdbscan_better_than_dbscanr   d  s     u~t}q!fq"g>G+	DAq Y]]1%%FS[!Cf$55J??&!$t+r,   z	kwargs, XrS   r<   rs   r"   r   c                 <    t        dddi|j                  |        y)zo
    Tests that HDBSCAN works correctly for array-likes and precomputed inputs
    with non-finite points.
    min_samplesr<   Nr4   )r   rC   )rA   kwargss     r*   test_hdbscan_usable_inputsr   x  s     $$V$((+r,   c                      | t        j                  d            }d}t        j                  t        |      5  t        d      j                  |       ddd       y# 1 sw Y   yxY w)zd
    Tests that HDBSCAN raises the correct error when there are too few
    non-zero distances.
    )r   r   z#There exists points with fewer thanrU   rS   rW   N)r9   zerosrY   rZ   r[   r   rC   r   rA   r^   s      r*   -test_hdbscan_sparse_distances_too_few_nonzeror     sI     	bhhx()A
/C	z	-}%))!, 
.	-	-s   AA'c                 "   t        j                  d      }d|ddddf<   d|ddddf<   ||j                  z   } | |      }d}t        j                  t
        |      5  t        d	      j                  |       ddd       y# 1 sw Y   yxY w)
zu
    Tests that HDBSCAN raises the correct error when the distance matrix
    has multiple connected components.
    )   r   r<   Nr=      z2HDBSCAN cannot be perfomed on a disconnected graphrU   rS   rW   )r9   r   TrY   rZ   r[   r   rC   r   s      r*   0test_hdbscan_sparse_distances_disconnected_graphr     s     	AAbqb"1"fIAab"#gJ	ACCAaA
>C	z	-}%))!, 
.	-	-s    BBc                     d } d}t        j                  t        |      5  t        d|       j	                  t
               ddd       t        j                  t        |      5  t        d|       j	                  t
               ddd       t        t        t        j                        t        t        j                        z
        }t        |      dkD  rHt        j                  t        |      5  t        d|d         j	                  t
               ddd       yy# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   yxY w)	zR
    Tests that HDBSCAN correctly raises an error for invalid metric choices.
    c                     | S r3   r4   )r6   s    r*   r7   z2test_hdbscan_tree_invalid_metric.<locals>.<lambda>  s    r,   zV.* is not a valid metric for a .*-based algorithm\. Please select a different metric\.rU   r   )rp   rT   Nr   r   )rY   rZ   r[   r   rC   rA   rG   r$   r   r~   r   r#   )metric_callabler^   metrics_not_kds      r*    test_hdbscan_tree_invalid_metricr     s     "O	  
z	-)O<@@C 
.	z	-+o>BB1E 
.
 #h445F<P<P8QQRN
>Q]]:S1iq0ABFFqI 21  
.	-	-	- 21s#   !D!%!D-3$D9!D*-D69Ec                      t        t        t              dz         } d}t        j                  t
        |      5  | j                  t               ddd       y# 1 sw Y   yxY w)zx
    Tests that HDBSCAN correctly raises an error when setting `min_samples`
    larger than the number of samples.
    r<   )r   z min_samples (.*) must be at mostrU   N)r   r#   rA   rY   rZ   r[   rC   )r   r^   s     r*   !test_hdbscan_too_many_min_samplesr     s@    
 c!fqj
)C
-C	z	-
 
.	-	-s   AA"c                      t         j                         } t        j                  | d<   d}t	        d      }t        j                  t        |      5  |j                  |        ddd       y# 1 sw Y   yxY w)zu
    Tests that HDBSCAN correctly raises an error when providing precomputed
    distances with `np.nan` values.
    r   z(np.nan values found in precomputed-denserS   rW   rU   N)	rA   rB   r9   r@   r   rY   rZ   r[   rC   )X_nanr^   r   s      r*   "test_hdbscan_precomputed_dense_nanr     sP    
 FFHE&&E$K
4C

'C	z	- 
.	-	-s   A,,A5r   TFepsilonr   c                 P   d}t        || ddgddgddgg      \  }}t               j                  |      }t        |j                  |j
                        }|dz   |dz   |dz   h}|dz   d|dz   d	|dz   di}	t        |||	||
      }
t        t        |            D ci c]!  }|t        j                  ||k(        d   d   # }}t        t        |            D ci c]  }||
||       }} t        j                  |j                        |      }t        |
|       yc c}w c c}w )zR
    Tests that the `_do_labelling` helper function correctly assigns labels.
    0   r   r   )r   r   r   rs   r"   r   r<   condensed_treeclusterscluster_label_mapr   r   N)r
   r   rC   r   _single_linkage_tree_r   r   rG   r$   r9   where	vectorizer}   r   )global_random_seedr   r   r   rA   r&   estr   r   r   r'   _yfirst_with_labely_to_labelsaligned_targets                  r*   test_labelling_distinctr    sL    I' FGG
		DAq )--
C#!!C4H4HN Ay1}i!m<H"Q9q=!Y]AN%+1")F ?C3q6lKlBHHQ"W-a033lK>B3q6lKl2v.r233lKK2R\\+//215Nv~. LKs   &DD#c                  L   d} d}t        j                  dd|dfddd|dfddgt        	      }t        || h| d| dz   did
d      }|d   dk  }t	        |      t	        |dk(        k(  sJ t        || h| d| dz   did
d      }|d   |k  }t	        |      t	        |dk(        k(  sJ y)z
    Tests that the `_do_labelling` helper function correctly thresholds the
    incoming lambda values given various `cluster_selection_epsilon` values.
    r=   g      ?rs   r<   )r=   r<   r   r<   r   )r=   r"   r   r<   )r=   r   r   r<   )dtypeTr   valuer   N)r9   arrayr   r   sum)r   
MAX_LAMBDAr   r'   	num_noises        r*   test_labelling_thresholdingr
    s    
 IJXX:q!:q!	
 	N %$aQ:!"#F w'!+Iy>S2....%$aQ:!"#F w'*4Iy>S2....r,   r   r   r   c                    t         j                  j                  d      }|j                  d      }t        |      }d}t	        j
                  t        |      5  t        d|       j                  |       ddd       y# 1 sw Y   yxY w)zCheck that we raise an error if the centers are requested together with
    a precomputed input matrix.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27893
    r   )d   rs   z>Cannot store centers when using a precomputed distance matrix.rU   rS   )rT   r   N)	r9   r   r   r   rY   rZ   r[   r   rC   )r   r   rA   X_disterr_msgs        r*   0test_hdbscan_error_precomputed_and_store_centersr  %  sd     ))


"C

8A #FNG	z	1}MBFFvN 
2	1	1s   A??B
valid_algor   r   c                 D    t        d|       j                  t               y)zTest that HDBSCAN works with the "cosine" metric when the algorithm is set
    to "brute" or "auto".

    Non-regression test for issue #28631
    cosiner   N)r   rX   rA   )r  s    r*   *test_hdbscan_cosine_metric_valid_algorithmr  5  s     8z2>>qAr,   invalid_algoc                     t        d|       }t        j                  t        d      5  |j	                  t
               ddd       y# 1 sw Y   yxY w)zTest that HDBSCAN raises an informative error is raised when an unsupported
    algorithm is used with the "cosine" metric.
    r  r   zcosine is not a valid metricrU   N)r   rY   rZ   r[   rX   rA   )r  hdbscans     r*   ,test_hdbscan_cosine_metric_invalid_algorithmr  ?  s:    
 X>G	z)G	HA 
I	H	Hs   AA)r   )J__doc__numpyr9   rY   scipyr   scipy.spatialr   sklearn.clusterr   sklearn.cluster._hdbscan._treer   r   r    sklearn.cluster._hdbscan.hdbscanr	   sklearn.datasetsr
   sklearn.metricsr   sklearn.metrics.pairwiser   r   sklearn.neighborsr   r   sklearn.preprocessingr   sklearn.utilsr   sklearn.utils._testingr   r   sklearn.utils.fixesr   r   rA   r&   fit_transform
ALGORITHMSitemsr%   r+   markparametrizerQ   r_   rj   rm   r   r   r   r   r   r   r   r   r   r   r   r   r  r?   r   r   r   r   r   r   r  r
  r  r  r  )r   outs   00r*   <module>r-     s  
    " # 
 ? ' 1 H . 0 ! F >Cb11q!!$1""1%
 d1H1B1H1H1JK1Jvq#c'l1JKK8 ):;J <J>50 -/Q/Q./QR  S "	  ,>2$ 3 -$N
0 78 884 -	H  )[!9: ; .9I :ID j1& 2&*,B,( 
M	"HBHHq"&&kBFFA;-G$HI
M	"aVaV$45	q!fq!f,, .9	- :	- .9- :- J0
 /$?QH-!/ . @!/H&/R :x*@AO BO '89B :B )[)AB Cu Ls   0M