
    -iK              	          S r SSKrSSKrSSKJr  SSKJr  SSKJ	r	  SSK
JrJrJr  SSKJr  SSKJr  SS	KJr  SS
KJrJr  SSKJrJr  SSKJr  SSKJr  SSKJrJ r   SSK!J"r"J#r#  \" SSS9u  r$r%\" \$\%SS9u  r$r%\" 5       RM                  \$5      r$/ SQr'S1\RP                  " 5        V Vs1 s H
  u  pUS   iM     snn -  r)SQS jr*\RV                  RY                  S\5      S 5       r-S r.\RV                  RY                  S/ \#Q\"Q5      S 5       r/S r0\RV                  RY                  S\'5      \RV                  RY                  S \5      S! 5       5       r1S" r2\RV                  RY                  S#S$5      S% 5       r3S& r4S' r5S( r6S) r7\RV                  RY                  S*S+S,/5      S- 5       r8\RV                  RY                  S.\#5      S/ 5       r9\RV                  RY                  S0\'5      S1 5       r:S2 r;S3 r<\RV                  RY                  S4S S50\Rz                  " S6\R|                  /\R|                  S6//5      4S S50S6S7/S7S6//40 S6S7/S8S9//4/5      S: 5       r?\RV                  RY                  S.\#5      S; 5       r@\RV                  RY                  S.\#5      S< 5       rAS= rBS> rCS? rD\RV                  RY                  S@SASB/5      \RV                  RY                  SCSSD/5      SE 5       5       rESF rF\RV                  RY                  SGSHSI/5      SJ 5       rG\RV                  RY                  SKSLSM/5      SN 5       rH\RV                  RY                  SOS+S,/5      SP 5       rIgs  snn f )RzF
Tests for HDBSCAN clustering algorithm
Based on the DBSCAN test code
    N)stats)distance)HDBSCAN)CONDENSED_dtype_condense_tree_do_labelling)_OUTLIER_ENCODING)
make_blobs)fowlkes_mallows_score)_VALID_METRICSeuclidean_distances)BallTreeKDTree)StandardScaler)shuffle)assert_allcloseassert_array_equal)CSC_CONTAINERSCSR_CONTAINERS   
   )	n_samplesrandom_state   )r   )kd_tree	ball_treebruteautolabelc                 v    [        [        U 5      [        -
  5      nUS:X  d   e[        U [        5      U:  d   eg )N   )lensetOUTLIER_SETr   y)labels	threshold
n_clusterss      U/var/www/html/venv/lib/python3.13/site-packages/sklearn/cluster/tests/test_hdbscan.pycheck_label_qualityr+   )   s6    S[;./J?? +i777    outlier_typec                    [         R                  [         R                  S.U    nS S S.U    n[        U    S   n[        U    S   n[        R                  5       nUS/US'   X/US'   [        5       R                  U5      nUR                  U:H  R                  5       u  n[        USS/5        U" UR                  U5      R                  5       u  n[        USS/5        [        [        SS5      5      [        [        S	S
5      5      -   n	[        5       R                  XY   5      n
[        U
R                  UR                  U	   5        g)G
Tests if np.inf and np.nan data are each treated as special outliers.
)infinitemissingc                 
    X:H  $ N xr&   s     r*   <lambda>#test_outlier_data.<locals>.<lambda>9   s    r,   c                 .    [         R                  " U 5      $ r3   )npisnanr5   s     r*   r7   r8   :   s    r,   r    prob   r         r   N)r:   infnanr	   Xcopyr   fitlabels_nonzeror   probabilities_listrange)r-   outlier
prob_checkr    r<   	X_outliermodelmissing_labels_idxmissing_probs_idxclean_indicesclean_models              r*   test_outlier_datarR   /   s4    FF66 G
 (+ J l+G4E\*62DIQ<IaL%IaLIMM)$E"]]e3<<>)Aq62&u';';TBKKM(1a&1q!%U1c](;;M)--	 89K{**EMM-,HIr,   c                     [        [        5      n U R                  5       n[        SSS9R	                  U 5      n[        X5        [        U5        Sn[        R                  " [        US9   [        SSS9R	                  [        5        SSS5        SnSU S	'   S
U S'   [        R                  " [        US9   [        SS9R	                  U 5        SSS5        g! , (       d  f       NU= f! , (       d  f       g= f)zm
Tests that HDBSCAN works with precomputed distance matrices, and throws the
appropriate errors when needed.
precomputedT)metricrC   z*The precomputed distance matrix.*has shapematchNz'The precomputed distance matrix.*valuesr   )r   r=   r=   )r=   r   rU   )
r   rB   rC   r   fit_predictr   r+   pytestraises
ValueError)D
D_originalr'   msgs       r*   test_hdbscan_distance_matrixr`   O   s    
 	AAJM5AA!DFA"
7C	z	-}40<<Q? 
. 5CAdGAdG	z	-}%11!4 
.	- 
.	- 
.	-s   *C5C(
C%(
C6sparse_constructorc                 `   [         R                  " [         R                  " [        5      5      nU[        R
                  " U5      -  n[        R                  " UR                  5       S5      nSXU:  '   U " U5      nUR                  5         [        SS9R                  U5      n[        U5        g)z9
Tests that HDBSCAN works with sparse distance matrices.
2           rT   rX   N)r   
squareformpdistrB   r:   maxr   scoreatpercentileflatteneliminate_zerosr   rY   r+   )ra   r]   r(   r'   s       r*   #test_hdbscan_sparse_distance_matrixrk   g   s    
 	HNN1-.ANA''		R8IA9n1AM*66q9Fr,   c                  T    [        5       R                  [        5      n [        U 5        g)z
Tests that HDBSCAN works with feature array, including an arbitrary
goodness of fit check. Note that the check is a simple heuristic.
N)r   rY   rB   r+   r'   s    r*   test_hdbscan_feature_arrayrn   y   s     
 Y""1%F r,   algorU   c                 0   [        U S9R                  [        5      n[        U5        U S;   a  g[        [
        S.nS[        R                  " [        R                  S   5      0S[        R                  " [        R                  S   5      0SS0S[        R                  " [        R                  S   5      S	.S
.R                  US5      n[        U UUS9nXU    R                  ;  a9  [        R                  " [        5         UR                  [        5        SSS5        gUS:X  a9  [        R                   " ["        5         UR                  [        5        SSS5        gUR                  [        5        g! , (       d  f       g= f! , (       d  f       g= f)zs
Tests that HDBSCAN works with the expected combinations of algorithms and
metrics, or raises the expected errors.
)	algorithm)r   r   N)r   r   Vr=   p   )rs   w)mahalanobis
seuclidean	minkowski
wminkowski)rq   rU   metric_paramsry   )r   rY   rB   r+   r   r   r:   eyeshapeonesgetvalid_metricsrZ   r[   r\   rD   warnsFutureWarning)ro   rU   r'   ALGOS_TREESrz   hdbs         r*   test_hdbscan_algorithmsr      s7    t$003F    K
 RVVAGGAJ/0BGGAGGAJ/01XBGGAGGAJ$78	
 
c&$  #C &444]]:&GGAJ '&	<	\\-(GGAJ )( 	
 '& )(s   E6F6
F
Fc                  t    [        5       R                  [        5      n U R                  S5      n[	        USS9  g)z
Tests that HDBSCAN can generate a sufficiently accurate dbscan clustering.
This test is more of a sanity check than a rigorous evaluation.
333333?gq=
ףp?)r(   N)r   rD   rB   dbscan_clusteringr+   )	clustererr'   s     r*   test_dbscan_clusteringr      s0    
 	a I((-F $/r,   cut_distance)皙?      ?r=   c                    [         S   S   n[         S   S   n[        R                  5       n[        R                  S/US'   S[        R
                  /US'   [        R                  [        R
                  /US'   [        5       R                  U5      nUR                  U S9n[        R                  " XQ:H  5      n[        USS/5        [        R                  " XR:H  5      n[        US/5        [        [        [        S	5      5      [        Xg-   5      -
  5      n[        5       R                  X8   5      n	U	R                  U S9n
[        XU   5        g
)r/   r1   r    r0   r=   r   rt   r>   )r   r   N)r	   rB   rC   r:   r@   rA   r   rD   r   flatnonzeror   rH   r$   rI   )r   missing_labelinfinite_labelrL   rM   r'   rN   infinite_labels_idx	clean_idxrQ   clean_labelss              r*   #test_dbscan_clustering_outlier_datar      s   
 &i09M&z27;NIFFA;IaLrvv;IaLFFBFF#IaLIMM)$E$$,$?F(?@)Aq62..)AB*QC0Ss_s+=+S'TTUI)--	 45K00l0KL|I%67r,   c                      [        SS[        R                  " [        R                  S   5      0S9R                  [        5      n [        U 5        g)z,
Tests that HDBSCAN using `BallTree` works.
rw   rr   r=   )rU   rz   N)r   r:   r}   rB   r|   rY   r+   rm   s    r*   !test_hdbscan_best_balltree_metricr      s?     C1D+Ek!n  r,   c                      [        [        [        5      S-
  S9R                  [        5      n [	        U 5      R                  [        5      (       d   eg)zw
Tests that HDBSCAN correctly does not generate a valid cluster when the
`min_cluster_size` is too large for the data.
r=   min_cluster_sizeN)r   r#   rB   rY   r$   issubsetr%   rm   s    r*   test_hdbscan_no_clustersr      s<    
 c!fqj1==a@Fv;,,,,r,   c                  <   [        S[        [        5      S5       Hy  n [        U S9R	                  [        5      nU Vs/ s H  o"S:w  d  M
  UPM     nn[        U5      S:w  d  MI  [
        R                  " [
        R                  " U5      5      U :  a  My   e   gs  snf )zV
Test that the smallest non-noise cluster has at least `min_cluster_size`
many points
rt   r=   r   r   r   N)rI   r#   rB   r   rY   r:   minbincount)r   r'   r    true_labelss       r*   test_hdbscan_min_cluster_sizer      s{    
 "!SVQ/*:;GGJ*0@&RKu&@{q 66"++k237GGGG	 0@s   	BBc                  r    [         R                  n [        U S9R                  [        5      n[        U5        g)z9
Tests that HDBSCAN works when passed a callable metric.
rX   N)r   	euclideanr   rY   rB   r+   )rU   r'   s     r*   test_hdbscan_callable_metricr      s,     FF#//2Fr,   treer   r   c                     [        SU S9nSn[        R                  " [        US9   UR	                  [
        5        SSS5        g! , (       d  f       g= f)zu
Tests that HDBSCAN correctly raises an error when passing precomputed data
while requesting a tree-based algorithm.
rT   rU   rq   z%precomputed is not a valid metric forrV   N)r   rZ   r[   r\   rD   rB   )r   r   r_   s      r*   "test_hdbscan_precomputed_non_bruter      s:     $
7C
1C	z	-
 
.	-	-s   A
Acsr_containerc                 ,   [        5       R                  [        5      R                  n[	        U5        U " [        5      nUR                  5       n[        5       R                  U5      R                  n[        X5        [        R                  S4[        R                  S44 H  u  pV[        R                  5       nXWS'   [        5       R                  U5      R                  n[	        U5        US   [        U   S   :X  d   eUR                  5       nXSS'   [        5       R                  U5      R                  n[        X5        M     Sn[        R                  " [        US9   [        SS	S
9R                  U5        SSS5        g! , (       d  f       g= f)z
Tests that HDBSCAN works correctly when passing sparse feature data.
Evaluates correctness by comparing against the same data passed as a dense
array.
r0   r1   r   r   r   r    z4Sparse data matrices only support algorithm `brute`.rV   r   r   r   N)r   rD   rB   rE   r+   rC   r   r:   r@   rA   r	   rZ   r[   r\   )	r   dense_labels	_X_sparseX_sparsesparse_labelsoutlier_valr-   X_denser_   s	            r*   test_hdbscan_sparser   
  s2    9==#++L%a I~~HIMM(+33M|3 (*vvz&:RVVY<O%P!&&(#y}}W-55L)A"3L"A'"JJJJ>>#$	h/77<7 &Q AC	z	-{k:>>xH 
.	-	-s   "F
Frq   c                    SS/n[        SSUSS9u  p#[        SS9R                  U5      n[        XR                  UR
                  5       H  u  pVn[        XVS	S
S9  [        XWS	S
S9  M     [        U S[        R                  S   S9R                  [        5      nUR                  R                  S   S:X  d   eUR
                  R                  S   S:X  d   eg)z^
Tests that HDBSCAN centers are calculated and stored properly, and are
accurate to the data.
)rd   rd   )      @r   i  r   r   )r   r   centerscluster_stdboth)store_centersr=   g?)rtolatol)rq   r   r   N)	r
   r   rD   zip
centroids_medoids_r   rB   r|   )rq   r   H_r   centercentroidmedoids           r*   test_hdbscan_centersr   -  s     :&G1gSVWDA

'
+
+A
.C$'$N &qt<QT: %O
 6AGGAJ	c!f  >>"a'''<<a A%%%r,   c                     [         R                  R                  S5      n U R                  SS5      n[	        SSSSS9R                  U5      n[         R                  " USS	9u  p4[        U5      S:X  d   eXCS
:H     S:  d   e[	        SSSSSS9R                  U5      n[         R                  " USS	9u  p4[        U5      S:X  d   eXCS
:H     S:X  d   eg)zK
Tests that HDBSCAN single-cluster selection with epsilon works correctly.
r      rt   r>   rd   eomT)r   cluster_selection_epsiloncluster_selection_methodallow_single_cluster)return_countsr      g
ףp=
?r   )r   r   r   r   rq   N)r:   randomRandomStaterandr   rY   uniquer#   )rngno_structurer'   unique_labelscountss        r*   .test_hdbscan_allow_single_cluster_with_epsilonr   C  s     ))


"C88C#L"%!&!	
 k,  IIfDAM}""" 2%&+++ "&!&! k,  IIfDAM}"""2%&!+++r,   c                      SS/SS/SS/SS//n [        SU / SQSS9u  p[        5       R                  U5      R                  n[	        [        U5      5      [        S	U;   5      -
  nUS
:X  d   e[        X25      S:    g)z
Validate that HDBSCAN can properly cluster this difficult synthetic
dataset. Note that DBSCAN fails on this (see HDBSCAN plotting
example)
g333333g333333?r"   i  )皙?gffffff?皙?r   r   )r   r   r   r   r      Gz?N)r
   r   rD   rE   r#   r$   intr   )r   rB   r&   r'   r)   s        r*   test_hdbscan_better_than_dbscanr   d  s     u~t}q!fq"g>G+	DA Y]]1%%FS[!Cf$55J??&$t+r,   z	kwargs, XrT   r=   rt   r"   r   c                 <    [        SSS0UD6R                  U 5        g)zc
Tests that HDBSCAN works correctly for array-likes and precomputed inputs
with non-finite points.
min_samplesr=   Nr4   )r   rD   )rB   kwargss     r*   test_hdbscan_usable_inputsr   x  s     $$V$((+r,   c                     U " [         R                  " S5      5      nSn[        R                  " [        US9   [        SS9R                  U5        SSS5        g! , (       d  f       g= f)zX
Tests that HDBSCAN raises the correct error when there are too few
non-zero distances.
)r   r   z#There exists points with fewer thanrV   rT   rX   N)r:   zerosrZ   r[   r\   r   rD   r   rB   r_   s      r*   -test_hdbscan_sparse_distances_too_few_nonzeror     sI     	bhhx()A
/C	z	-}%))!, 
.	-	-s   A
A(c                 "   [         R                  " S5      nSUSS2SS24'   SUSS2SS24'   XR                  -   nU " U5      nSn[        R                  " [
        US9   [        SS	9R                  U5        SSS5        g! , (       d  f       g= f)
zi
Tests that HDBSCAN raises the correct error when the distance matrix
has multiple connected components.
)   r   r=   Nr>      z3HDBSCAN cannot be performed on a disconnected graphrV   rT   rX   )r:   r   TrZ   r[   r\   r   rD   r   s      r*   0test_hdbscan_sparse_distances_disconnected_graphr     s     	AAbqb"1"fIAab"#gJ	CCAaA
?C	z	-}%))!, 
.	-	-s   B  
Bc                     S n Sn[         R                  " [        US9   [        SU S9R	                  [
        5        SSS5        [         R                  " [        US9   [        SU S9R	                  [
        5        SSS5        [        [        [        R                  5      [        [        R                  5      -
  5      n[        U5      S:  aC  [         R                  " [        US9   [        SUS   S9R	                  [
        5        SSS5        gg! , (       d  f       N= f! , (       d  f       N= f! , (       d  f       g= f)	zJ
Tests that HDBSCAN correctly raises an error for invalid metric choices.
c                     U $ r3   r4   )r6   s    r*   r7   2test_hdbscan_tree_invalid_metric.<locals>.<lambda>  s    r,   zV.* is not a valid metric for a .*-based algorithm\. Please select a different metric\.rV   r   )rq   rU   Nr   r   )rZ   r[   r\   r   rD   rB   rH   r$   r   r   r   r#   )metric_callabler_   metrics_not_kds      r*    test_hdbscan_tree_invalid_metricr     s     "O	  
z	-)O<@@C 
.	z	-+o>BB1E 
.
 #h445F<P<P8QQRN
>Q]]:S1iq0ABFFqI 21  
.	-	-	- 21s#   DD$(!D5
D!$
D25
Ec                      [        [        [        5      S-   S9n Sn[        R                  " [
        US9   U R                  [        5        SSS5        g! , (       d  f       g= f)zl
Tests that HDBSCAN correctly raises an error when setting `min_samples`
larger than the number of samples.
r=   )r   z min_samples (.*) must be at mostrV   N)r   r#   rB   rZ   r[   r\   rD   )r   r_   s     r*   !test_hdbscan_too_many_min_samplesr     s@    
 c!fqj
)C
-C	z	-
 
.	-	-s   A
A"c                      [         R                  5       n [        R                  U S'   Sn[	        SS9n[
        R                  " [        US9   UR                  U 5        SSS5        g! , (       d  f       g= f)zi
Tests that HDBSCAN correctly raises an error when providing precomputed
distances with `np.nan` values.
r   z(np.nan values found in precomputed-denserT   rX   rV   N)	rB   rC   r:   rA   r   rZ   r[   r\   rD   )X_nanr_   r   s      r*   "test_hdbscan_precomputed_dense_nanr     sP    
 FFHE&&E$K
4C

'C	z	- 
.	-	-s   A''
A5r   TFepsilonr   c                 B   Sn[        UU SS/SS/SS//S9u  pE[        5       R                  U5      n[        UR                  UR
                  S9nUS-   US-   US-   1nUS-   SUS-   S	US-   S0n	[        UUU	UUS
9n
[        [        U5      5       Vs0 s H!  o[        R                  " X[:H  5      S   S   _M#     nn[        [        U5      5       Vs0 s H
  oXU      _M     nn[        R                  " UR                  5      " U5      n[        X5        gs  snf s  snf )zJ
Tests that the `_do_labelling` helper function correctly assigns labels.
0   r   r   )r   r   r   rt   r"   r   r=   condensed_treeclusterscluster_label_mapr   r   N)r
   r   rD   r   _single_linkage_tree_r   r   rH   r$   r:   where	vectorizer~   r   )global_random_seedr   r   r   rB   r&   estr   r   r   r'   _yfirst_with_labely_to_labelsaligned_targets                  r*   test_labelling_distinctr    s=    I' FGG
		DA )--
C#!!C4H4HN Ay1}i!m<H"Q9q=!Y]AN%+1")F ?C3q6lKlBHHQW-a033lK>B3q6lKlvr233lKK\\+//215Nv. LKs   (DDc                  @   Sn Sn[         R                  " SSUS4SSSUS4SS/[        S	9n[        UU 1U SU S-   S0S
SS9nUS   S:  n[	        U5      [	        US:H  5      :X  d   e[        UU 1U SU S-   S0S
SS9nUS   U:  n[	        U5      [	        US:H  5      :X  d   eg)z
Tests that the `_do_labelling` helper function correctly thresholds the
incoming lambda values given various `cluster_selection_epsilon` values.
r>   g      ?rt   r=   )r>   r=   r   r=   r   )r>   r"   r   r=   )r>   r   r   r=   )dtypeTr   valuer   N)r:   arrayr   r   sum)r   
MAX_LAMBDAr   r'   	num_noises        r*   test_labelling_thresholdingr    s    
 IJXX:q!:q!	
 	N %$aQ:!"#F w'!+Iy>S2....%$aQ:!"#F w'*4Iy>S2....r,   r   r   r   c                    [         R                  R                  S5      nUR                  S5      n[        U5      nSn[        R
                  " [        US9   [        SU S9R                  U5        SSS5        g! , (       d  f       g= f)zCheck that we raise an error if the centers are requested together with
a precomputed input matrix.

Non-regression test for:
https://github.com/scikit-learn/scikit-learn/issues/27893
r   )d   rt   z>Cannot store centers when using a precomputed distance matrix.rV   rT   )rU   r   N)	r:   r   r   r   rZ   r[   r\   r   rD   )r   r   rB   X_disterr_msgs        r*   0test_hdbscan_error_precomputed_and_store_centersr  %  sd     ))


"C

8A #FNG	z	1}MBFFvN 
2	1	1s   A::
B
valid_algor   r   c                 >    [        SU S9R                  [        5        g)zTest that HDBSCAN works with the "cosine" metric when the algorithm is set
to "brute" or "auto".

Non-regression test for issue #28631
cosiner   N)r   rY   rB   )r  s    r*   *test_hdbscan_cosine_metric_valid_algorithmr  5  s     8z2>>qAr,   invalid_algoc                     [        SU S9n[        R                  " [        SS9   UR	                  [
        5        SSS5        g! , (       d  f       g= f)zxTest that HDBSCAN raises an informative error is raised when an unsupported
algorithm is used with the "cosine" metric.
r  r   zcosine is not a valid metricrV   N)r   rZ   r[   r\   rY   rB   )r  hdbscans     r*   ,test_hdbscan_cosine_metric_invalid_algorithmr  ?  s:    
 X>G	z)G	HA 
I	H	Hs   A
A)r   )J__doc__numpyr:   rZ   scipyr   scipy.spatialr   sklearn.clusterr   sklearn.cluster._hdbscan._treer   r   r    sklearn.cluster._hdbscan.hdbscanr	   sklearn.datasetsr
   sklearn.metricsr   sklearn.metrics.pairwiser   r   sklearn.neighborsr   r   sklearn.preprocessingr   sklearn.utilsr   sklearn.utils._testingr   r   sklearn.utils.fixesr   r   rB   r&   fit_transform
ALGORITHMSitemsr%   r+   markparametrizerR   r`   rk   rn   r   r   r   r   r   r   r   r   r   r   r   r   r  r@   r   r   r   r   r   r   r  r  r  r  r  )r   outs   00r*   <module>r/     s  
    " # 
 ? ' 1 H . 0 ! F >Cb11q!!$1""1%
 d1B1H1H1JK1Jvqc'l1JKK8 ):;J <J>50 -/Q/Q./QR  S "	  ,>2$ 3 -$N
0 78 884 -	H  )[!9: ; .9I :ID j1& 2&*,B,( 
M	"BHHq"&&kBFFA;-G$HI
M	"aVaV$45	q!fq!f,, .9	- :	- .9- :- J0
 /$?QH-!/ . @!/H&/R :x*@AO BO '89B :B )[)AB Cu Ls   *M