
    -ie6                         S r SSKrSSKJrJr  SSKJrJr  SSK	J
r
  SSKJrJr  SSKJr  S	S
KJr  S	SKJr  S	SKJrJrJrJr  S r " S S\\5      rg)z
This module contains the BinMapper class.

BinMapper is used for mapping a real-valued dataset into integer-valued bins.
Bin thresholds are computed with the quantiles so that each bin contains
approximately the same number of samples.
    N   )BaseEstimatorTransformerMixin)check_arraycheck_random_state)_openmp_effective_n_threads)Paralleldelayed)check_is_fitted   )_map_to_bins)set_bitset_memoryview)
ALMOST_INFX_BINNED_DTYPEX_BITSET_INNER_DTYPEX_DTYPEc                    [         R                  " U 5      nUR                  5       (       a  X)    n [         R                  " U 5      n [         R                  " U 5      R                  [        5      n[        U5      U::  a  USS USS -   nUS-  nO^[         R                  " SSUS-   S9nUSS n[         R                  " XSS	9R                  [        5      nUR                  S   US-
  :X  d   e[         R                  " US[        US
9  U$ )a  Extract quantiles from a continuous feature.

Missing values are ignored for finding the thresholds.

Parameters
----------
col_data : array-like, shape (n_samples,)
    The continuous feature to bin.
max_bins: int
    The maximum number of bins to use for non-missing values. If for a
    given feature the number of unique values is less than ``max_bins``,
    then those unique values will be used to compute the bin thresholds,
    instead of the quantiles

Return
------
binning_thresholds : ndarray of shape(min(max_bins, n_unique_values) - 1,)
    The increasing numeric values that can be used to separate the bins.
    A given value x will be mapped into bin value i iff
    bining_thresholds[i - 1] < x <= binning_thresholds[i]
Nr   g      ?r   d   )nummidpoint)method)a_mina_maxout)npisnananysortuniqueastyper   lenlinspace
percentileshapeclipr   )col_datamax_binsmissing_maskdistinct_values	midpointspercentiless         c/var/www/html/venv/lib/python3.13/site-packages/sklearn/ensemble/_hist_gradient_boosting/binning.py_find_binning_thresholdsr.      s    . 88H%LM* wwx Hii)009O
?x'#CR(?12+>>	S	 kk!Shl;!!B'MM(
KRR
	 q!X\111 GGITC    c                   P    \ rS rSrSrS\" S5      SSSS4S jrSS jrS rS	 r	S
r
g)
_BinMapperK   a  Transformer that maps a dataset into integer-valued bins.

For continuous features, the bins are created in a feature-wise fashion,
using quantiles so that each bins contains approximately the same number
of samples. For large datasets, quantiles are computed on a subset of the
data to speed-up the binning, but the quantiles should remain stable.

For categorical features, the raw categorical values are expected to be
in [0, 254] (this is not validated here though) and each category
corresponds to a bin. All categorical values must be known at
initialization: transform() doesn't know how to bin unknown categorical
values. Note that transform() is only used on non-training data in the
case of early stopping.

Features with a small number of values may be binned into less than
``n_bins`` bins. The last bin (at index ``n_bins - 1``) is always reserved
for missing values.

Parameters
----------
n_bins : int, default=256
    The maximum number of bins to use (including the bin for missing
    values). Should be in [3, 256]. Non-missing values are binned on
    ``max_bins = n_bins - 1`` bins. The last bin is always reserved for
    missing values. If for a given feature the number of unique values is
    less than ``max_bins``, then those unique values will be used to
    compute the bin thresholds, instead of the quantiles. For categorical
    features indicated by ``is_categorical``, the docstring for
    ``is_categorical`` details on this procedure.
subsample : int or None, default=2e5
    If ``n_samples > subsample``, then ``sub_samples`` samples will be
    randomly chosen to compute the quantiles. If ``None``, the whole data
    is used.
is_categorical : ndarray of bool of shape (n_features,), default=None
    Indicates categorical features. By default, all features are
    considered continuous.
known_categories : list of {ndarray, None} of shape (n_features,),             default=none
    For each categorical feature, the array indicates the set of unique
    categorical values. These should be the possible values over all the
    data, not just the training data. For continuous features, the
    corresponding entry should be None.
random_state: int, RandomState instance or None, default=None
    Pseudo-random number generator to control the random sub-sampling.
    Pass an int for reproducible output across multiple
    function calls.
    See :term:`Glossary <random_state>`.
n_threads : int, default=None
    Number of OpenMP threads to use. `_openmp_effective_n_threads` is called
    to determine the effective number of threads use, which takes cgroups CPU
    quotes into account. See the docstring of `_openmp_effective_n_threads`
    for details.

Attributes
----------
bin_thresholds_ : list of ndarray
    For each feature, each array indicates how to map a feature into a
    binned feature. The semantic and size depends on the nature of the
    feature:
    - for real-valued features, the array corresponds to the real-valued
      bin thresholds (the upper bound of each bin). There are ``max_bins
      - 1`` thresholds, where ``max_bins = n_bins - 1`` is the number of
      bins used for non-missing values.
    - for categorical features, the array is a map from a binned category
      value to the raw category value. The size of the array is equal to
      ``min(max_bins, category_cardinality)`` where we ignore missing
      values in the cardinality.
n_bins_non_missing_ : ndarray, dtype=np.uint32
    For each feature, gives the number of bins actually used for
    non-missing values. For features with a lot of unique values, this is
    equal to ``n_bins - 1``.
is_categorical_ : ndarray of shape (n_features,), dtype=np.uint8
    Indicator for categorical features.
missing_values_bin_idx_ : np.uint8
    The index of the bin where missing values are mapped. This is a
    constant across all features. This corresponds to the last bin, and
    it is always equal to ``n_bins - 1``. Note that if ``n_bins_non_missing_``
    is less than ``n_bins - 1`` for a given feature, then there are
    empty (and unused) bins.
   g     jANc                 L    Xl         X l        X0l        X@l        XPl        X`l        g N)n_bins	subsampleis_categoricalknown_categoriesrandom_state	n_threads)selfr6   r7   r8   r9   r:   r;   s          r-   __init___BinMapper.__init__   s%     ", 0("r/   c                   ^ ^^ ST R                   s=::  a  S::  d&  O  [        SR                  T R                   5      5      e[        T[        /SS9mT R                   S-
  m[        T R                  5      nT R                  bU  TR                  S   T R                  :  a8  UR                  TR                  S   T R                  SS	9nTR                  USS
9mT R                  c6  [        R                  " TR                  S   [        R                  S9T l        O2[        R                   " T R                  [        R                  S9T l        TR                  S   nT R"                  nUc  S/U-  n[%        U5       HJ  nT R                  U   nXg   n	U(       a  U	c  [        SU S35      eU(       a  M8  U	c  M=  [        SU S35      e   T R                   S-
  T l        S/U-  T l        S/U-  n
[+        T R,                  SS9" UUU 4S j[%        U5       5       5      nSn[%        U5       Ht  nT R                  U   (       a&  Xg   nUR                  S   X'   UT R(                  U'   M=  X   T R(                  U'   T R(                  U   R                  S   S-   X'   US-  nMv     [        R.                  " U
[        R0                  S9T l        T $ )a%  Fit data X by computing the binning thresholds.

The last bin is reserved for missing values, whether missing values
are present in the data or not.

Parameters
----------
X : array-like of shape (n_samples, n_features)
    The data to bin.
y: None
    Ignored.

Returns
-------
self : object
r   r3   z=n_bins={} should be no smaller than 3 and no larger than 256.Fdtypeensure_all_finiter   Nr   )replace)axisrA   zKnown categories for feature z must be provided.zFeature zC isn't marked as a categorical feature, but categories were passed.	threading)n_jobsbackendc              3      >#    U  H8  nTR                   U   (       a  M  [        [        5      " TS S 2U4   T5      v   M:     g 7fr5   )is_categorical_r
   r.   ).0f_idxXr(   r<   s     r-   	<genexpr>!_BinMapper.fit.<locals>.<genexpr>   sB      R
*''. EG,-a5k8DD*s
   A$A)r6   
ValueErrorformatr   r   r   r:   r7   r%   choicetaker8   r   zerosuint8rJ   asarrayr9   rangemissing_values_bin_idx_bin_thresholds_r	   r;   arrayuint32n_bins_non_missing_)r<   rM   yrngsubset
n_featuresr9   rL   r8   
known_catsn_bins_non_missingnon_cat_thresholdsnon_cat_idx
thresholdsr(   s   ``            @r-   fit_BinMapper.fit   s   " T[['C'OVVKK  'eD;;? !2!23>>%!''!*t~~*EZZ
DNNEZJFvA&A&#%88AGGAJbhh#GD #%::d.A.A#RD WWQZ
00# $v
2 :&E!11%8N)0J*"4 3E7:LM  ">j&< ug &2 2  ' (,{{Q$ $v
2"Vj0%T^^[Q R
z*R
 
 :&E##E*
 .4
,6,<,<Q,?").8$$U+.@.M$$U+,0,@,@,G,M,Ma,PST,T")q  ' $&88,>bii#P r/   c                    [        U[        /SS9n[        U 5        UR                  S   U R                  R                  S   :w  a?  [        SR                  U R                  R                  S   UR                  S   5      5      e[        U R                  5      n[        R                  " U[        SS9n[        UU R                  U R                  U R                  UU5        U$ )a  Bin data X.

Missing values will be mapped to the last bin.

For categorical features, the mapping will be incorrect for unknown
categories. Since the BinMapper is given known_categories of the
entire training data (i.e. before the call to train_test_split() in
case of early-stopping), this never happens.

Parameters
----------
X : array-like of shape (n_samples, n_features)
    The data to bin.

Returns
-------
X_binned : array-like of shape (n_samples, n_features)
    The binned data (fortran-aligned).
Fr@   r   r   zKThis estimator was fitted with {} features but {} got passed to transform()F)rA   order)r   r   r   r%   r\   rP   rQ   r   r;   r   
zeros_liker   r   rY   rJ   rX   )r<   rM   r;   binneds       r-   	transform_BinMapper.transform  s    ( 'eD771:1177::!!'(@(@(F(Fq(I177ST:!V 
 0?	qcB    ((	
 r/   c                    [         R                  " U R                  5      nU R                  R                  nUR                  n[         R                  " U[         R
                  S9n[         R                  " U[         R
                  S9XA'   U R                  n[         R                  " US4[        S9n[        U5       H  u  pxXX    H  n	[        Xg   U	5        M     M      Xd4$ )aP  Create bitsets of known categories.

Returns
-------
- known_cat_bitsets : ndarray of shape (n_categorical_features, 8)
    Array of bitsets of known categories, for each categorical feature.
- f_idx_map : ndarray of shape (n_features,)
    Map from original feature index to the corresponding index in the
    known_cat_bitsets array.
rE      )r   flatnonzerorJ   sizerT   r[   arangerY   r   	enumerater   )
r<   categorical_features_indicesr`   n_categorical_features	f_idx_mapr9   known_cat_bitsetsmapped_f_idxrL   raw_cat_vals
             r-   make_known_categories_bitsets(_BinMapper.make_known_categories_bitsets+  s     (*~~d6J6J'K$))..
!=!B!BHHZryy9	24))""))3
	/  //HH#Q'/C
 $--I#JL/6%&7&E{S  7 $K !++r/   )
rY   r8   rJ   r9   rX   r6   r\   r;   r:   r7   r5   )__name__
__module____qualname____firstlineno____doc__intr=   rf   rm   r{   __static_attributes__ r/   r-   r1   r1   K   s8    Of c(# Tl&P",r/   r1   )r   numpyr   baser   r   utilsr   r   utils._openmp_helpersr   utils.parallelr	   r
   utils.validationr   _binningr   _bitsetr   commonr   r   r   r   r.   r1   r   r/   r-   <module>r      sA     3 4 @ / / " * M M0fB,!= B,r/   