
    -iJ                         S r SSKrSSKrSSKJr  SSKJr  SSKJr  SSK	r
SSKrSSKJr  SSKJr  SS	KJrJr  SS
KJr  SSKJr  S\S\S\4S jrS\S\S\
R2                  4S jrS r SS jr SS jr  SS jrg)z9Implementation of ARFF parsers: via LIAC-ARFF and pandas.    N)OrderedDict)	Generator)List   )_arff)ArffSparseDataType)chunk_generatorget_chunk_n_rows)check_pandas_support)	pd_fillna	arff_datainclude_columnsreturnc                 T   [        5       [        5       [        5       4n[        U5       VVs0 s H  u  p4XC_M	     nnn[        U S   U S   U S   5       HK  u  pgnX;   d  M  US   R                  U5        US   R                  U5        US   R                  XX   5        MM     U$ s  snnf )aY  Obtains several columns from sparse ARFF representation. Additionally,
the column indices are re-labelled, given the columns that are not
included. (e.g., when including [1, 2, 3], the columns will be relabelled
to [0, 1, 2]).

Parameters
----------
arff_data : tuple
    A tuple of three lists of equal size; first list indicating the value,
    second the x coordinate and the third the y coordinate.

include_columns : list
    A list of columns to include.

Returns
-------
arff_data_new : tuple
    Subset of arff data with only the include columns indicated by the
    include_columns argument.
r      r   )list	enumeratezipappend)	r   r   arff_data_new	array_idx
column_idxreindexed_columnsvalrow_idxcol_idxs	            P/var/www/html/venv/lib/python3.13/site-packages/sklearn/datasets/_arff_parser.py_split_sparse_columnsr      s    . *.(@M;D_;U;U"7)
;U   "%Yq\9Q<1!Ng%!##C(!##G,!##$5$>?	 "O
 s   B$c                 0   [        U S   5      S-   nU[        U5      4n[        U5       VVs0 s H  u  pEXT_M	     nnn[        R                  " U[        R
                  S9n[        U S   U S   U S   5       H  u  pn
X;   d  M  XXU
   4'   M     U$ s  snnf )Nr   dtyper   r   )maxlenr   npemptyfloat64r   )r   r   num_obsy_shaper   r   r   yr   r   r   s              r   _sparse_data_to_arrayr*   9   s    
 )A,!#GO,-G;D_;U;U"7)
;U   	

+A!$Yq\9Q<1!Ng%58g112 "O Hs   Bc                 r    X   n[        U5      S:  a  X   nX44$ [        U5      S:X  a
  XS      nX44$ SnX44$ )a  Post process a dataframe to select the desired columns in `X` and `y`.

Parameters
----------
frame : dataframe
    The dataframe to split into `X` and `y`.

feature_names : list of str
    The list of feature names to populate `X`.

target_names : list of str
    The list of target names to populate `y`.

Returns
-------
X : dataframe
    The dataframe containing the features.

y : {series, dataframe} or None
    The series or dataframe containing the target.
r   r   r   N)r#   )framefeature_namestarget_namesXr)   s        r   _post_process_framer0   K   s\    , 	A
<A
 4K	 
\	a	q/" 4K 4K    c                 	   S nU" U 5      nUS:X  a  [         R                  O[         R                  nUS:X  + n	[         R                  " XxU	S9n
X4-   nU
S    VVs0 s H%  u  p[	        U[
        5      (       d  M  X;   d  M#  X_M'     nnnUS:X  Ga  [        S5      n[        U
S   5      n[        UR                  5       5      n[        U
S   5      nUR                  U/USS	9nUR                  S
S9R                  5       n[        U5      nU Vs/ s H  nUU;   d  M  UPM     nnUU   /n[        U
S   U5       H&  nUR                  UR                  UUSS	9U   5        M(     [!        U5      S:  a$  US   R#                  US   R$                  5      US'   UR'                  US
S9n[)        UU5      nAA0 nUR*                   HR  nX,   S   nUR-                  5       S:X  a  SUU'   M%  UR-                  5       S:X  a  SUU'   M@  UR$                  U   UU'   MT     UR#                  U5      n[/        UX45      u  nnGOU
S   nU V s/ s H  n [1        UU    S   5      PM     n!n U V s/ s H  n [1        UU    S   5      PM     n"n [	        U[2        5      (       az  Uc  [5        S5      eUS   S:X  a  Sn#OUS   US   -  n#[6        R8                  " [:        R<                  R?                  U5      SU#S9nUR@                  " U6 nUSS2U!4   nUSS2U"4   nO[	        U[B        5      (       a  [E        UU!5      n$[G        US   5      S-   n%U%[!        U!5      4n&[H        RJ                  RM                  U$S   U$S   U$S   44U&[6        RN                  S9nURQ                  5       n[S        UU"5      nO[5        S[U        U5       35      eU V s1 s H  n U U;   iM
     n'n U'(       d  O[W        U'5      (       a  [6        RX                  " [[        U5       V(V s/ s H]  u  n(n [6        R\                  " [6        R^                  " URa                  U 5      SS9USS2U(U(S-   24   R#                  [0        SS95      PM_     sn n(5      nO[c        U'5      (       a  [5        S 5      eURd                  S   S:X  a  URA                  S!5      nOURd                  S   S:X  a  SnUS:X  a  UUWS4$ UUSU4$ s  snnf s  snf s  sn f s  sn f s  sn f s  sn n(f )"a  ARFF parser using the LIAC-ARFF library coded purely in Python.

This parser is quite slow but consumes a generator. Currently it is needed
to parse sparse datasets. For dense datasets, it is recommended to instead
use the pandas-based parser, although it does not always handles the
dtypes exactly the same.

Parameters
----------
gzip_file : GzipFile instance
    The file compressed to be read.

output_arrays_type : {"numpy", "sparse", "pandas"}
    The type of the arrays that will be returned. The possibilities ara:

    - `"numpy"`: both `X` and `y` will be NumPy arrays;
    - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array;
    - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
      pandas Series or DataFrame.

columns_info : dict
    The information provided by OpenML regarding the columns of the ARFF
    file.

feature_names_to_select : list of str
    A list of the feature names to be selected.

target_names_to_select : list of str
    A list of the target names to be selected.

Returns
-------
X : {ndarray, sparse matrix, dataframe}
    The data matrix.

y : {ndarray, dataframe, series}
    The target.

frame : dataframe or None
    A dataframe containing both `X` and `y`. `None` if
    `output_array_type != "pandas"`.

categories : list of str or None
    The names of the features that are categorical. `None` if
    `output_array_type == "pandas"`.
c              3   D   #    U  H  nUR                  S5      v   M     g 7f)Nutf-8)decode)	gzip_filelines     r   _io_to_generator+_liac_arff_parser.<locals>._io_to_generator   s     D++g&& s    sparsepandas)return_typeencode_nominal
attributeszfetch_openml with as_frame=TruedataF)columnscopyT)deepr   r   r   )ignore_index	data_typeintegerInt64nominalcategoryindexNz6shape must be provided when arr['data'] is a Generatorr&   )r!   count)shaper!   z-Unexpected type for data obtained from arff: Or    )rA   zAMix of nominal and non-nominal targets is not currently supported)rJ   )3r   COO	DENSE_GENload
isinstancer   r   r   keysnext	DataFramememory_usagesumr
   r	   r   r#   astypedtypesconcatr   r@   lowerr0   intr   
ValueErrorr$   fromiter	itertoolschainfrom_iterablereshapetupler   r"   spr:   
coo_matrixr&   tocsrr*   typeallhstackr   takeasarraypopanyrL   ))r6   output_arrays_typeopenml_columns_infofeature_names_to_selecttarget_names_to_selectrL   r8   streamr<   r=   arff_containercolumns_to_selectnamecat
categoriespdcolumns_infocolumns_names	first_rowfirst_df	row_bytes	chunksizecolcolumns_to_keepdfsr?   r,   rX   column_dtyper/   r)   r   col_namefeature_indices_to_selecttarget_indices_to_selectrK   arff_data_Xr'   X_shapeis_classificationis)                                            r   _liac_arff_parserr   k   sn   n' i(F  2X=%))5??K -9NZZN 0H (55IDc4  	%)%> 		5  
 X%!"CD">,#?@\..01 /0	<<]<O))t)488:	$Y/	 +8T-33BS;S3-T()#N6$:IFDJJT=uEoV G s8q=V]]3q6==1CF
 		#D	1"e$ MMD.4[AL!!#y0  't##%2)t$||D1t " V$"*
1 #6*	 4%
3 #H-g673 	" %
 3$
2 #H-g672 	! $

 i++} L  Qx2~a58+;;--i8D
 <<'DQ112AQ001A	5))/	;TUK)A,'!+G$= >?G		$$Q+a.+a.!ABjj % A
 	A%i1IJA ?Y?PQ 
 4J
3IxH
"3I 	 
 !"##		 (11G'H
 (I8	 GG

:>>(#;3G!QQY,..s.? (IA "##S  771:?		% AWWQZ1_AX%!UD  az!!E& UL%
$
N
s7   S8S?S
S!S!S&#S+ S0A$S5
c           
      R  ^ SSK nU  H8  nUR                  S5      R                  5       R                  S5      (       d  M8    O   0 nU H>  n	X)   S   n
U
R                  5       S:X  a  SX'   M$  U
R                  5       S:X  d  M:  S	X'   M@     [	        U5       VV	s0 s H  u  pX;   d  M  XU	   _M     nnn	SS
S/S
SSSSUS.	n0 UEU=(       d    0 EnUR
                  " U 40 UD6n U V	s/ s H  oPM     sn	Ul        X4-   nUR                   Vs/ s H  nUU;   d  M  UPM     nnUU   n[        R                  " S5      mU4S jnUR                  R                  5        V	Vs/ s H%  u  n	n[        UUR                  5      (       d  M#  U	PM'     nn	nU H$  nUU   R                   R#                  U5      UU'   M&     [%        XU5      u  nnUS:X  a  UUUS4$ UR'                  5       UR'                  5       nnUR                  R                  5        V	Vs0 s H>  u  n	n[        UUR                  5      (       d  M#  U	UR(                  R+                  5       _M@     nn	nUUSU4$ s  sn	nf s  sn	f ! [         a!  nUR                  R                  S5      UeSnAff = fs  snf s  snn	f s  snn	f )a  ARFF parser using `pandas.read_csv`.

This parser uses the metadata fetched directly from OpenML and skips the metadata
headers of ARFF file itself. The data is loaded as a CSV file.

Parameters
----------
gzip_file : GzipFile instance
    The GZip compressed file with the ARFF formatted payload.

output_arrays_type : {"numpy", "sparse", "pandas"}
    The type of the arrays that will be returned. The possibilities are:

    - `"numpy"`: both `X` and `y` will be NumPy arrays;
    - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array;
    - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
      pandas Series or DataFrame.

openml_columns_info : dict
    The information provided by OpenML regarding the columns of the ARFF
    file.

feature_names_to_select : list of str
    A list of the feature names to be selected to build `X`.

target_names_to_select : list of str
    A list of the target names to be selected to build `y`.

read_csv_kwargs : dict, default=None
    Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite
    the default options.

Returns
-------
X : {ndarray, sparse matrix, dataframe}
    The data matrix.

y : {ndarray, dataframe, series}
    The target.

frame : dataframe or None
    A dataframe containing both `X` and `y`. `None` if
    `output_array_type != "pandas"`.

categories : list of str or None
    The names of the features that are categorical. `None` if
    `output_array_type == "pandas"`.
r   Nr4   z@datarD   rE   rF   rG   rH   F?%"T\)	header	index_col	na_valueskeep_default_nacomment	quotecharskipinitialspace
escapecharr!   zwThe number of columns provided by OpenML does not match the number of columns inferred by pandas when reading the file.z^'(?P<contents>.*)'$c                 ^   > [         R                  " TU 5      nUc  U $ UR                  S5      $ )Ncontents)researchgroup)input_stringmatchsingle_quote_patterns     r   strip_single_quotes0_pandas_arff_parser.<locals>.strip_single_quotes  s.    		.=={{:&&r1   r;   )r;   r5   rZ   
startswithr   read_csvr@   r\   errorsParserErrorr   compilerX   itemsrQ   CategoricalDtyperu   rename_categoriesr0   to_numpyrv   tolist)r6   rm   rn   ro   rp   read_csv_kwargsrw   r7   rX   rt   r   r   dtypes_positionaldefault_read_csv_kwargsr,   excrs   r~   r   r   r!   categorical_columnsr/   r)   rv   r   s                            @r   _pandas_arff_parserr   7  s   p  ;;w%%'227;;  F#*0=9, #FL!Y.%FL $ '':;;MG> 	;   U  "
 M0L_5JLOKK	5_5E

 +>>*=$*=> 0H&+mmPmss>O7OsmOP/"E ::&=>' !<<--//KD%eR001 	/  
 #3Z^^556IJc
 # u?UVDAqX%!UD  zz|QZZ\1 !<<--//KD%eR001 	(e%%''/  
 az!!W0 ? ii##@
 	 Q.sZ   I'	I#I* 'I%2I* 
JJ "JJ"J#7J#%I* *
J4JJc                 x    US:X  a  [        U UUUUU5      $ US:X  a  [        U UUUUU5      $ [        SU S35      e)a  Load a compressed ARFF file using a given parser.

Parameters
----------
gzip_file : GzipFile instance
    The file compressed to be read.

parser : {"pandas", "liac-arff"}
    The parser used to parse the ARFF file. "pandas" is recommended
    but only supports loading dense datasets.

output_type : {"numpy", "sparse", "pandas"}
    The type of the arrays that will be returned. The possibilities ara:

    - `"numpy"`: both `X` and `y` will be NumPy arrays;
    - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array;
    - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
      pandas Series or DataFrame.

openml_columns_info : dict
    The information provided by OpenML regarding the columns of the ARFF
    file.

feature_names_to_select : list of str
    A list of the feature names to be selected.

target_names_to_select : list of str
    A list of the target names to be selected.

read_csv_kwargs : dict, default=None
    Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite
    the default options.

Returns
-------
X : {ndarray, sparse matrix, dataframe}
    The data matrix.

y : {ndarray, dataframe, series}
    The target.

frame : dataframe or None
    A dataframe containing both `X` and `y`. `None` if
    `output_array_type != "pandas"`.

categories : list of str or None
    The names of the features that are categorical. `None` if
    `output_array_type == "pandas"`.
z	liac-arffr;   zUnknown parser: 'z%'. Should be 'liac-arff' or 'pandas'.)r   r   r\   )r6   parseroutput_typern   ro   rp   rL   r   s           r   load_arff_from_gzip_filer     sq    v  #"
 	
 
8	"#"
 	
 x'LM
 	
r1   )N)NN)__doc__r^   r   collectionsr   collections.abcr   typingr   numpyr$   scipyrc   	externalsr   externals._arffr   utils._chunkingr	   r
   utils._optional_dependenciesr   utils.fixesr   r   ndarrayr*   r0   r   r   r    r1   r   <module>r      s    ?
  	 # %     0 ? ? # ! 48  F!48ZZ$L I"d U"~ P
r1   