
    bcgl                     P   d dl mZ d dlmZmZmZmZmZmZm	Z	 ddl
mZ ddlmZ ddlZddlZddlZej$                  Zej&                  Zej(                  Zej*                  Zej,                  Zej.                  Zej0                  Zej2                  Zej4                  Zej6                  Zej8                  Zed        Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 dd	ZddZ ddZ!d Z"d Z#ddZ$ddZ%ddZ&ddZ'ddZ(ddddd
d
ddddddddddddejR                  ejT                  fdZ+ddZ,d dZ-y)!   )	_catboost)PoolCatBoostErrorARRAY_TYPES
PATH_TYPESfspath_update_params_quantize_part_process_synonyms    )defaultdict)contextmanagerNc               #      K   	 dd l m}  |  y # t        $ r.}t        j                  d       t        t        |            d }~ww xY ww)Nr   z,To draw plots you should install matplotlib.)matplotlib.pyplotpyplotImportErrorwarningswarnstr)pltes     I/var/www/html/bid-api/venv/lib/python3.12/site-packages/catboost/utils.py_import_matplotlibr      sB     "' I  "DE#a&!!"s$   A	 A		A)AAA	c                 N   | j                  d       | j                  ||dd       | j                  d       | j                  d       | j	                  d       | j                  |d       | j                  |d       | j                  |d	       | j                          y )
N)      )figsizeg      ?   )alphalwr   )fontsizeT   )	figureplotxticksyticksgridxlabelylabeltitleshow)r   xyx_labely_labelr)   s         r   _drawr/   !   s    JJwJHHQH#JJJJJJHHTNJJwJ$JJwJ$IIebI!HHJ    c           	      $   dddddddd}t        d	       }t               j                         j                         D ]  \  }}|j	                  d
      r||dv rt        |t              r|g}|D ]  }t        |t              s#t        dj                  t        |                  ||v rt        dj                  |            |dk(  rddg||<   a|dk(  rddg||<   n|dk(  rddg||<   {ddg||<    |dvst        |t              s#t        dj                  t        |                  ||v rt        dj                  |            ||   dg||<    |D|j                         D ]1  \  }}||   d   dvrt        dj                  |            |||   d<   3 t        t        |      d      5 }t        |j                               D ]*  \  }\  }}|j                  dj                  |||             , 	 d d d        y # 1 sw Y   y xY w)NLabelWeightBaselineDocIdGroupId
SubgroupId	Timestamp)labelweightbaselinedoc_idgroup_idsubgroup_id	timestampc                  
    ddgS )NNum  rC   r0   r   <lambda>zcreate_cd.<locals>.<lambda>G   s    ubkr0   _)cat_featurestext_featuresembedding_featuresauxiliary_columnsz,Unsupported index type. Expected int, got {}z"The index {} occurs more than oncerF   CategrB   rG   TextrH   	NumVector	Auxiliary)feature_namesoutput_pathr   rA   rJ   rK   rL   zJfeature_names contains index {} that does not correspond to feature columnr   wz	{}	{}	{}
)r   localscopyitems
startswith
isinstanceintr   formattypeopenr   sortedwrite)r9   rF   rG   rH   r:   r;   r<   r=   r>   r?   rI   rN   rO   _from_param_to_cd_column_descriptionkeyvalueindexfeature_column_indexnamefr)   s                         r   	create_cdre   /   sT     #  &&9:hmmo++-
Us#u}bbeS)"GE"E%eS1+,Z,a,abfglbm,noo 33+,P,W,WX],^__n,6=r]+E2/6<b\+E2 446A25F+E26A25F+E2 # <<!%-'(V(](]^bch^i(jkk//'(L(S(STY(Z[[.?.Db-I#E*/ .0  *7*=*=*?& $"#78;Chh#$p$w$w  yM  %N  O  O;? 45a8 +@ 
f[!3	'1$*+>+D+D+F$G E=E4GGN))%=> %H 
(	'	's   5AHHFc                 $   dddd}|U|t        d      t        t        |            5 }t        |j	                         dd j                  d            }ddd       i i g }g }g }g g }	g }
t        t        |             5 }t        |      D ]  \  }}|j                         }t        |      d	k(  r%|j                  d      }t        |      d
vrt        d      t        |d	         }|d   }d}t        |      dk(  r|d   }|
j                  |||f        	 ddd       |
j                          fd}d}|
D ]  \  }}}||k(  rt        d       ||dz   |t        |	             |r|j                  ||      }j                  |g       j                  |       |dv r|t        |	      z
  }|d|z  }|dk(  r|j                  |       d|<   ni|dk(  r|j                  |       t        |<   nI|dk(  r|j                  |       t        |<   n)t        j                  |<   n|	j                  |       ||}j                  |       |}  ||dz   |t        |	             ||||	dS # 1 sw Y   !xY w# 1 sw Y   ixY w)a  
    Reads CatBoost column description file
    (see https://catboost.ai/docs/concepts/input-data_column-descfile.html#input-data_column-descfile)

    Parameters
    ----------
    cd_file : str or pathlib.Path
        path to column description file

    column_count : integer
        total number of columns

    data_file : str or pathlib.Path
        path to dataset file in CatBoost format
        specify either column_count directly or data_file to detect it

    canonize_column_types : bool
        if set to True types for columns with synonyms are renamed to canonical type.

    Returns
    -------
    dict with keys:
        "column_type_to_indices" :
            dict of column_type -> column_indices list, column_type is 'Label', 'Categ' etc.

        "column_dtypes" : dict of column_name -> numpy.dtype or 'category'

        "cat_feature_indices" : list of integers
            indices of categorical features in array of all features.
            Note: indices in array of features, not indices in array of all columns!

        "text_feature_indices" : list of integers
            indices of text features in array of all features.
            Note: indices in array of features, not indices in array of all columns!

        "embedding_feature_indices" : list of integers
            indices of embedding features in array of all features.
            Note: indices in array of features, not indices in array of all columns!

        "column_names" : list of strings

        "non_feature_column_indices" : list of integers
    r2   SampleIdr6   )Targetr5   QueryIdNzjCannot obtain column count: either specify column_count parameter or specify data_file parameter to get it	r   )r      z"Wrong number of columns in cd filer   rl   r   c                     t        | |      D ]O  }d||z
  z  }j                  |       j                  dg       j                  |       t        j                  |<   Q y )N
feature_%irA   )rangeappend
setdefaultnpfloat32)start_column_idxend_column_idxnon_feature_column_countmissed_column_idxcolumn_namecolumn_dtypescolumn_namescolumn_type_to_indicess        r   add_missed_columnsz#read_cd.<locals>.add_missed_columns   s`    !&'7!H&*;>V*VWK,"--eR8??@QR)+M+&	 "Ir0   z#Duplicate column indices in cd filerP   rn   rJ   categoryrK   rL   )r{   ry   cat_feature_indicestext_feature_indicesembedding_feature_indicesrz   non_feature_column_indices)	ExceptionrZ   r   lenreadlinesplit	enumeratestriprW   rp   sortgetrq   objectrr   rs   )cd_filecolumn_count	data_filecanonize_column_typescolumn_type_synonyms_maprd   r~   r   r   r   column_descriptionsline_idxlineline_columns
column_idxcolumn_typerx   r|   last_column_idxfeature_idxry   rz   r{   s                       @@@r   read_cdr   j   s   \   (  &#$qzz|CR066t<=L %  M "L!# 	fWo	!'lNHd::<D 4yA~::d+L< . DEE\!_-J&q/KK< A%*1o&&
K'MN' + 
, 4 O0C,
K(ABB?Q.
C@Z<[\ 266{KPK))+r:AA*M??$s+E'FFK"*[8g%#**;7-7k*&$++K8-3k*+)00=-3k*-/ZZk*&--j9")K($C 1DF *L#>X:YZ $:' 3!5&?%'A i %$ 
	s   ,I8BJ8JJrj   c	                    t        |       dkD  r+t        | d   t              rt        j                  |       n| g} t        |      dk(  rg g}t        |d   t              rt        j                  |      n|g}t        | ||||||||	      S )a  
    Evaluate metrics with raw approxes and labels.

    Parameters
    ----------
    label : list or numpy.ndarrays or pandas.DataFrame or pandas.Series
        Object labels with shape (n_objects,) or (n_object, n_target_dimension)

    approx : list or numpy.ndarrays or pandas.DataFrame or pandas.Series
        Object approxes with shape (n_objects,) or (n_object, n_approx_dimension).

    metric : string
        Metric name.

    weight : list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None)
        Object weights.

    group_id : list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None)
        Object group ids.

    group_weight : list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None)
        Group weights.

    subgroup_id : list or numpy.ndarray, optional (default=None)
        subgroup id for each instance.
        If not None, giving 1 dimensional array like data.

    pairs : list or numpy.ndarray or pandas.DataFrame or string or pathlib.Path
        The pairs description.
        If list or numpy.ndarrays or pandas.DataFrame, giving 2 dimensional.
        The shape should be Nx2, where N is the pairs' count. The first element of the pair is
        the index of winner object in the training set. The second element of the pair is
        the index of loser object in the training set.
        If string or pathlib.Path, giving the path to the file with pairs description.

    thread_count : int, optional (default=-1)
        Number of threads to work with.
        If -1, then the number of threads is set to the number of CPU cores.

    Returns
    -------
    metric results : list with metric values.
    r   )r   rV   r   rr   	transpose_eval_metric_util)	r9   approxmetricr:   r=   group_weightr>   pairsthread_counts	            r   eval_metricr     s    X 5zA~'1%(K'HU#ug
6{a%/q	;%GR\\&!fXFUFFFHlT_afhtuur0   c                  *    t        j                         S N)r   _get_gpu_device_countrC   r0   r   get_gpu_device_countr   6  s    **,,r0   c                 .    t        j                  |        y r   )r   _reset_trace_backend)filenames    r   reset_trace_backendr   :  s    ""8,r0   c                 f    t        |t              st        d      t        | j                  ||      S )a  
    Build confusion matrix.

    Parameters
    ----------
    model : catboost.CatBoost
        The trained model.

    data : catboost.Pool
        A set of samples to build confusion matrix with.

    thread_count : int (default=-1)
        Number of threads to work with.
        If -1, then the number of threads is set to the number of CPU cores.

    Returns
    -------
    confusion matrix : array, shape = [n_classes, n_classes]
    zdata must be a catboost.Pool)rV   r   r   _get_confusion_matrix_object)modeldatar   s      r   get_confusion_matrixr   >  s-    ( dD!:;; lCCr0   c           	      H   t        |t              r|g}t        |t              st        d      |D ]  }t        |t              rt        d       t	        | j
                  ||      }|r+t               5 }t        ||d   |d   ddd       ddd       |S |S # 1 sw Y   |S xY w)	a  
    Build points of ROC curve.

    Parameters
    ----------
    model : catboost.CatBoost
        The trained model.

    data : catboost.Pool or list of catboost.Pool
        A set of samples to build ROC curve with.

    thread_count : int (default=-1)
        Number of threads to work with.
        If -1, then the number of threads is set to the number of CPU cores.

    plot : bool, optional (default=False)
        If True, draw curve.

    Returns
    -------
    curve points : tuple of three arrays (fpr, tpr, thresholds)
    .data must be a catboost.Pool or list of pools.&one of data pools is not catboost.Poolr   r   False Positive RatezTrue Positive Ratez	ROC CurveN)rV   r   listr   _get_roc_curver   r   r/   )r   r   r   r#   pool	roc_curver   s          r   get_roc_curver   X  s    . $vdD!LMM$% HII  u}}dLAI!S#y|Yq\3HJ^`kl " 9 " s   4BB!c           	      l   |W|t        d      t        |t              st        |t              rt	        |      dk7  rt        d      |d   dd |d   dd }}n | |t        d      t        | ||      \  }}}|r't               5 }t        |||dd	d
       ddd       ||fS ||fS # 1 sw Y   ||fS xY w)a  
    Build points of FPR curve.

    Parameters
    ----------
    model : catboost.CatBoost
        The trained model.

    data : catboost.Pool or list of catboost.Pool
        A set of samples to build ROC curve with.

    curve : tuple of three arrays (fpr, tpr, thresholds)
        ROC curve points in format of get_roc_curve returned value.
        If set, data parameter must not be set.

    thread_count : int (default=-1)
        Number of threads to work with.
        If -1, then the number of threads is set to the number of CPU cores.

    plot : bool, optional (default=False)
        If True, draw curve.

    Returns
    -------
    curve points : tuple of two arrays (thresholds, fpr)
    N8Only one of the parameters data and curve should be set.rl   Ccurve must be list or tuple of three arrays (fpr, tpr, thresholds).r   r   Emodel and data parameters should be set when curve parameter is None.
Thresholdsr   z	FPR Curve)r   rV   r   tupler   r   r   r/   )	r   r   curver   r#   fpr
thresholdsrE   r   s	            r   get_fpr_curver     s    6  Z[[5$':eU+CE
VW eff(1+uQx{Z=DL ghh*5$EQ
!S#z36K[Y " s?:s? " s?s   B''B3c           	         |T|t        d      t        |t              st        |t              rt	        |      dk7  rt        d      |d   |d   dd }}n | |t        d      t        | ||      \  }}}t        j                  |D cg c]  }d|z
  	 c}      }	|r't               5 }
t        |
||	dd	d
       ddd       ||	fS ||	fS c c}w # 1 sw Y   ||	fS xY w)a  
    Build points of FNR curve.

    Parameters
    ----------
    model : catboost.CatBoost
        The trained model.

    data : catboost.Pool or list of catboost.Pool
        A set of samples to build ROC curve with.

    curve : tuple of three arrays (fpr, tpr, thresholds)
        ROC curve points in format of get_roc_curve returned value.
        If set, data parameter must not be set.

    thread_count : int (default=-1)
        Number of threads to work with.
        If -1, then the number of threads is set to the number of CPU cores.

    plot : bool, optional (default=False)
        If True, draw curve.

    Returns
    -------
    curve points : tuple of two arrays (thresholds, fnr)
    Nr   rl   r   r   r   r   r   zFalse Negative Ratez	FNR Curve)
r   rV   r   r   r   r   rr   arrayr   r/   )r   r   r   r   r#   tprr   rE   r+   fnrr   s              r   get_fnr_curver     s    6  Z[[5$':eU+CE
VW eff(E!HQKZ=DL ghh*5$E3

((3'3aAE3'
(C!S#z36K[Y " s?:s? ( " s?s   
C
)CCc                    ||t        d      | t        d      t        |t              r|g}t        |t              st        d      |D ]  }t        |t              rt        d       t	        | j
                  |d|||      S |It        |t              st        |t              rt        |      dk7  rt        d      t	        dd||||      S t        d      )	a
  
    Selects a threshold for prediction.

    Parameters
    ----------
    model : catboost.CatBoost
        The trained model.

    data : catboost.Pool or list of catboost.Pool
        Set of samples to build ROC curve with.
        If set, curve parameter must not be set.

    curve : tuple of three arrays (fpr, tpr, thresholds)
        ROC curve points in format of get_roc_curve returned value.
        If set, data parameter must not be set.

    FPR : desired false-positive rate

    FNR : desired false-negative rate (only one of FPR and FNR should be chosen)

    thread_count : int (default=-1)
        Number of threads to work with.
        If -1, then the number of threads is set to the number of CPU cores.

    Returns
    -------
    threshold : double
    Nr   r   r   r   rl   r   z3One of the parameters data and curve should be set.)r   rV   r   r   _select_thresholdr   r   r   )r   r   r   FPRFNRr   r   s          r   select_thresholdr     s    :  Z[[= ghhdD!6D$% PQQDdD)#$LMM  !dClSS		5$':eU+CE
VW eff tUClKKQRRr0   rk   c                 b   | st        d      t        | t              st        d      |t        |t              st        d      |t        |t              st        d      |t        |t              st        d      i }t        |       ||}d|v r|j	                  d      |d<   |j	                  dd      }|r(t        d	j                  |j                                     t        ||	|
||dd||||||       t        | ||||||||||
      }|j                  | |||||||||||       |S )aL  
    Construct quantized Pool from non-quantized pool stored in file.
    This method does not load whole non-quantized source dataset into memory
    so it can be used for huge datasets that fit in memory only after quantization.

    Parameters
    ----------
    data_path : string or pathlib.Path
        Path (with optional scheme) to non-quantized dataset.

    column_description : string, [default=None]
        ColumnsDescription parameter.
        There are several columns description types: Label, Categ, Num, Auxiliary, DocId, Weight, Baseline, GroupId, Timestamp.
        All columns are Num as default, it's not necessary to specify
        this type of columns. Default Label column index is 0 (zero).
        If None, Label column is 0 (zero) as default, all data columns are Num as default.
        If string or pathlib.Path, giving the path to the file with ColumnsDescription in column_description format.

    pairs : string or pathlib.Path, [default=None]
        Path to the file with pairs description.

    graph : string or pathlib.Path, [default=None]
        Path to the file with graph description.

    has_header : bool, [default=False]
        If True, read column names from first line.

    ignore_csv_quoting : bool optional (default=False)
        If True ignore quoting '"'.

    feature_names : string or pathlib.Path, [default=None]
        Path with scheme for feature names data to load.

    thread_count : int, [default=-1]
        Thread count for data processing.
        If -1, then the number of threads is set to the number of CPU cores.

    ignored_features : list, [default=None]
        Indices or names of features that should be excluded when training.

    per_float_feature_quantization : list of strings, [default=None]
        List of float binarization descriptions.
        Format : described in documentation on catboost.ai
        Example 1: ['0:1024'] means that feature 0 will have 1024 borders.
        Example 2: ['0:border_count=1024', '1:border_count=1024', ...] means that two first features have 1024 borders.
        Example 3: ['0:nan_mode=Forbidden,border_count=32,border_type=GreedyLogSum',
                    '1:nan_mode=Forbidden,border_count=32,border_type=GreedyLogSum'] - defines more quantization properties for first two features.

    border_count : int, [default = 254 for training on CPU or 128 for training on GPU]
        The number of partitions in numeric features binarization. Used in the preliminary calculation.
        range: [1,65535] on CPU, [1,255] on GPU

    max_bin : float, synonym for border_count.

    feature_border_type : string, [default='GreedyLogSum']
        The binarization mode in numeric features binarization. Used in the preliminary calculation.
        Possible values:
            - 'Median'
            - 'Uniform'
            - 'UniformAndQuantiles'
            - 'GreedyLogSum'
            - 'MaxLogSum'
            - 'MinEntropy'

    nan_mode : string, [default=None]
        Way to process missing values for numeric features.
        Possible values:
            - 'Forbidden' - raises an exception if there is a missing value for a numeric feature in a dataset.
            - 'Min' - each missing value will be processed as the minimum numerical value.
            - 'Max' - each missing value will be processed as the maximum numerical value.
        If None, then nan_mode=Min.

    input_borders : string or pathlib.Path, [default=None]
        input file with borders used in numeric features binarization.

    task_type : string, [default=None]
        The calcer type used to train the model.
        Possible values:
            - 'CPU'
            - 'GPU'

    used_ram_limit=None

    random_seed : int, [default=None]
        The random seed used for data sampling.
        If None, 0 is used.

    Returns
    -------
    pool : Pool
        Constructed and quantized pool.
    zData filename is empty.z4Data filename should be string or pathlib.Path type.NzZpairs should have None or string or pathlib.Path type when the pool is read from the file.zgcolumn_description should have None or string or pathlib.Path type when the pool is read from the file.zbfeature_names should have None or string or pathlib.Path type when the pool is read from the file.dev_block_size%dev_max_subset_size_for_build_bordersz'got an unexpected keyword arguments: {})
column_descriptionr   graphrN   	delimiter
has_headerignore_csv_quotingr   log_coutlog_cerr)r   r   )
r   rV   r   r
   poprX   keysr	   r   _read)	data_pathr   r   r   r   r   r   rN   r   ignored_featuresper_float_feature_quantizationborder_countmax_binfeature_border_typenan_modeinput_borders	task_typeused_ram_limitrandom_seedr   r   kwargsparamsr   r   s                            r   quantizer     s   h 566i,RSSE:!>xyy%j9KZ.X  F  G  	G M:)N  A  B  	BFf6!#)::.>#? ,2JJ7^`d,e)ELLV[[][\\ &-  -#-!D 	JJ   Kr0   c                    	 ddl }ddl}| j                         st        d      |j                         D ]B  \  }}|dk(  r|dvrt        j                  d       %|dk(  s+|.t        j                  d	       D d
}|r|j                  |t              }t        | j                  |      }	|j                  |	      }
|
S # t        $ r.}t        j                  d       t        t	        |            d}~ww xY w)a  
    Convert given CatBoost model to ONNX-ML model.
    Categorical Features are not supported.

    Parameters
    ----------
    model : CatBoost trained model
    export_parameters : dict [default=None]
        Parameters for ONNX-ML export:
            * onnx_graph_name : string
                The name property of onnx Graph
            * onnx_domain : string
                The domain component of onnx Model
            * onnx_model_version : int
                The model_version component of onnx Model
            * onnx_doc_string : string
                The doc_string component of onnx Model
    Returns
    -------
    onnx_object : ModelProto
        The model in ONNX format
    r   Nz2To get working onnx model you should install onnx.z^There is no trained model to use save_model(). Use fit() to train model. Then use this method.target_opset)Nr   zUtarget_opset argument is not supported. Default target_opset is 2 (ai.onnx.ml domain)initial_typesz'initial_types argument is not supportedrB   )cls)onnxr   r   r   r   json	is_fittedr   rT   dumps_NumpyAwareEncoder_get_onnx_modelr   load_model_from_string)r   export_parametersr   r   r   r   rc   r`   params_string	model_str
onnx_models              r   convert_to_onnx_objectr     s    ."
 ??ln 	n ||~e>!e9&<MMqr_$):MMCD	 & M

#4:L
M}=I,,Y7J+  "JK#a&!!"s   B> >	C5)C00C5c                 4    |dkD  sJ d       t        | ||      S )Nr   zBorder count should be > 0)_calculate_quantization_grid)valuesr   border_types      r   calculate_quantization_gridr     s%    !999'kJJr0   )NNNNNNNNNNNNztrain.cd)NNF)NNNNNrj   )rj   )rj   F)NNNrj   Fr   )Median).rB   r   corer   r   r   r   r   r	   r
   collectionsr   
contextlibr   sysnumpyrr   r   r   r   r   r   r   r   r   compute_wx_testTargetStatsDataMetaInfocompute_training_optionsr   r/   re   r   r   r   r   r   r   r   r   r   stdoutstderrr   r   r   rC   r0   r   <module>r	     sw    w w w # % 
  // ))!77 // 11 ++(EE ++##%%$==    8?vUp1vh--D4%P*Z+\/Sh 

#'ZZZZ+zz.bKr0   