
    bcgh                       d dl mZ d dlmZ d dlZd dlZd dlZej                  dk\  rd dlm	Z	m
Z
mZmZ nd dlm	Z	m
Z
mZmZ d dlmZmZ d dlmZmZmZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d d	lmZ d dl Z  ejB                         d
k(  r	  ejD                  d       	 d dl$m%Z%m&Z& d dl)Z*ddl+m,Z,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2 e3Z4e0jj                  Z5e0jl                  Z6e0jn                  Z7e0jp                  Z8e0jr                  Z9e0jt                  Z:e0jv                  Z;e0jx                  Z<e0jz                  Z=e0j|                  Z>e0j~                  Z?e0j                  Z@e0j                  ZAe0j                  ZBe0j                  ZCe0j                  ZDe0j                  ZEe0j                  ZFe0j                  ZGe0j                  ZHe0j                  ZIe0j                  ZJe0j                  ZKe0j                  ZLe0j                  ZMe0j                  ZNe0j                  ZOe0j                  ZPe0j                  ZQe0j                  ZRe0j                  ZSe0j                  ZT ej                  eV      ZW e;         e0j                          eej                  fZZe[ej                  fZ]efZ^e_ej                  e%e&fZaej                  dk\  re^ej                  fz   Zcnej                  dk\  rd dldmeZe e^eefz   Zcne^Zcd Zf G d d      Zg G d de(      Zh eh       aied\d       Zjd Zkd Zld]d Zmd^d!Znd" Zo G d# d$e      Zp G d% d&e      Zq G d' d(e      Zr G d) d*e      Zsd+ Ztd, Zu	 d_d-Zvd. Zw G d/ d0e5      Zxd1 Zyd2 Zzd3 Z{d4 Z|ed5        Z}d6 Z~d7 Zd8 Zd9 Zd: Zd; Z G d< d=e(      Zd> Zd? Zd@ ZdA ZdB Z G dC dDe      Z G dE dFe      Z G dG dHe      Z G dI dJe      Z	 	 	 	 d`dKZdL Z	 	 	 	 dadMZ	 	 	 	 	 	 dbdNZ G dO dPe7      ZdcdQZdR ZdS ZdT ZdU ZdV ZdW ZdX ZdY Z G dZ d[e(      Zy# e#$ r Y ~w xY w# e'$ r  G d de(      Z% G d de(      Z&Y w xY w)d    )contextmanager)deepcopyN)   r   )IterableSequenceMappingMutableMapping)OrderedDictdefaultdict)	iteritemsstring_typesinteger_types)Enum)
itemgetterLinuxzlibrt.so)	DataFrameSeriesc                       e Zd Zy)r   N__name__
__module____qualname__     H/var/www/html/bid-api/venv/lib/python3.12/site-packages/catboost/core.pyr   r   $       r   r   c                       e Zd Zy)r   Nr   r   r   r   r   r   '   r   r   r      )save_plot_filetry_plot_offlineOfflineMetricVisualizer)	_catboost)BuiltinMetric)r      )r      )Pathc                 n    t         j                  j                  |       st        j                  |        y y N)ospathexistsmkdir)r*   s    r   create_dir_if_not_existr-   i   s"    77>>$
  r   c                       e Zd Zd Zd Zy)_StreamLikeWrapperc                     || _         y r(   callable_object)selfr2   s     r   __init__z_StreamLikeWrapper.__init__o   s
    .r   c                 &    | j                  |       y r(   r1   )r3   messages     r   writez_StreamLikeWrapper.writer   s    W%r   N)r   r   r   r4   r7   r   r   r   r/   r/   n   s    /&r   r/   c                   0    e Zd Zd Zed        ZddZd Zy)_CustomLoggersStackc                 R    t        j                         | _        d | _        g | _        y r(   )	threadingLock_lock_owning_thread_id_stackr3   s    r   r4   z_CustomLoggersStack.__init__w   s    ^^%
!%r   c                 b    t        | d      r| S t        | d      rt        |       S t        d      )Nr7   __call__z.Expected callable object or stream-like object)hasattrr/   CatBoostError)objs    r   _get_stream_like_objectz+_CustomLoggersStack._get_stream_like_object|   s6    3 J3
#%c**<
 	
r   Nc                       j                   5   j                  s$t        j                         j                   _        nD j
                  t        j                         j                  k7  r||t        d      	 d d d        y  fd} ||t        j                  d      } ||t        j                  d      }t                t        ||        j                  j                  ||f       d d d        y # 1 sw Y   y xY w)NzCatBoost custom loggers have been already set in another thread.  Setting custom loggers from different threads is not currently supportedc                 r    |  j                   s|S j                   d   |   S t        j                  |       S N)r?   r9   rF   )logdefaultindex_in_stackr3   s      r   init_logz*_CustomLoggersStack.push.<locals>.init_log   s8    ;;;&#{{2~>>*BB3GGr   r   r   )r=   r?   r;   current_threadidentr>   rD   sysstdoutstderr_reset_logger_set_loggerappend)r3   log_coutlog_cerrrN   coutcerrs   `     r   pushz_CustomLoggersStack.push   s    ZZ;;)2)A)A)C)I)I&''9+C+C+E+K+KK(h.B'f   ZH Hcjj!4DHcjj!4DOd#KKd|,1 ZZs   A,C4A(C44C=c                    | j                   5  | j                  t        j                         j                  k7  r
	 d d d        y | j
                  st        d      t                t        | j
                        dk7  rt        | j
                  d     nd | _        | j
                  j                          d d d        y # 1 sw Y   y xY w)Nz"Attempt to pop from an empty stackr   )r=   r>   r;   rO   rP   r?   RuntimeErrorrT   lenrU   popr@   s    r   r`   z_CustomLoggersStack.pop   s    ZZ%%)A)A)C)I)II Z ;;"#GHHO4;;1$T[[_-)-&KKOO ZZs   -B<A0B<<CNN)r   r   r   r4   staticmethodrF   r[   r`   r   r   r   r9   r9   v   s%    
 
 
-6r   r9   c              #      K   t         j                  | |       	 d  t         j                          y # t         j                          w xY wwr(   )_custom_loggers_stackr[   r`   )rW   rX   s     r   	log_fixupre      s9      x2$!!#!!#s   A3 AA		Ac           	      X   t        | t              r#t        |       D ]  \  }}t        |      | |<    | S t        | t        dd        r%g }| D ]  }|j                  t        |              |S t        | t        t        f      r#t        |       D ]  }t        | |         | |<    | S t        | t              r| S t        | t              rt        |       S t        | t              rt        |       S t        | t        t        t              t        t               z
              rt#        |       S | S )Nr   )
isinstancelist	enumerate_cast_to_base_typesARRAY_TYPESrV   r   r	   boolINTEGER_TYPESintFLOAT_TYPESfloattupleset
PATH_TYPESSTRING_TYPESfspath)valueindexelement	new_valuekeys        r   rj   rj      s   %'.NE7.w7E%L /%QR)	G09: %'>23;C,U3Z8E#J %%'5z%%U|%s:\1BBCDe}Lr   c                     t        |       S r(   )!_metric_description_or_str_to_str)descriptions    r    metric_description_or_str_to_strr~      s    ,[99r   c           	          t        | ||rt        d       fndz         s%t        dj                  ||t        |                   y )Nr   z-Parameter {} should have a type of {}, got {})rg   typerD   format)rv   nametypesor_nones       r   _check_param_typer      sE    eUwtDzmBGHKRRSWY^`dej`klmm Ir   c                    t        | dt        f       t        |dt        t        f       t        |dt        f       t        |dt        t        f       t        |dt        f       t	               g d}t        fd|D              }|dkD  rt        d	j                  |            |
|| }n||}|t        |      }| ||fS )
Nmetric_periodverboselogging_levelverbose_evalsilent)r   r   r   r   c              3   D   K   | ]  }j                  |      d u  y wr(   get).0	exclusiveparamss     r   	<genexpr>z#_process_verbose.<locals>.<genexpr>   s"     ZIYIfjj+47IYs    r   z'Only one of parameters {} should be set)r   rn   rl   r   localssumrD   r   )r   r   r   r   r   exclusive_paramsat_most_oner   s          @r   _process_verboser      s    m_sf=gy4+6m_|oFlNT3K@fh0XFMZIYZZKQELLM]^__ jG%"Gg,7M22r   c                     t        ||       r|S t        |t              r| |   S t        dt        |       z   dz   t        t        |            z         )Nzcan't create enum z from type )rg   str	Exceptionr   )	enum_typeargs     r   enum_from_enum_or_strr      sQ    #y!
	C	~,s9~=MPSTXY\T]P^^__r   c                   >    e Zd ZdZdZ	 dZ	 dZ	 dZ	 dZ	 dZ		 dZ
	 d	Zy
)	EFstrTypez3Calculate score for every feature by values change.r   r      r   r%      r$      N)r   r   r   __doc__PredictionValuesChangeLossFunctionChangeFeatureImportanceInteraction
ShapValuesPredictionDiffShapInteractionValues
SageValuesr   r   r   r   r     sF    =:X9K1JhN\1Jr   r   c                        e Zd ZdZdZ	 dZ	 dZy)EShapCalcTypezCalculate regular SHAP valuesRegularApproximateExactN)r   r   r   r   r   r   r   r   r   r   r   r     s    'G+K%Er   r   c                        e Zd ZdZdZ	 dZ	 dZy)EFeaturesSelectionAlgorithmzUUse prediction values change as feature strength, eliminate batch of features at once!RecursiveByPredictionValuesChangeRecursiveByLossFunctionChangeRecursiveByShapValuesN)r   r   r   r   r   r   r   r   r   r   r   r     s    _(K%`$C!Y3r   r   c                       e Zd ZdZdZ	 dZy)EFeaturesSelectionGroupingzSelect individual features
IndividualByTagsN)r   r   r   r   r   r   r   r   r   r   r   '  s    $J0Fr   r   c                 |   t        | t        t        j                  f      rt        | t        t
        t        f      rt        dt        |       z         |1| D cg c]%  }t        |t              r|j                  |      n|' c}S | D ],  }t        |t              st        dj                  |             | S c c}w )a  
        Parameters
        ----------
        features :
            must be a sequence of either integers or strings
            if it contains strings 'feature_names' parameter must be defined and string ids from 'features'
            must represent a subset of in 'feature_names'

        feature_names :
            A sequence of string ids for features or None.
            Used to get feature indices for string ids in 'features' parameter
    z,feature names should be a sequence, but got z_features parameter contains string value '{}' but feature names for a dataset are not specified)rg   r   npndarrayr   bytes	bytearrayrD   reprrt   rw   r   )featuresfeature_namesfs      r   _get_features_indicesr   .  s     x(BJJ!78ZSVX]_hRi=jJTRZ^[\\  
 '1L&AM"qH
 	

 A!\*# %FFLfQiQ Q  O
s   *B9c                    || j                  d|i       || j                  d|i       || j                  d|i       || j                  d|i       || j                  d|i       || j                  d|i       || j                  d|i       || j                  d|i       |	| j                  d	|	i       |
| j                  d
|
i       || j                  d|i       || j                  d|i       | S )Nignored_featuresper_float_feature_quantizationborder_countfeature_border_type!sparse_features_conflict_fractiondev_efb_max_bucketsnan_modeinput_borders	task_typeused_ram_limitrandom_seed%dev_max_subset_size_for_build_borders)update)r   r   r   r   r   r   r   r   r   r   r   r   r   s                r   _update_params_quantize_partr   J  sw    # 0
 	 &1,.L
 	 L
 	 &!#6
 	 )4/1R
 	 &!#6
 	 
 	  ]
 	 
 	 !n
 	 ;
 	 -835Z
 	 Mr   c                    d}	 dd l m} t        |      t        |      r|nt        t        t
        |            }
|d   }|d|z   dz      }|d   |j                         }|j                  j                  |       |d   d	<   d
}|j                  |j                  |||j                  j                  |      ddg|
z   d             t!              dkD  r~|j                  |j                  D cg c]  }||   	 c}D cg c]  }||   	 c}d|j                  j#                  dd      D cg c]  }|dkD  r|
|dz
     nd c}d             rO|j                  |j                  ||ddgt        t        t
        |            z   dt%        dd|      dd             r8|j                  |j                  ||ddg|z   t%        dd|      ddd             d}|j                  |j                  |d   |j                  j                  |      ddg|z   dd             |j                  |j                  |d   ddg|z   t%        dd|      dddd 	             t%        d!d"dd"d#d$d%      }|j'                  t%        d@d	d&|z   i|t%        d@d't%        |      t%        |      d(|)       6|j'                  t%        d@d*d+d,d-t%        |      t%        |      d.|/       g }fd0}|j)                  t%        d1|z   d2d3 |dd4      ig5             r(|j)                  t%        d6d2d3 |d"d4      ig5             r(|j)                  t%        d7d2d3 |dd"4      ig5             |j'                  t%        d|ddd8d"d9d:d;d<=      g>       |j'                  d?       |S # t        $ r.}	t        j                  |       t        t        |	            d }	~	ww xY wc c}w c c}w c c}w )AN(To draw plots you should install plotly.r   loss_valuesremoved__countmain_indices)textlayouttitlezrgb(51,160,44))colorlines+markers )xylinemoder   r   markers
   square)sizesymbolr   )r   r   r   markerr   r   r   zbottom centerz
sans serif   )familyr   r   F)r   r   r   r   textpositiontextfontr   visible)r   r   r   r   r   r   r   r   zrgb(160,44,44)y2)r   r   r   r   r   r   yaxis)	r   r   r   r   r   r   r   r   r   zrgb(255,255,255)Tzrgb(127,127,127)outside)	gridcolorshowgridshowlineshowticklabels	tickcolortickszerolineznumber of removed z
loss value)r   	titlefonttickfont)xaxisr   z
cost valuerightr   r   )r   sideanchor
overlayingr   r   )yaxis2c                     dg}t              dkD  r|j                  d       r|j                  |        r|j                  |       "|j                  d       |j                  |       |S )NTr   )r_   rV   )show_indices
show_namesvisible_arg
cost_graphindices_presentr   names_presents      r   get_visible_argz;plot_features_selection_loss_graph.<locals>.get_visible_arg	  so    f|q t$|,z*!t$z*r   zHide r   r   )r  r  labelmethodargszShow indicesz
Show namesrtg      пleftg{Gz?top)activebuttonspad
showactiver   xanchorr   yanchor)updatemenus)
showlegendr   )plotly.graph_objs
graph_objsImportErrorwarningswarnr   anyrh   mapFigurer   Title	add_traceScatterscatterLiner_   Markerdictupdate_layoutrV   )r   entities_nameentities_name_in_fieldseliminated_entities_indiceseliminated_entities_names
loss_graphr  warn_msggoenames_or_indicesr   removed_entities_cntfigloss_graph_coloridxcost_graph_coloraxis_optionsr  r  r  r   r  s         `             @@@r   "plot_features_selection_loss_graphr8    s    :H"&
 56O12M4A0tCPSUpLqGr]+K%j3J&JX&UVn-L
))+CYY__%_8CM''MM"**

ZZ__#3_4T$$    <1bjj4@ALS#C(LA+78<C{3<8::$$"X$>JVW,3sQw"37+B>,W ! 
 	 bjj"S&ABCC(B>NO ! 	
 		 bjj"11B>NO( ! 	
 		 (bjj"''78 11 ! 
 	 	bjj"'11B>NO( ! 

 
	 $te'9UZL N-=NN 
!12 01
 	
    "%56$45  	 
	
 G NN4%/uOPQ 
 t o4ERST
 	
 to5TRST
 	 r"	
 	      JA  "h#a&!!"6 B8 Xs)   N= 6O7O<7P=	O4)O//O4c           
          i }t        ddd| d   | d   | d         |d<   d| v rt        ddd	g | d   | d
   | d         |d	<   |S )NzLoss by eliminated featuresr   eliminated_featureseliminated_features_namesr-  eliminated_features_tagsz Loss by eliminated features tagszfeatures tagsfeatures_tagsfeatures_tags_loss_graphfeatures_tags_cost_graph)r  )r8  )summaryresults     r   #plot_features_selection_loss_graphsrB  <  s    F;%%&+,F: "W,"D.././9:#
 Mr   c                   v    e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d0 fd	Zd Zd Zd Zd Zd Z	d Z
d	 Zd
 Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd1dZd Zd Z d Z!d  Z"d! Z#d" Z$d# Z%d$ Z&d% Z'd& Z(d' Z)d( Z*d) Z+	 	 	 d2d*Z,d+ Z-d, Z.	 	 	 d3d-Z/d4d.Z0d/ Z1 xZ2S )5PoolzH
    Pool used in CatBoost as a data structure to train model from.
    c                 ,   t        ||      5  |P| j                  |       | j                  |       |,t        |t              t        |t              k7  rt        d      |	,t        |t              t        |	t              k7  rt        d      |t        |t              st        d      t        |t              r`t        d |||||||||||fD              rt        d      |t        |t              st        d      | j                  ||||	||
|||	       nRt        |t              r#t        d |||||fD              rSt        d	      t        |t        j                        r|j                  j                  d
k(  r|t        |      dkD  rt        d      |j                  j                  d
k(  r|t        |      dkD  rt        d      |j                  j                  dk7  r|t        |      dkD  r|t        d      t        |t        j                  j                         rl|j                  j                  d
k(  r|t        |      dkD  rt        d      |t        |      dkD  rt        d      |t        |      dkD  r|t        d      ||t        d      t        |t"              r"t        |      t        |      k7  rVt        d      t        |t$              r0t'        |      t'        |j)                               k7  rt        d      t        d      t        |t              rt        d      | j+                  ||||||||	||||||||||       n|st        d      t,        t.        | c          ddd       y# 1 sw Y   yxY w)at  
        Pool is an internal data structure that is used by CatBoost.
        You can construct Pool from list, numpy.ndarray, pandas.DataFrame, pandas.Series.

        Parameters
        ----------
        data : list or numpy.ndarray or pandas.DataFrame or pandas.Series or FeaturesData or string or pathlib.Path
            Data source of Pool.
            If list or numpy.ndarrays or pandas.DataFrame or pandas.Series, giving 2 dimensional array like data.
            If FeaturesData - see FeaturesData description for details, 'cat_features' and 'feature_names'
              parameters must be equal to None in this case
            If string or pathlib.Path, giving the path to the file with data in catboost format.
              If string starts with "quantized://", the file has to contain quantized dataset saved with Pool.save().

        label : list or numpy.ndarrays or pandas.DataFrame or pandas.Series, optional (default=None)
            Labels data.
            If not None, can be a single- or two- dimensional array with either:
              - numerical values - for regression (including multiregression), ranking and binary classification problems
              - class labels (boolean, integer or string) - for classification (including multiclassification) problems
            If `data` parameter points to a file, Label data is loaded from it as well. This parameter must
              be None in this case.

        cat_features : list or numpy.ndarray, optional (default=None)
            If not None, giving the list of Categ features indices or names.
            If it contains feature names, Pool's feature names must be defined: either by passing 'feature_names'
              parameter or if data is pandas.DataFrame (feature names are initialized from it's column names)
            Must be None if 'data' parameter has FeaturesData type

        text_features : list or numpy.ndarray, optional (default=None)
            If not None, giving the list of Text features indices or names.
            If it contains feature names, Pool's feature names must be defined: either by passing 'feature_names'
              parameter or if data is pandas.DataFrame (feature names are initialized from it's column names)
            Must be None if 'data' parameter has FeaturesData type

        embedding_features : list or numpy.ndarray, optional (default=None)
            If not None, giving the list of Embedding features indices or names.
            If it contains feature names, Pool's feature names must be defined: either by passing 'feature_names'
              parameter or if data is pandas.DataFrame (feature names are initialized from it's column names)
            Must be None if 'data' parameter has FeaturesData type

        embedding_features_data : list or dict, optional (default=None)
            If not None, giving the data of Embedding features (instead of data in main 'data' parameter).
            If list - list containing 2d arrays (lists or numpy.ndarrays or scipy.sparse.spmatrix) with [n_data_size x embedding_size] elements
            If dict - dict containing 2d arrays (lists or numpy.ndarrays or scipy.sparse.spmatrix) with [n_data_size x embedding_size] elements
                Dict keys must be the same as specified in 'embedding_features' parameter

        column_description : string or pathlib.Path, optional (default=None)
            ColumnsDescription parameter.
            There are several columns description types: Label, Categ, Num, Auxiliary, DocId, Weight, Baseline, GroupId, Timestamp.
            All columns are Num as default, it's not necessary to specify
            this type of columns. Default Label column index is 0 (zero).
            If None, Label column is 0 (zero) as default, all data columns are Num as default.
            If string or pathlib.Path, giving the path to the file with ColumnsDescription in column_description format.

        pairs : list or numpy.ndarray or pandas.DataFrame or string or pathlib.Path
            The pairs description.
            If list or numpy.ndarrays or pandas.DataFrame, giving 2 dimensional.
            The shape should be Nx2, where N is the pairs' count. The first element of the pair is
            the index of winner object in the training set. The second element of the pair is
            the index of loser object in the training set.
            If string or pathlib.Path, giving the path to the file with pairs description.

        graph: list or numpy.ndarray or pandas.DataFrame or string or pathlib.Path
            The graph description.
            ...

        delimiter : string, optional (default='	')
            Delimiter to use for separate features in file.
            Should be only one symbol, otherwise would be taken only the first character of the string.

        has_header : bool optional (default=False)
            If True, read column names from first line.

        ignore_csv_quoting : bool optional (default=False)
            If True ignore quoting '"'.

        weight : list or numpy.ndarray, optional (default=None)
            Weight for each instance.
            If not None, giving 1 dimensional array like data.

        group_id : list or numpy.ndarray, optional (default=None)
            group id for each instance.
            If not None, giving 1 dimensional array like data.

        group_weight : list or numpy.ndarray, optional (default=None)
            Group weight for each instance.
            If not None, giving 1 dimensional array like data.

        subgroup_id : list or numpy.ndarray, optional (default=None)
            subgroup id for each instance.
            If not None, giving 1 dimensional array like data.

        pairs_weight : list or numpy.ndarray, optional (default=None)
            Weight for each pair.
            If not None, giving 1 dimensional array like pairs.

        baseline : list or numpy.ndarray, optional (default=None)
            Baseline for each instance.
            If not None, giving 2 dimensional array like data.

        timestamp: list or numpy.ndarray, optional (default=None)
            Timestamp for each instance.
            Should be a non-negative integer.
            Useful for sorting a learning dataset by this field during training.

        feature_names : list or string or pathlib.Path, optional (default=None)
            If list - list of names for each given data_feature.
            If string or pathlib.Path - path with scheme for feature names data to load.
            If this parameter is None and 'data' is pandas.DataFrame feature names will be initialized
              from DataFrame's column names.
            Must be None if 'data' parameter has FeaturesData type

        feature_tags : json, optional (default=None)
            Format:
            {'tag1':
                {
                    'features': [<ids or names of features>],
                    'cost': <positive integer>
                }
             'tag2':
                {
                 ...
                }
            ...
            }

        thread_count : int, optional (default=-1)
            Thread count for data processing.
            If -1, then the number of threads is set to the number of CPU cores.

        log_cout: output stream or callback for logging (default=None)
            If None is specified, sys.stdout is used

        log_cerr: error stream or callback for logging (default=None)
            If None is specified, sys.stderr is used

        Nz3data and pairs parameters should be the same types.z3data and graph parameters should be the same types.z\data should be the string or pathlib.Path type if column_description parameter is specified.c              3   $   K   | ]  }|d u 
 y wr(   r   r   vs     r   r   z Pool.__init__.<locals>.<genexpr>	  s       ` 3_Q1D= 3_   zcat_features, text_features, embedding_features, embedding_features_data, weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, label should have the None type when the pool is read from the file.zbfeature_names should have None or string or pathlib.Path type when the pool is read from the file.c              3   $   K   | ]  }|d u 
 y wr(   r   rG  s     r   r   z Pool.__init__.<locals>.<genexpr>  s        Q  7Pq}  7PrI  zcat_features, text_features, embedding_features, embedding_features_data, feature_names should have the None type when 'data' parameter has FeaturesData typer   r   z'data' is numpy array of floating point numerical type, it means no categorical features, but 'cat_features' parameter specifies nonzero number of categorical featuresz'data' is numpy array of floating point numerical type, it means no text features, but 'text_features' parameter specifies nonzero number of text featuresOz'data' is numpy array of non-object type, it means no embedding features, but 'embedding_features' parameter specifies nonzero number of embedding featuresz'data' is scipy.sparse.spmatrix of floating point numerical type, it means no categorical features, but 'cat_features' parameter specifies nonzero number of categorical featuresz'data' is scipy.sparse.spmatrix, it means no text features, but 'text_features' parameter specifies nonzero number of text featuresz'data' is scipy.sparse.spmatrix and 'embedding_features_data' is None, it means no embedding features, but 'embedding_features' parameter specifies nonzero number of embedding featureszZ'embedding_features_data' is not None, but 'embedding_features' parameter is not specifiedzX'embedding_features_data' and 'embedding_features' contain different numbers of featureszPkeys of 'embedding_features_data' dict do not correspond to 'embedding_features'z@'embedding_features_data' must have either 'list' or 'dict' typez`feature_names must be None or have non-string type when the pool is created from python objects.z'data' parameter can't be None)re   _check_data_type_check_data_emptyrg   rs   rD   r  _readFeaturesDatar   r   dtypekindr_   scipysparsespmatrixrh   r'  rr   keys_initsuperrD  r4   )r3   datar	  cat_featurestext_featuresembedding_featuresembedding_features_datacolumn_descriptionpairsgraph	
delimiter
has_headerignore_csv_quotingweightgroup_idgroup_weightsubgroup_idpairs_weightbaseline	timestampr   feature_tagsthread_countrW   rX   data_can_be_none	__class__s                             r   r4   zPool.__init__Y  s9   J x*%%d+&&t,$D*)ETY[eIf)f'(]^^$D*)ETY[eIf)f'(]^^%1*T::V'  )G  H  HdJ/ `<Pbd{  ~D  FN  P\3>hX]3_ ` `+m  &1J}V`<a+ A  JJt%7}V_akm  BN  O!$5  Q|]Tfh  BO  7P  Q  Q"/!O#  $D"**5 JJOOs29QX[\hXilmXm"/!q#  !JJOOs29RY\]jYknoYo"/!k#  !JJOOs29K9W^abt^uxy^y6>&3%y'" !" $D%,,*?*?@ JJOOs29QX[\hXilmXm"/!q#  *5C<NQR<R"/!k#  /:EWAX[\A\cz  dC"/!u# 
 /:-5"/ |#  &&=tD"#56#>U:VV&3$~'" !" ((?F"#56#>U>Z>Z>\:]]&3$v'" !" #0 b#  "-<+. 
 JJtUL-I[]tv{  ~C  EK'{LRZ\egt  wC  EQR%#$DEE$&({ +**s   M3N

Nc                     t        ||||fd      D ]b  \  }}t        |      }||j                  d      dk(  s)t        j                  j                  |      rIt        dj                  ||             y)z(
        Check files existence.
        )rX  r]  r^  r_  Nz://rJ   z*Invalid {} path='{}': file does not exist.)zipru   findr)   r*   isfilerD   r   )r3   rX  r]  r^  r_  itemr   s          r   _check_fileszPool._check_files]  sq     t%7FHxyJD$$<D|yy2%bggnnT.B#$P$W$WX\^b$cdd zr   c                     t        |t              s#t        dj                  t	        |                  t        |      dk  r#t        dj                  t        |                  y )Nz*Invalid delimiter type={} : must be str().r   z*Invalid delimiter length={} : must be > 0.)rg   rt   rD   r   r   r_   )r3   r`  s     r   _check_delimiterzPool._check_delimiterh  sY    )\2 L S STXYbTc deey>A L S STWXaTb cdd r   c                 j    t        |t              s#t        dj                  t	        |                  y)z=
        Check type of column_description parameter.
        zDInvalid column_description type={}: must be str() or pathlib.Path().N)rg   rs   rD   r   r   )r3   r]  s     r   _check_column_description_typez#Pool._check_column_description_typen  sC     ,j9 f m mnr  tF  oG  !H  I  I :r   c                     t        |t        t        j                  f      s$t	        dj                  |t        |                  y)z6
        Check type of cat_feature parameter.
        z3Invalid {} type={}: must be list() or np.ndarray().N)rg   rh   r   r   rD   r   r   )r3   r   features_names      r   _check_string_feature_typezPool._check_string_feature_typeu  s=     (T2::$67 U \ \]jlpqylz {|| 8r   c                     t        |      D ]]  \  }}t        |t              s&t        dj	                  |||t        |                  ||k\  sBt        dj	                  ||||             y)zM
        Check values in cat_feature parameter. Must be int indices.
        z1Invalid {}[{}] = {} value type={}: must be int().z.Invalid {}[{}] = {} value: index must be < {}.N)ri   rg   rm   rD   r   r   )r3   r   features_country  indxfeatures         r   _check_string_feature_valuez Pool._check_string_feature_value|  s     'x0MD'g}5#$W$^$^_lnrt{  ~B  CJ  ~K  %L  M  M.(#$T$[$[\ikoqx  {I  %J  K  K	 1r   c                     t        |t        t        j                  t        f      s#t        dj                  t        |                  y)z0
        Check type of pairs parameter.
        zDInvalid pairs type={}: must be list(), np.ndarray() or pd.DataFrame.N)rg   rh   r   r   r   rD   r   r   r3   r^  s     r   _check_pairs_typezPool._check_pairs_type  s=     %$

I!>? f m mnrsxny z{{ @r   c                    t        |      D ]v  \  }}t        |      dk7  rt        dj                  |            t        |      D ];  \  }}t	        |t
              rt        dj                  |||t        |                   x y)zG
        Check values in pairs parameter. Must be int indices.
        r   z%Length of pairs[{}] isn't equal to 2.z8Invalid pairs[{}][{}] = {} value type={}: must be int().N)ri   r_   rD   r   rg   rm   r   )r3   r^  pair_idpairirw   s         r   _check_pairs_valuezPool._check_pairs_value  s     'u-MGTD	Q#$K$R$RSZ$[\\%dO5!%7'(b(i(ijqstv{  ~B  CH  ~I  )J  K  K , .r   c                     t        |t        t        t        t        f      s#t        dj                  t        |                  y)z%
        Check type of data.
        zInvalid data type={}: data must be list(), np.ndarray(), DataFrame(), Series(), FeaturesData  scipy.sparse matrix or filename str() or pathlib.Path().N)rg   rs   rk   SPARSE_MATRIX_TYPESrO  rD   r   r   )r3   rX  s     r   rL  zPool._check_data_type  sB     $[:M| \]MNTfUYZ^U_N`  ^r   c           	      X   t        |t              r|st        d      yt        |t        t        f      rt        |t
              r/t        j                  t        j                  |t                    }nt        j                  |      }t        |      dk(  rX|d   dkD  rPt        |d   t              r%t        |t        t        |d         g      z         }nt        |t        dg      z         }t        |      dk(  st        dj                  |            |d   dk(  rt        d      yy)	z
        Check that data is not empty (0 objects is ok).
        note: already checked if data is FeatureType, so no need to check again
        zFeatures filename is empty.rP  r   r   r   z7Input data has invalid shape: {}. Must be 2 dimensionalz)Input data must have at least one featureN)rg   rs   rD   rk   r  rh   r   shapeasarrayobjectr_   r   rq   r   )r3   rX  
data_shapes      r   rM  zPool._check_data_empty  s    dJ'#$ABB {,?@A$%XXbjjV&DE
XXd^
:!#
1(9d1gx0!&zE3tAw<.4I'I!JJ!&zE1#J'>!?Jz?a'#$]$d$deo$pqq!}!#$OPP " Br   c                 j    t        |t              s#t        dj                  t	        |                  y)z&
        Check type of label.
        z*Invalid label type={}: must be array like.Nrg   rk   rD   r   r   r3   r	  s     r   _check_label_typezPool._check_label_type  s1     %- L S STXY^T_ `aa .r   c                 6    t        |      dk(  rt        d      y)z+
        Check label is not empty.
        r   zLabels variable is empty.N)r_   rD   r  s     r   _check_label_emptyzPool._check_label_empty  s      u:? ;<< r   c                 h    t        |      |k7  r$t        dj                  t        |      |            y)z3
        Check label length and dimension.
        z6Length of label={} and length of data={} is different.Nr_   rD   r   )r3   r	  samples_counts      r   _check_label_shapezPool._check_label_shape  s6     u:& X _ _`cdi`jly z{{ 'r   c                 j    t        |t              s#t        dj                  t	        |                  y)z3
        Check type of baseline parameter.
        z-Invalid baseline type={}: must be array like.Nr  r3   rh  s     r   _check_baseline_typezPool._check_baseline_type  1     (K0 O V VW[\dWe fgg 1r   c                 "   t        |      |k7  r$t        dj                  t        |      |            t        |d   t              rt        |d   t
              rt        d      	 t        j                  |      j                  t        j                  d      t        j                  d      t        j                  d      fvr
t               y# t        $ r8 t        dj                  t        j                  |      j                              w xY w)	z6
        Check baseline length and dimension.
        z:Length of baseline={} and length of data={} are different.r   z=Baseline must be 2 dimensional data, 1 column for each class.rp   float32rn   z5Invalid baseline value type={}: must be float or int.N)	r_   rD   r   rg   r   rt   r   arrayrP  )r3   rh  r  s      r   _check_baseline_shapezPool._check_baseline_shape  s     x=M) \ c cdghpdq  tA  !B  C  C(1+x0Jx{L4Y _``	zxx!''0A288ICVXZX`X`afXg/hh#o% i 	z W ^ ^_a_g_ghp_q_w_w xyy	zs   %A'C ADc                 j    t        |t              s#t        dj                  t	        |                  y)z1
        Check type of weight parameter.
        z+Invalid weight type={}: must be array like.Nr  r3   rc  s     r   _check_weight_typezPool._check_weight_type  s1     &+. M T TUYZ`Ua bcc /r   c                     t        |      |k7  r$t        dj                  t        |      |            t        |d   t        t
        f      s&t        dj                  t        |d                     y)z&
        Check weight length.
        z8Length of weight={} and length of data={} are different.r   zWInvalid weight value type={}: must be 1 dimensional data with int, float or long types.N)r_   rD   r   rg   rm   ro   r   )r3   rc  r  s      r   _check_weight_shapezPool._check_weight_shape  s     v;-' Z a abeflbmo| }~~&)m[%AB y  !A  !A  BF  GM  NO  GP  BQ  !R  S  S Cr   c                 j    t        |t              s#t        dj                  t	        |                  y)z3
        Check type of group_id parameter.
        z-Invalid group_id type={}: must be array like.Nr  r3   rd  s     r   _check_group_id_typezPool._check_group_id_type  r  r   c                 h    t        |      |k7  r$t        dj                  t        |      |            y)z(
        Check group_id length.
        z:Length of group_id={} and length of data={} are different.Nr  )r3   rd  r  s      r   _check_group_id_shapezPool._check_group_id_shape  sC     x=M) \ c cdghpdq  tA  !B  C  C *r   c                 j    t        |t              s#t        dj                  t	        |                  y)z7
        Check type of group_weight parameter.
        z1Invalid group_weight type={}: must be array like.Nr  r3   re  s     r   _check_group_weight_typezPool._check_group_weight_type  s1     ,4 S Z Z[_`l[m noo 5r   c                     t        |      |k7  r$t        dj                  t        |      |            t        |d   t              s&t        dj                  t        |d                     y)z,
        Check group_weight length.
        z>Length of group_weight={} and length of data={} are different.r   zPInvalid group_weight value type={}: must be 1 dimensional data with float types.N)r_   rD   r   rg   ro   r   )r3   re  r  s      r   _check_group_weight_shapezPool._check_group_weight_shape	  s     |- ` g ghklxhy  |I  !J  K  K,q/K9 r y yz~  @L  MN  @O  {P  !Q  R  R :r   c                 j    t        |t              s#t        dj                  t	        |                  y)z6
        Check type of subgroup_id parameter.
        z0Invalid subgroup_id type={}: must be array like.Nr  r3   rf  s     r   _check_subgroup_id_typezPool._check_subgroup_id_type  s1     +{3 R Y YZ^_jZk lmm 4r   c                 h    t        |      |k7  r$t        dj                  t        |      |            y)z+
        Check subgroup_id length.
        z=Length of subgroup_id={} and length of data={} are different.Nr  )r3   rf  r  s      r   _check_subgroup_id_shapezPool._check_subgroup_id_shape  sD     {}, _ f fgjkvgw  zG  !H  I  I -r   c                 j    t        |t              s#t        dj                  t	        |                  y)z4
        Check type of timestamp parameter.
        z.Invalid timestamp type={}: must be array like.Nr  r3   ri  s     r   _check_timestamp_typezPool._check_timestamp_type   s1     )[1 P W WX\]fXg hii 2r   c                    t        |t              s#t        dj                  t	        |                  |j                         D ]  \  }}t        |t              s$t        dj                  |t	        |                  d|vrt        dj                  |            t        |d   t              s't        dj                  |t	        |d                     d|vrd|d<   nQt        |d   t        t        f      s't        dj                  |t	        |d                     t        |d         |d<   t        t        |d               D ]{  }t        |d   |   t              rt        |d   |   t              r#|!	 |j                  |d   |         }||d   |<   St        d
j                  |t	        |d   |                       |S # t        $ r" t        d	j                  ||d   |               w xY w)Nz0Invalid feature_tags type={}: must be dict like.zUInvalid type of value in feature_tags by key {}, value type is {}: must be dict like.r   zBInvalid value in feature_tags by key {}, key 'features' is needed.zbInvalid type of value in feature_tags by key {}, value type of features is {}: must be array like.cost      ?z[Invalid type of value in feature_tags by key {}, value type of cost is {}: must be integer.zUnknown feature in tag {}: {}zQInvalid type of feature in tag {}, value type is {}: must be int or feature name.)rg   r'  rD   r   r   itemsrk   rm   r   rn   ranger_   rw   
ValueError)r3   tagsr   tag_nametag_featuresr5  
feature_ids          r   _check_transform_tagszPool._check_transform_tags'  s   $% R Y YZ^_cZd eff&*jjl"HllD1#${  %C  %C  DL  NR  S_  N`  %a  b  b-#$h$o$opx$yzzl:6D#  %I  %P  %P  QY  [_  `l  mw  `x  [y  %z  {  {\)'*V$!,v"68LM'  )F  )M  )M  NV  X\  ]i  jp  ]q  Xr  )s  t  t'*<+?'@V$Sj!9:;l:6s;]KZ 8 =sCHa}%2%8%8j9QRU9V%W
 5?L,S1'({  )C  )C  DL  NR  S_  `j  Sk  lo  Sp  Nq  )r  s  s < '32  & }+,K,R,RS[]ijt]uvy]z,{||}s   <G+G7c                 h    t        |      |k7  r$t        dj                  t        |      |            y)z)
        Check timestamp length.
        z;Length of timestamp={} and length of data={} are different.Nr  )r3   ri  r  s      r   _check_timestamp_shapezPool._check_timestamp_shapeE  sC     y>]* ] d dehires  vC  !D  E  E +r   c                     || j                         }t        |t              s#t        dj	                  t        |                  t        |      |k7  r$t        dj	                  t        |      |            y )Nz,Invalid feature_names type={} : must be listzVInvalid length of feature_names={} : must be equal to the number of columns in data={})num_colrg   r   rD   r   r   r_   )r3   r   r  s      r   _check_feature_nameszPool._check_feature_namesL  s    ?llnG-2 N U UVZ[hVi jkk}( x    AD  ER  AS  U\  !]  ^  ^ )r   c                 j    t        |t              s#t        dj                  t	        |                  y )Nz*Invalid thread_count type={} : must be int)rg   rm   rD   r   r   )r3   rk  s     r   _check_thread_countzPool._check_thread_countT  s/    ,6 L S STXYeTf ghh 7r   c                     t        |t              s#t        dj                  t	        |                  t        d d      }|j                  | |       |S )Nz6Invalid rindex type={} : must be list or numpy.ndarrayTrl  )rg   rk   rD   r   r   rD  _take_slice)r3   rindex
slicedPools      r   slicez
Pool.sliceX  sL    &+. X _ _`dek`l mnn$6
tV,r   c                 j    t        d d      }t        d d      }| j                  ||||||       ||fS )NTr  )rD  _train_eval_split)r3   has_timeis_classificationeval_fractionsave_eval_pool
train_pool	eval_pools          r   train_eval_splitzPool.train_eval_split_  s@    $6
5	z9h@QS`bpq9$$r   c                     | j                  |       t        |t              r|j                  }| j	                  |       | j                  |       | S r(   )r  rg   r   valuesr  
_set_pairsr  s     r   	set_pairszPool.set_pairse  sB    u%eY'LLE&r   c                 J    | j                  |       | j                  |       | S r(   )r  _set_feature_namesr3   r   s     r   set_feature_nameszPool.set_feature_namesm  s#    !!-0.r   c                     | j                  |       | j                  |      }t        j                  || j	                         df      }| j                  || j	                                | j                  |       | S rI   )r  _if_pandas_to_numpyr   reshapenum_rowr  _set_baseliner  s     r   set_baselinezPool.set_baseliner  sd    !!(+++H5::h(<=""8T\\^<8$r   c                     | j                  |       | j                  |      }| j                  || j                                | j	                  |       | S r(   )r  r  r  r  _set_weightr  s     r   
set_weightzPool.set_weightz  sI    '))&1  8 r   c                     | j                  |       | j                  |      }| j                  || j                                | j	                  |       | S r(   )r  r  r  r  _set_group_idr  s     r   set_group_idzPool.set_group_id  sI    !!(+++H5""8T\\^<8$r   c                     | j                  |       | j                  |      }| j                  || j                                | j	                  |       | S r(   )r  r  r  r  _set_group_weightr  s     r   set_group_weightzPool.set_group_weight  sI    %%l3//=&&|T\\^D|,r   c                     | j                  |       | j                  |      }| j                  || j                                | j	                  |       | S r(   )r  r  r  r  _set_subgroup_idr  s     r   set_subgroup_idzPool.set_subgroup_id  sI    $$[1..{;%%k4<<>Bk*r   c                     | j                  |       | j                  |      }| j                  || j                                | j	                  |       | S r(   )r  r  r  	num_pairs_set_pairs_weight)r3   rg  s     r   set_pairs_weightzPool.set_pairs_weight  sJ    -//=  t~~/?@|,r   c                     | j                  |       | j                  |      }| j                  || j                                | j	                  |       | S r(   )r  r  r  r  _set_timestampr  s     r   set_timestampzPool.set_timestamp  sI    ""9-,,Y7	##It||~>I&r   c                     | j                         st        d      t        |t              s#t        dj	                  t        |                  | j
                  |       y)z
        Save the quantized pool to a file.

        Parameters
        ----------
        fname : string or pathlib.Path
            Output file name.
        zPool is not quantized7Invalid fname type={}: must be str() or pathlib.Path().N)is_quantizedrD   rg   rs   r   r   _saver3   fnames     r   savez	Pool.save  sP       " 788%, Y ` `aefkal mnn

5r   c                 H   | j                         rt        d      i }t        |       ||}|j                  dd      }|j                  dd      }|r(t        dj	                  |j                                     t        ||||||||||	|
||       | j                  |       y)a  
        Quantize this pool

        Parameters
        ----------
        pool : catboost.Pool
            Dataset to quantize.

        ignored_features : list, [default=None]
            Indices or names of features that should be excluded when training.

        per_float_feature_quantization : list of strings, [default=None]
            List of float binarization descriptions.
            Format : described in documentation on catboost.ai
            Example 1: ['0:1024'] means that feature 0 will have 1024 borders.
            Example 2: ['0:border_count=1024', '1:border_count=1024', ...] means that two first features have 1024 borders.
            Example 3: ['0:nan_mode=Forbidden,border_count=32,border_type=GreedyLogSum',
                        '1:nan_mode=Forbidden,border_count=32,border_type=GreedyLogSum'] - defines more quantization properties for first two features.

        border_count : int, [default = 254 for training on CPU or 128 for training on GPU]
            The number of partitions in numeric features binarization. Used in the preliminary calculation.
            range: [1,65535] on CPU, [1,255] on GPU

        max_bin : float, synonym for border_count.

        feature_border_type : string, [default='GreedyLogSum']
            The binarization mode in numeric features binarization. Used in the preliminary calculation.
            Possible values:
                - 'Median'
                - 'Uniform'
                - 'UniformAndQuantiles'
                - 'GreedyLogSum'
                - 'MaxLogSum'
                - 'MinEntropy'

        sparse_features_conflict_fraction : float, [default=0.0]
            CPU only. Maximum allowed fraction of conflicting non-default values for features in exclusive features bundle.
            Should be a real value in [0, 1) interval.

        nan_mode : string, [default=None]
            Way to process missing values for numeric features.
            Possible values:
                - 'Forbidden' - raises an exception if there is a missing value for a numeric feature in a dataset.
                - 'Min' - each missing value will be processed as the minimum numerical value.
                - 'Max' - each missing value will be processed as the maximum numerical value.
            If None, then nan_mode=Min.

        input_borders : string or pathlib.Path, [default=None]
            input file with borders used in numeric features binarization.

        task_type : string, [default=None]
            The calcer type that will be used to train the model after quantization.
            Possible values:
                - 'CPU'
                - 'GPU'

        used_ram_limit=None

        random_seed : int, [default=None]
            The random seed used for data sampling.
            If None, 0 is used.
        zPool is already quantizedNr   r   z'got an unexpected keyword arguments: {})r  rD   _process_synonymsr`   r   rU  r   	_quantize)r3   r   r   r   max_binr   r   r   r   r   r   r   kwargsr   r   r   s                   r   quantizezPool.quantize  s    B  ;<<&!"L$jj)>E06

;bdh0i- I P PQWQ\Q\Q^ _``$V-=?]_k%8:[]p%-}iYd%J	L
 	vr   c                     t        |t              r|j                  }t        |t              r"t	        j
                  |j                        d   }|S )Nr   )rg   r   r  r   r   	transpose)r3   r  s     r   r  zPool._if_pandas_to_numpy  s;    eV$LLEeY'LL.q1Er   c                 v    t        |t              r|j                  }t        |t              r|j                  }|S r(   )rg   r   r  r   r  s     r   _label_if_pandas_to_numpyzPool._label_if_pandas_to_numpy  s-    eV$LLEeY'LLEr   c                 6   t        ||      5  | j                  ||||       | j                  |       |d}n| j                  |       |||fD ]  }|d}	 | j	                  |	       | j                  ||||||d   |||	|

       ddd       y# 1 sw Y   yxY w)z&
        Read Pool from file.
        Nr   r   )re   rs  ru  rw  r  
_read_pool)r3   	pool_filer]  r^  r_  feature_names_pathr`  ra  rb  rk  quantization_paramsrW   rX   rr  s                 r   rN  z
Pool._read  s    & x*i);UEJ!!),!)%'"334FG 2E:<D ; $$\2OO""!"# +**s   AB2BBc           	      0   t        |j                        }|t        |t              rBt        |j	                               }|"t        |      t        |      k7  rt        d      ||z   S |t        d      t        |D cg c]  }t        |t               c}      st        d      t        |      }g }d}	t        t        |      t        |      z         D ]4  }
|
|v r|j                  d|
z         |j                  ||	          |	dz  }	6 |S |S c c}w )NzDkeys of embedding_features_data and embedding_features are differentz`embedding_features is not specified but embedding_features_data without feature names is presentzeembedding_features contain feature names but embedding_features_data without feature names is presentr   z_embedding_feature_%ir   )rh   columnsrg   r'  rU  rr   rD   allrm   r  r_   rV   )r3   data_as_data_framer\  r[   non_embedding_data_feature_namesembedding_feature_namesembedding_feature_idembedding_features_setr   non_embedding_feature_idxfeature_idxs              r   _infer_feature_nameszPool._infer_feature_namesE  sI   +/0B0J0J+K(".148*./F/K/K/M*N'%1-.#6M2NN+,rss7:QQQ%-'  )K  L  Lhz{hzPdJ';]Khz{|'  )P  Q  Q),-?)@& ",-)#(-M)NQTUgQh)h#iK"&<<%,,-D{-RS%,,-MNg-hi1Q61 $j %$33! |s    Dc                 	   t        |t              r|| j                  |||      }t        |t              r|j                  j                         }t        |t              r!|j                         }|j                         }n||zt        |t              rt        j                  |t              }t        t        j                  |            dk(  rt        j                  |d      }t        j                  |      \  }}|Zt        |      z  }t        |t               rt        |j	                               }n|}|D ]  }t        |      k7  st#        d       d}||| j%                  |       | j'                  |       | j)                  |      }t        t        j                  |            dk(  rt        j
                  |d      }| j+                  |       || j-                  |       |1t/        ||      }| j1                  |d       | j3                  |d       |1t/        ||      }| j1                  |d       | j3                  |d       |1t/        ||      }| j1                  |d       | j3                  |d       |V| j5                  |       t        |t              r|j                  }| j7                  |       t        j                  |      d   }|>| j5                  |       t        |t              r|j                  }| j7                  |       |	4| j9                  |	       | j;                  |	      }	| j=                  |	       |
4| j?                  |
       | j;                  |
      }
| jA                  |
       |4| jC                  |       | j;                  |      }| jE                  |       |4| jG                  |       | j;                  |      }| jI                  |       |4| j9                  |       | j;                  |      }| j=                  ||       |L| jK                  |       | j;                  |      }t        jL                  |d	f      }| jO                  ||       |4| jQ                  |       | j;                  |      }| jS                  |       || jU                  ||      }| jW                  |||||||||	|
||||||||       y)
z7
        Initialize Pool from array like data.
        Nr  r   z]samples count in 'embeddings_features_data' does not correspond to samples count in main datar   rY  rZ  r[  rJ   ),rg   r   r  r   r  tolistrO  get_object_countget_feature_countrh   r   r  r  r_   r  expand_dimsr'  rD   r  r  r  r  r  r   rz  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  
_init_pool)r3   rX  r	  rY  rZ  r[  r\  r^  r_  rc  rd  re  rf  rg  rh  ri  r   rj  rk  r  r|  embedding_features_data_valuesembedding_feature_data	pairs_lens                           r   rV  z
Pool._initd  s   , dI&$ $ 9 9$@WYk ldF#;;%%'DdL) 113M!335N$%zz$f5288D>"a'~~dA.,.HHTN)M>".c"9::N148156M6T6T6V1W.1H.*H&-.-?'w  +I
 	""5)##E*2259E288E?#q(ua0##E=9$%%m^D#0}ML++L.I,,\>>Z$1-OM++M?K,,]NO\)!67I=!Y++,>@TU,,-?Qef""5)%+##E**I""5)%+##E*##F+--f5F$$V];%%h///9H&&x?#)),733LAL**<G"((522;?K))+}E###L133LAL$$\9=%%h///9Hzz(]B,?@H&&x? &&y100;I''	=A#55lMRLe\=BTVmotv{  ~D ,\8U^`mo{  ~J	Kr   )NNNNNNNN	FFNNNNNNNNNrJ   NNFr(   )NNNNNNNNNNN)NNNra   )3r   r   r   r   r4   rs  ru  rw  rz  r  r  r  rL  rM  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rN  r  rV  __classcell__rm  s   @r   rD  rD  T  so     $ 5B)H	eeI}K|	KQ0b=|hzdShCpRnIj<E^i%
" ae[_eiUn$ !)V4>pKr   rD  c                    d }t        | t              re| }t        d ||||||	|
||f	D              rt        d      | j	                         s| j                         dk(  rt        d      |t        d      |S t        | t              rt        | |||      }|S |t        d      t        | |||||||||	|
||      }|S )	Nc              3   $   K   | ]  }|d u 
 y wr(   r   rG  s     r   r   z$_build_train_pool.<locals>.<genexpr>  s        e  'dq}  'drI  zcat_features, text_features, embedding_features, sample_weight, group_id, group_weight, subgroup_id, pairs_weight, baseline should have the None type when X has catboost.Pool type.r   z$Label in X has not been initialized.z\Incorrect value of y: X is catboost.Pool object, y must be initialized inside catboost.Pool.)rX  r^  r_  r]  z[y has not initialized in fit(): X is not catboost.Pool object, y must be not None in fit().)rY  rZ  r[  r^  r_  rc  rd  re  rf  rg  rh  )rg   rD  r  rD   	has_labelr  rs   )Xr   rY  rZ  r[  r^  r_  sample_weightrd  re  rf  rg  rh  r]  r  s                  r   _build_train_poolr-    s!   J!T
  e|]DVXegoq}  @K  MY  [c  'd  e  ec  1;;=A#5 FGG= ~  
Az	"qUOab
 	 9 }~~!Q\k}  FK  SX  an  yA'3[grz|
r   c                     dD ]W  }t         j                  j                  | |      }t         j                  j                  |      sCt        j                  |       Y y )N)zcatboost_training.json)r)   r*   joinr+   remove)	train_dirfilenamer*   s      r   _clear_training_filesr3    s;    .ww||Ix077>>$IIdO /r   c                 &    | j                  dd      S )Nr1  catboost_infor   r   s    r   _get_train_dirr7    s    ::k?33r   c                     | D ]  }t        |        	 ddlm}  ||       S # t        $ r.}t	        j
                  d       t        t        |            d }~ww xY w)Nr   )MetricVisualizerzGTo draw plots in fit() method you should install ipywidgets and ipython)r3  widgetr9  r  r  r  r   )
train_dirsr1  r9  r0  s       r   _get_catboost_widgetr<    sS    	i(  ",
++ "_`#a&!!"s   " 	A)AAc              #      K   | rt        |      }|j                          	 d  | rj                          	 |t        |      j	                  ||       y y # | rj                          w w xY wwr(   )r<  _run_update_stop_updater!   save_to_file)plot	plot_file
plot_titler;  r:  s        r   plot_wrapperrD    sm     %j1"!
+88YO  ! s   A/A 2A/A,,A/c                     t        |       dkD  sJ d       d }| D ]1  }||v s| t        ddj                  |       z   dz         ||   }||= 3 |	||| d   <   y y )Nr   z%there should be more than one synonymzonly one of the parameters , z should be initialized.r   )r_   rD   r/  )synonymsr   rv   synonyms       r   _process_synonyms_grouprI    s    x=1EEEEf #$ATYYxEX$Y\u$uvv7OEw  #x{ r   c                 8   t        ddg|        t        ddg|        t        ddg|        t        ddg|        t        d	d
g|        t        ddg|        t        g d|        t        ddg|        t        ddg|        t        ddg|        t        ddg|        y )Nlearning_rateetar   r  depth	max_depthrsmcolsample_bylevelr   random_statel2_leaf_reg
reg_lambda)
iterationsn_estimatorsnum_boost_round	num_treesod_waitearly_stopping_roundscustom_metriccustom_loss
max_leaves
num_leavesmin_data_in_leafmin_child_samples)rI  r6  s    r   _process_synonyms_groupsr`  "  s    _e4f=^Y7@Wk2F;U$78&A]N;VD]L96BZ\bcY(?@&I_m<fE\<8&A/1DEvNr   c                 `   d| v r| d   | d<   | d= d| v r7d| v r| d   dk7  rt        d      d| v sd| v rt        d      d	| d   g| d<   | d= d| v rt        | d   t        t        f      r| d   }g }d
| v rg| d
   bt	        |      t	        | d
         k7  rt        d      | d
   D ]4  }||vrt        dj                  |            |j                  ||          6 nAg }|j                         D ]'  \  }}|j                  |       |j                  |       ) || d
<   || d<   t        |        d }d| v r| d   }| d= d }d| v r| d   }| d= d }d| v r| d   }| d= d }	d| v r| d   }	| d= d }
d| v r| d   }
| d= t        ||||	|
      \  }}}||| d<   ||| d<   ||| d<   d| v rt        | d         | d<   y y )N	objectiveloss_functionscale_pos_weightLoglosszIscale_pos_weight is supported only for binary classification Logloss lossclass_weightsauto_class_weightszeonly one of the parameters scale_pos_weight, class_weights, auto_class_weights should be initialized.r  class_namesz9Number of classes in class_names and class_weights differzLclass "{}" is present in "class_names" but not in "class_weights" dictionaryr   r   r   r   r   r   )rD   rg   r'  r
   r_   r   rV   r  r`  r   r   )r   class_weights_dictclass_weights_listclass_labelclass_labels_listclass_weightr   r   r   r   r   s              r   r  r  0  s   f"("5;V#f$)@I)M kllf$(<(F  !H  I  I#&/A(B"C%&6!z&2IDR]K^'_#O4V#&*?*K%&#f].C*DD#$_``%m4&88'fmm' 
 #))*<[*IJ  5 !#-?-E-E-G)\!((5")),7 .H %6F=!"4V$M& /?#GF#9M& /?#Ln->"F6!8,<w|V-E)M7M  "/#y "/6!#&v.>'?#@  "r   c                     dD ][  }|| vr| |   }t        |t              rt        |      | |<   ,t        |t              r=t        |t              sNt        |      | |<   ] | S )zKReplace all occurrences of BuiltinMetric with their string representations.)rc  rb  eval_metricrZ  r[  )rg   r#   r   rt   r   stringify_builtin_metrics_list)r   r   vals      r   stringify_builtin_metricsrr  {  sc    ZF?Qic=)CF1I\*X&6s;F1I [ Mr   c                 4    t        t        t        |             S r(   )rh   r  r   )metricss    r   rp  rp    s    C!""r   c                     | j                  d      }||S |dk(  r\t        |t              st        d      |j	                         }|t        d      	 t        t        |            dkD  xr d| v}|rdS dS |d	k(  ry
y)zp
        estimator_type must be 'classifier', 'regressor', 'ranker' or None
        train_pool must be Pool
    rc  
classifierz$train_pool param must have Pool typez:loss function has not been specified and cannot be deducedr   target_border
MultiClassre  rankerYetiRankRMSE)r   rg   rD  rD   	get_labelr_   rr   )r   estimator_typer  loss_function_paramr	  is_multiclass_tasks         r   _get_loss_function_for_trainr    s     !**_5&""%*d+ FGG$$&= \]]	 !U_q0R_F5R1|@y@	8	#r   c                   T   e Zd Zd Zd Zd Zd Zd Zd Zd Z	d Z
d	 Zd
 Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Z d Z!d  Z"d! Z#d" Z$d# Z%d$ Z&d% Z'd& Z(d' Z)dJd)Z*d* Z+d+ Z,d, Z-d- Z.e/d.        Z0e/d/        Z1e/d0        Z2e/d1        Z3e/d2        Z4e/d3        Z5d4 Z6e7d5        Z8e7d6        Z9e7d7        Z:e7d8        Z;e7d9        Z<e7d:        Z=e7d;        Z>e7d<        Z?e7d=        Z@d> ZAd? ZBd@ ZCdA ZDdB ZEdC ZFdD ZGdE ZHdF ZIdG ZJdH ZKdI ZLy()K_CatBoostBasec                 <   ||j                         ni }t        |       || _        d| j                  v r-| j                  d   dk(  r| j                  j                  d       d| j                  v r| j                  d   g | j                  d<   t	               | _        y )Nrk  rJ   fixed_binary_splits)copyrr  _init_paramsr`   	_CatBoost_object)r3   r   init_paramss      r   r4   z_CatBoostBase.__init__  s    '-'9fkkmr!+.'T...43D3D^3TXZ3Z!!.1 D$5$55$:K:KLa:b:j79D34 {r   c                    | j                   j                         }| j                  j                         }|r||d<   | j	                         r| j                         |d<   dD ]   }t        | |d       t        | |d       ||<   " |S )N_test_evals__model_prediction_values_change_loss_value_change)r  r  r  _get_test_evals	is_fitted_serialize_modelgetattr)r3   r   
test_evalsattrs       r   __getstate__z_CatBoostBase.__getstate__  s    ""'')\\113
$.F=!>> $ 5 5 7F9GDtT4(4&tT48t H r   c                    dt        | j                  j                               vrt               | _        dt        | j                  j                               vri | _        d|v r| j                  |d          |d= d|v r| j                  |d   g       |d= d|v r| j                  |d          |d= dD ]  }||v st        | |||          ||=  | j
                  j                  |       y )Nr  r  r  
_test_evalr  r  )
r'  __dict__r  r  r  r  _load_from_string_set_test_evalssetattrr   )r3   stater  s      r   __setstate__z_CatBoostBase.__setstate__  s    D!4!4!677$;DLdmm&9&9&;!<< "D""5#34i 5   %"5!67l#E!  }!56m$GDu}dE$K0$K H 	  'r   c                 $    | j                  d       S r(   )__deepcopy__r@   s    r   __copy__z_CatBoostBase.__copy__  s      &&r   c                 h    | j                         }| j                         }|j                  |       |S r(   )r  rm  r  )r3   _r  models       r   r  z_CatBoostBase.__deepcopy__  s/    !!# 5!r   c                 Z    | j                  |      xr | j                  |j                  k(  S r(   _is_comparable_tor  r3   others     r   __eq__z_CatBoostBase.__eq__  s%    %%e,N1NNr   c                 \    | j                  |       xs | j                  |j                  k7  S r(   r  r  s     r   __ne__z_CatBoostBase.__ne__  s(    ))%00QDLLEMM4QQr   c                 "    | j                         S r(   )r  r@   s    r   r  z_CatBoostBase.copy  s    }}r   c                      t        | dd       d uS N_random_seed)r  r@   s    r   r  z_CatBoostBase.is_fitted  s    t^T2$>>r   c                     t        |t              syd| fd|ffD ]1  \  }}|j                         rd}t        |j	                  |             y)NFr  r   zDThe {} argument is not fitted, only fitted models could be compared.T)rg   r  r  rD   r   )r3   rhsr   	estimatorr6   s        r   r  z_CatBoostBase._is_comparable_to  sW    #}-!'#?OD)&&(0#GNN4$899	  @
 r   c                 F   t        | dd       t        | d| j                  j                                t        | d| j                  j                                t        | d| j                  j	                                t        | d| j                  j                                y )N_is_fitted_Tr  _learning_rate_tree_count_n_features_in)r  r  _get_random_seed_get_learning_rate_get_tree_count_get_n_features_inr@   s    r   _set_trained_model_attributesz+_CatBoostBase._set_trained_model_attributes  st    mT*ndll&C&C&EF&(G(G(IJmT\\%A%A%CD&(G(G(IJr   c                 ~    | j                   j                  |||||r|j                   nd        | j                          y r(   )r  _trainr  )r3   r  	test_poolr   allow_clear_pool
init_models         r   r  z_CatBoostBase._train  s5    J	6;KcmZM_M_swx**,r   c                 :    | j                   j                  |       y r(   )r  r  r3   r  s     r   r  z_CatBoostBase._set_test_evals  s    $$Z0r   c                    | j                   j                         }t        |      dk(  r&| j                         rt	        d      t	        d      t        |      dkD  rt	        d      |d   }t        |      dk(  r|d   S |S )Nr   /The model has been trained without an eval set.!You should train the model first.r   z.With multiple eval sets use 'get_test_evals()'r  r  r_   r  rD   )r3   r  	test_evals      r   get_test_evalz_CatBoostBase.get_test_eval  s}    \\113
z?a~~#$UVV#$GHHz?Q PQQqM	"9~2y|A	Ar   c                     | j                   j                         }t        |      dk(  r&| j                         rt	        d      t	        d      |S )Nr   r  r  r  r  s     r   get_test_evalsz_CatBoostBase.get_test_evals  sH    \\113
z?a~~#$UVV#$GHHr   c                 6    | j                   j                         S r(   )r  _get_metrics_evalsr@   s    r   get_evals_resultz_CatBoostBase.get_evals_result      ||..00r   c                 6    | j                   j                         S r(   )r  _get_best_scorer@   s    r   get_best_scorez_CatBoostBase.get_best_score  s    ||++--r   c                 6    | j                   j                         S r(   )r  _get_best_iterationr@   s    r   get_best_iterationz _CatBoostBase.get_best_iteration      ||//11r   c                 6    | j                   j                         S r(   )r  r  r@   s    r   get_n_features_inz_CatBoostBase.get_n_features_in"  r  r   c                 6    | j                   j                         S r(   )r  _get_float_feature_indicesr@   s    r   r  z(_CatBoostBase._get_float_feature_indices%  s    ||6688r   c                 6    | j                   j                         S r(   )r  _get_cat_feature_indicesr@   s    r   r  z&_CatBoostBase._get_cat_feature_indices(  s    ||4466r   c                 6    | j                   j                         S r(   )r  _get_text_feature_indicesr@   s    r   r  z'_CatBoostBase._get_text_feature_indices+  s    ||5577r   c                 6    | j                   j                         S r(   )r  _get_embedding_feature_indicesr@   s    r   r  z,_CatBoostBase._get_embedding_feature_indices.  s    ||::<<r   c           	      D    | j                   j                  |||||||      S r(   )r  _base_predict)r3   poolprediction_typentree_start	ntree_endrk  r   r   s           r   r  z_CatBoostBase._base_predict1  s&    ||))$iYegnpyzzr   c                 B    | j                   j                  ||||||      S r(   )r  
_base_virtual_ensembles_predict)r3   r  r  r  virtual_ensembles_countrk  r   s          r   r  z-_CatBoostBase._base_virtual_ensembles_predict4  s3    ||;;D/S\^u  xD  FM  N  	Nr   c           	      D    | j                   j                  |||||||      S r(   )r  _staged_predict_iterator)r3   r  r  r  r  eval_periodrk  r   s           r   r  z&_CatBoostBase._staged_predict_iterator7  s1    ||44T?KYbdoq}  @G  H  	Hr   c                 <    | j                   j                  |||      S r(   )r  _leaf_indexes_iterator)r3   r  r  r  s       r   r  z$_CatBoostBase._leaf_indexes_iterator:  s    ||224iPPr   c                 @    | j                   j                  |||||      S r(   )r  _base_calc_leaf_indexes)r3   r  r  r  rk  r   s         r   r  z%_CatBoostBase._base_calc_leaf_indexes=  s!    ||33D+yR^`ghhr   c	           
      p    t        |t              r|n|g}	| j                  j                  ||	||||||      S r(   )rg   rh   r  _base_eval_metrics)
r3   r  metrics_descriptionr  r  r  rk  
result_dirtmp_dirmetrics_description_lists
             r   r  z _CatBoostBase._base_eval_metrics@  sU    :DEXZ^:_#6fyez ||..t5M{\egr  uA  CM  OV  W  	Wr   c                 b    | j                   j                  |j                  ||||||||	|
||      S r(   )r  
_calc_fstrr   )r3   r   r  reference_datark  r   model_output	shap_modeinteraction_indicesshap_calc_typesage_n_samplessage_batch_sizesage_detect_convergences                r   r  z_CatBoostBase._calc_fstrD  sC     ||&&II#
 	
r   c	           
      F    | j                   j                  ||||||||      S r(   )r  
_calc_ostr)	r3   r  r  top_size	ostr_typeupdate_methodimportance_values_signrk  r   s	            r   r  z_CatBoostBase._calc_ostrb  s7    ||&&z9h	S`bx  {G  IP  Q  	Qr   c                 \    | j                   j                  ||       | j                          y r(   )r  _base_shrinkr  )r3   r  r  s      r   r	  z_CatBoostBase._base_shrinke  s"    !!+y9**,r   c                 8    | j                   j                          y r(   )r  _base_drop_unused_featuresr@   s    r   r  z(_CatBoostBase._base_drop_unused_featuresi  s    //1r   c                     dd l }| j                         r;d}|r |j                  |t              }| j                  j                  ||||       y y )Nr   r   )cls)jsonr  dumps_NumpyAwareEncoderr  _save_model)r3   output_filer   export_parametersr  r  params_strings          r   r  z_CatBoostBase._save_modell  sH    >>M  *

+<BT ULL$$[&-N r   c                 0   t        |t              s#t        dj                  t	        |                  i | _        | j                  j                  ||       | j                          t        | j                               D ]  \  }}|| j
                  |<    y )Nr  )rg   rs   rD   r   r   r  r  _load_modelr  r   _get_params)r3   
model_filer   rz   rv   s        r   r  z_CatBoostBase._load_modelu  s~    *j1 Y ` `aefpaq rss  V4**,#D$4$4$67JC%*Dc" 8r   c                 6    | j                   j                         S r(   )r  r  r@   s    r   r  z_CatBoostBase._serialize_model~  s    ||,,..r   c                 h    t        |t              sJ d       | j                  j                  |       y )NzNot bytes passed as argument)rg   r   r  _deserialize_modelr3   dump_model_strs     r   r  z _CatBoostBase._deserialize_model  s*    .%0P2PP0''7r   c                 F    | j                  |       | j                          y r(   )r  r  r  s     r   r  z_CatBoostBase._load_from_string  s    /**,r   c                 Z    | j                   j                  |       | j                          y r(   )r  _load_from_streamr  )r3   streams     r   r   z_CatBoostBase._load_from_stream  s     &&v.**,r   Nc                 $   ||D cg c]  }d }}|D cg c]  }|j                    }}| j                   j                  |||       t        | dd       t        | dd       t        | d| j                   j                                y c c}w c c}w )Nr  r  r   r  r  )r  _sum_modelsr  r  )r3   models_baseweightsctr_merge_policyr  r  models_inners          r   r#  z_CatBoostBase._sum_models  s    ?$/0KqsKG03>?;%;?  w8HIna(&*mT\\%A%A%CD 1?s
   	BBc                 p    | j                         st        d      | j                  j                  |       y )NzcThere is no trained model to use save_borders(). Use fit() to train model. Then use save_borders().)r  rD   r  _save_borders)r3   r  s     r   r)  z_CatBoostBase._save_borders  s3    ~~  !F  G  G"";/r   c                 l    | j                         st        d      | j                  j                         S )NziThere is no trained model to use get_feature_borders(). Use fit() to train model. Then use get_borders().)r  rD   r  _get_bordersr@   s    r   r+  z_CatBoostBase._get_borders  s4    ~~  !L  M  M||((**r   c                 Z    | j                         sJ | j                  j                         S r(   )r  r  _get_nan_treatmentsr@   s    r   r-  z!_CatBoostBase._get_nan_treatments  s%    ~~||//11r   c                     | j                   j                         }| j                  j                         }t	        |      D ]  \  }}||vs|||<    |S r(   )r  r  r  r  r   )r3   r   r  rz   rv   s        r   r  z_CatBoostBase._get_params  sS    ))+'',,.
#K0JC& #s 1 r   c                 <    t        | t              xr t        |       S r(   )rg   r   is_classification_objectiverc  s    r   _is_classification_objectivez*_CatBoostBase._is_classification_objective  s    --\2Mm2\\r   c                 <    t        | t              xr t        |       S r(   )rg   r   is_regression_objectiver1  s    r   _is_regression_objectivez&_CatBoostBase._is_regression_objective  s    --X2I-2XXr   c                 <    t        | t              xr t        |       S r(   )rg   r   is_multiregression_objectiver1  s    r   _is_multiregression_objectivez+_CatBoostBase._is_multiregression_objective  s    --]2N}2]]r   c                 <    t        | t              xr t        |       S r(   )rg   r   is_multitarget_objectiver1  s    r   _is_multitarget_objectivez'_CatBoostBase._is_multitarget_objective  s    --Y2J=2YYr   c                 <    t        | t              xr t        |       S r(   )rg   r   is_survivalregression_objectiver1  s    r    _is_survivalregression_objectivez._CatBoostBase._is_survivalregression_objective  s    --`2QR_2``r   c                 <    t        | t              xr t        |       S r(   )rg   r   is_ranking_metricr1  s    r   _is_ranking_objectivez#_CatBoostBase._is_ranking_objective  s    --R2CM2RRr   c                 6    | j                   j                         S r(   )r  _get_metadata_wrapperr@   s    r   get_metadataz_CatBoostBase.get_metadata  s    ||1133r   c                 >    | j                         rt        | d      S d S )Nr  r  r  r@   s    r   tree_count_z_CatBoostBase.tree_count_  s    /3~~/?wt]+ITIr   c                 >    | j                         rt        | d      S d S r  rF  r@   s    r   
random_seed_z_CatBoostBase.random_seed_  s    040@wt^,JdJr   c                 >    | j                         rt        | d      S d S )Nr  rF  r@   s    r   learning_rate_z_CatBoostBase.learning_rate_      26..2Bwt-.LLr   c                 >    | j                         rt        | d      S d S )Nr  rF  r@   s    r   n_features_in_z_CatBoostBase.n_features_in_  rL  r   c                 Z    | j                         r| j                  j                         S d S r(   )r  r  _get_feature_namesr@   s    r   feature_names_z_CatBoostBase.feature_names_  s#    48NN4Dt||..0N$Nr   c                 Z    | j                         r| j                  j                         S d S r(   )r  r  _get_class_labelsr@   s    r   classes_z_CatBoostBase.classes_  s#    37>>3Ct||--/MMr   c                 "    | j                         S r(   )r  r@   s    r   evals_result_z_CatBoostBase.evals_result_  s    $$&&r   c                 "    | j                         S r(   )r  r@   s    r   best_score_z_CatBoostBase.best_score_  s    ""$$r   c                 "    | j                         S r(   )r  r@   s    r   best_iteration_z_CatBoostBase.best_iteration_  s    &&((r   c                 :    | j                   j                  ||      S r(   )r  _get_tree_splits)r3   tree_idxr  s      r   r\  z_CatBoostBase._get_tree_splits  s    ||,,Xt<<r   c                 8    | j                   j                  |      S r(   )r  _get_tree_leaf_valuesr3   r]  s     r   r_  z#_CatBoostBase._get_tree_leaf_values  s    ||11(;;r   c                 8    | j                   j                  |      S r(   )r  _get_tree_step_nodesr`  s     
r   rb  z"_CatBoostBase._get_tree_step_nodes  s    ||00::r   c                 8    | j                   j                  |      S r(   )r  _get_tree_node_to_leafr`  s     r   rd  z$_CatBoostBase._get_tree_node_to_leaf  s    ||228<<r   c                 6    | j                   j                         S )z
        Returns
        -------
        tree_leaf_counts : 1d-array of numpy.uint32 of size tree_count_.
        tree_leaf_counts[i] equals to the number of leafs in i-th tree of the ensemble.
        )r  _get_tree_leaf_countsr@   s    r   get_tree_leaf_countsz"_CatBoostBase.get_tree_leaf_counts  s     ||1133r   c                 6    | j                   j                         S )a  
        Returns
        -------
        leaf_values : 1d-array of leaf values for all trees.
        Value corresponding to j-th leaf of i-th tree is at position
        sum(get_tree_leaf_counts()[:i]) + j (leaf and tree indexing starts from zero).
        )r  _get_leaf_valuesr@   s    r   get_leaf_valuesz_CatBoostBase.get_leaf_values   s     ||,,..r   c                 6    | j                   j                         S )z
        Returns
        -------
        leaf_weights : 1d-array of leaf weights for all trees.
        Weight of j-th leaf of i-th tree is at position
        sum(get_tree_leaf_counts()[:i]) + j (leaf and tree indexing starts from zero).
        )r  _get_leaf_weightsr@   s    r   get_leaf_weightsz_CatBoostBase.get_leaf_weights
  s     ||--//r   c                 :    | j                   j                  |       y)a  
        Sets values at tree leafs of ensemble equal to new_leaf_values.

        Parameters
        ----------
        new_leaf_values : 1d-array with new leaf values for all trees.
        It's size should be equal to sum(get_tree_leaf_counts()).
        Value corresponding to j-th leaf of i-th tree should be at position
        sum(get_tree_leaf_counts()[:i]) + j (leaf and tree indexing starts from zero).
        N)r  _set_leaf_values)r3   new_leaf_valuess     r   set_leaf_valuesz_CatBoostBase.set_leaf_values  s     	%%o6r   c                 :    | j                   j                  |       y)z
        Sets feature names equal to feature_names

        Parameters
        ----------
        feature_names: 1-d array of strings with new feature names in the same order as in pool
        N)r  r  r  s     r   r  z_CatBoostBase.set_feature_names!  s     	''6r   c                     ddddddddddddd}t        | j                        }|i }t        |       d|v xr |d   dk(  |d<   |j                  dd      }|d	k(  xs |d
k(  |d<   d|vxs |d   dk7  |d<   |S )NFT)requires_positive_Xrequires_positive_y
requires_y
poor_scoreno_validation	statelesspairwise
multilabel
_skip_testmultioutput_onlybinary_onlyrequires_fitr   GPUnon_deterministicrc  r   	MultiRMSERMSEWithUncertaintymultioutputr   	Forbidden	allow_nan)r   r  r  r   )r3   r  r   rc  s       r   	_get_tagsz_CatBoostBase._get_tags+  s    #(#(! %  " $++,>F&!$/6$9$Zf[>QUZ>Z !

?B7,;e}Pe?e]&f4Yz8Jk8Y[r   c                 6    | j                   j                         S r(   )r  _get_scale_and_biasr@   s    r   get_scale_and_biasz _CatBoostBase.get_scale_and_biasF  r  r   c                     t        |t              r| j                  j                  ||g       y | j                  j                  ||       y r(   )rg   ro   r  _set_scale_and_bias)r3   scalebiass      r   set_scale_and_biasz _CatBoostBase.set_scale_and_biasI  s6    dK(LL,,UTF;LL,,UD9r   NIntersectingCountersAverage)Mr   r   r   r4   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r	  r  r  r  r  r  r  r   r#  r)  r+  r-  r  rb   r2  r5  r8  r;  r>  rA  rD  propertyrG  rI  rK  rN  rQ  rT  rV  rX  rZ  r\  r_  rb  rd  rg  rj  rm  rq  r  r  r  r  r   r   r   r  r    sp   #
(('OR?K-1
B1.21978={NHQiW
<Q-2O+/8--E0
+
2 ] ] Y Y ^ ^ Z Z a a S S4 J J K K M M M M O O N N ' ' % % ) )=<;=4/07762:r   r  c           	          || v r\t        | |   t              r	| |   g| |<   t        | |   t              s,t        d|z   dj	                  t        | |               z         y y )Nz	Invalid `z.` type={} : must be string or list of strings.)rg   rt   r   rD   r   r   )r   rz   s     r   _cast_value_to_list_of_stringsr  P  sl    
f}fSk<0!#;-F3K&+x0c 14d4k4klpqwx{q|l}4~ ~ 1 r   c                    t        | t        t        f      s#t        dj	                  t        |                   d| v r9t        | d   t              s&t        dj	                  t        | d                     d| v r9t        | d   t              s&t        dj	                  t        | d                     t        | d       t        | d       t        | d       d	| v rHt        | d	   t        t        z   t        fz         s&t        d
j	                  t        | d	                     d| v rHt        | d   t        t        z   t        fz         s&t        dj	                  t        | d                     d| v rHt        | d   t        t        z   t        fz         s&t        dj	                  t        | d                     d| v rIt        | d   t        t        z   t        fz         s&t        dj	                  t        | d                     y y )Nz'Invalid params type={}: must be dict().ctr_descriptionz9Invalid ctr_description type={} : must be list of stringsctr_target_border_countz>Invalid ctr_target_border_count type={} : must be integer typer[  rZ  r   monotone_constraintszfInvalid `monotone_constraints` type={} : must be string or list of ints in range {{-1, 0, 1}} or dict.feature_weightszMInvalid `feature_weights` type={} : must be string or list of floats or dict.first_feature_use_penaltieszYInvalid `first_feature_use_penalties` type={} : must be string or list of floats or dict.per_object_feature_penaltieszZInvalid `per_object_feature_penalties` type={} : must be string or list of floats or dict.)rg   r   r	   rD   r   r   r   rm   r  rt   rk   r'  r6  s    r   _check_param_typesr  X  s   fw78ELLTRX\Z[[F"&!23X> [ b bcghn  pA  iB  dC  !D  E  E F*&!:;]K ` g ghlms  uN  nO  iP  !Q  R  R"6=9"6?;"6+KL'&!78,:TX\W^:^_  !I  !P  !P  QU  V\  ]s  Vt  Qu  !v  w  wF"&!23\K5OSWRY5YZ o v vw{  }C  DU  }V  xW  !X  Y  Y$.&!>?
P[A[_c^eAef {  !C  !C  DH  IO  Pm  In  Do  !p  q  q%/&!?@,Q\B\`d_fBfg |  !D  !D  EI  JP  Qo  Jp  Eq  !r  s  s h 0r   c                 P    i }t        |       D ]  \  }}t        |      }|||<    |S r(   )r   rj   )r   casted_paramsrz   rv   s       r   _params_type_castr  r  s6    M'
U#E*"c ( r   c                     t        | t        t        t        ft        z         ryt        | t
              s#t        dj                  t        |                   t        t        j                  |             dk(  S )NFzInvalid data type={} : must be list, numpy.ndarray, pandas.Series, pandas.DataFrame, scipy.sparse matrix, catboost.FeaturesData or catboost.Poolr   )rg   rD  rO  r   r  rk   rD   r   r   r_   r   r  )rX  s    r   _is_data_single_objectr  z  sd    $|Y7:MMNdK(KKQ6RVW[R\K]
 	
 rxx~!##r   c                    ||vr| S |dk(  rd}n|dk(  rd}n|dk(  rd}nt        d|z         t        |t              rt        ||   |j	                               }|dk(  r|j                         }n&|dk(  r|j                         }n|j                         }t        |      t        |      k7  rt        |dz   t        |      z   d	z   |z   d
z   t        |      z         t        |t              rt        d      | Lt        |       t        ||         k7  r2t        |dz   t        ||         z   dz   |z   dz   t        |       z         ||   } ||= | S )NrY  categoricalrZ  r   r[  	embeddingzUnknown params_name=z* features indices in the model are set to z and train dataset z features indices are set to zkCategorical features are set in the model. It is not allowed to use FeaturesData type for training dataset.z" features in the model are set to z. z, features passed to fit function are set to )rD   rg   rD  r   get_feature_namesget_cat_feature_indicesget_text_feature_indicesget_embedding_feature_indicesrr   r   rO  )feature_indicesr  r   
param_namefeature_type_namefeature_indices_from_paramsfeature_indices_from_pools          r   _process_feature_indicesr    s   ^#)		&"	+	+'2Z?@@$&;F:<NPTPfPfPh&i#'(,(D(D(F%?*(,(E(E(G%(,(J(J(L%()S1L-MM 14` `"%&A"B!C 5!68I!JLk!l !$$= >!? @ @ 
D,	'y{ 	{ &3+?3vjGYCZ+Z 14X X[^_efp_q[r r $!%'8!9;i!jlop  mA!A B B ,zr   c                       e Zd ZdZd; fd	Zd Z	 	 	 	 	 	 d<dZ	 d=dZ	 	 	 	 	 	 d>dZd;d	Z	d?d
Z
d@dZdAdZd ZdBdZd ZdCdZd ZdDdZd ZdEdZd Zd Zd Zd=dZdFdZdGdZdHdZed        Zdej>                  dddddddd dd!d"d#ddfd$Z 	 	 	 dId%Z!dJd&Z"d' Z#dKd(Z$dLd)Z%d* Z&dMd+Z'd, Z(d- Z)d. Z*d/ Z+dNd0Z,dOd1Z-	 	 	 dPd2Z.d3 Z/d4 Z0d;d5Z1	 	 	 	 dQd6Z2	 	 	 	 dRd7Z3	 	 	 	 dSd8Z4	 	 	 	 dTd9Z5d: Z6 xZ7S )UCatBoostzO
    CatBoost model. Contains training, prediction and evaluation methods.
    Nc                 ,    t         t        |   |       y)a  
        Initialize the CatBoost.

        Parameters
        ----------
        params : dict
            Parameters for CatBoost.
            If  None, all params are set to their defaults.
            If  dict, overriding parameters present in dict.
        N)rW  r  r4   )r3   r   rm  s     r   r4   zCatBoost.__init__  s     	h&v.r   c                     t        | dd      dk(  xs% t        j                  |j                  dd            }|j	                  |j                  dd      ||d   |      S )	z
        returns:
            train_pool, eval_pool
                eval_pool will be uninitialized if save_eval_pool is false
        _estimator_typeNrv  rc  r{  r  Fr  )r  r  r2  r   r  )r3   r  r   r  r  s        r   _dataset_train_eval_splitz"CatBoost._dataset_train_eval_split  s     %T+<dC|S  hXe  YC  YC  DJ  DN  DN  O^  `f  Dg  Yh**JJz5)?#	
 	
r   c                 T   t        | j                        }|i }t        |       t        |t              rt        j                  d       t        |||d      }t        |||d      }t        |||d      }t        |||||||||	|
||||      }|j                  rt        d      t        |t               } t        |t        | dd       |      |d<   t        |||||      \  }}}|||d<   |||d	<   |||d
<   |||d<   |d|d<   ||d<   d|v r|d= |||d<   |||d<   |||d<   |t        |      |d<   t!        |       t#        |      }t%        |       |j'                  dd      dk7  r$|t        d      | j)                  ||d      \  }}t        |t*              r|n|g}!g }"d}#|!D ]  }t        |t              rI|"j-                  |       |#|"d   j/                         z  }#|"d   j/                         dk(  sRt        d      t        |t0              rc|"j-                  t        ||             |#|"d   j/                         z  }#|"d   j/                         dk(  st        dj3                  |            t        |t4              rt7        |      dk7  r3t        dj3                  t9        t5        d  |D                                |d   |d!   t        d"      |"j-                  t        |d   |d!   |j;                         |j=                         |j?                         #             |#|"d   j/                         z  }#|"d   j/                         dk(  st        d$      |t7        |!      d!kD  st        d%      t        d&j3                  tA        |                   | jC                  d      r|#dk(  rt        d'      |*t        |t0              r	 tE               jG                  |      }||"|| |d)S # tH        $ r}$t        d(j3                  |$            d }$~$ww xY w)*NzFeaturesData is deprecated for using in fit function and soon will not be supported. 
If you want to use FeaturesData, please pass it to Pool initialization and use Pool in fitrY  rZ  r[  zX is empty.r  rc  r   r   r   use_best_modelIterod_typerX  od_pvalsave_snapshotsnapshot_filesnapshot_interval	callbacksr          z)Both eval_fraction and eval_set specifiedTr  r   rJ   zEmpty 'eval_set' in Pool)r]  zEmpty 'eval_set' in file {}r   z0Invalid shape of 'eval_set': {}, must be (X, y).c              3   2   K   | ]  }t        |        y wr(   r   )r   r  s     r   r   z1CatBoost._prepare_train_params.<locals>.<genexpr>(	  s'       lG  ~Fxylpqrls  ~Fs   r   z1'eval_set' tuple contains at least one None value)rY  rZ  r[  zEmpty 'eval_set' in tuplez(Multiple eval set shall not contain Nonez[Invalid type of 'eval_set': {}, while expected Pool or (X, y) or filename, or list thereof.zFTo employ param {'use_best_model': True} provide non-empty 'eval_set'.z"Error while loading init_model: {})r  	eval_setsr   r  r  )%r   r  r  rg   rO  r  r  r  r-  	is_empty_rD   rD  r  r  r   _TrainCallbacksWrapperr  r  _check_train_paramsr   r  rh   rV   r  rs   r   rq   r_   r   r  r  r  r   	get_paramr  
load_modelr   )%r3   r+  r   rY  rZ  r[  r^  r_  r,  rd  re  rf  rg  rh  r  eval_setr   r   rA  rB  r]  r   r   r   rY  r  r  r  r  r  r   r  r  eval_set_listr  eval_total_row_countr0  s%                                        r   _prepare_train_paramszCatBoost._prepare_train_params  s    $++,>F&!a&MM V W 0aX06?[56H!VUij&q!\=J\^cej'4hk[g'/1CE
 ..)!T22">D+T2#
 1A7M<1I-w $&3F?#$&3F?# 'F9%'5F#$ , &F9 5F9F"9%$&3F?#$&3F?#(*;F&' "8"CF;6""6*F#::os+s2##$OPP#'#A#A*fei#A#j J$.x$>XJ	 %H(D)  *$	"(=(=(??$R=((*a/'(BCCHj1  hCU!VW$	"(=(=(??$R=((*a/'(E(L(LX(VWWHe,x=A%'(Z(a(abefk  lG  ~F  lG  gG  cH  )I  J  JA;&(1+*='([\\    %/%G%G%I&0&I&I&K+5+S+S+U %	"(=(=(??$R=((*a/'(CDD!}%)'(RSS#  %B  %I  %I  JN  OW  JX  %Y  Z  ZC &F >>*+0D0I hii"
:z(JT%Z22:>

 %" 0$
 	
  T#$H$O$OPQ$RSSTs   O? ?	P'P""P'c                     t        ||      5  |t        d      |#t        |t        t        fz         st        d       | j
                  d,i d|d|d|d|d|d|d	|d
|d|	d|
d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|} | d    }!| d!   }"| d"   }#t        ||d#t        | j                               g      5  | j                  |"| d$   |!|#| d          d d d        | j                  j                         }$|$rt        |$      rnt        | j                               d%kD  rn| j                  j                         sh|#rDt!        |||||||||	|
||||      }"|!j#                  d&d'      d'k7  r| j%                  |"|!d()      \  }"}%| j'                  |"t(        j*                  *       n | j'                  t(        j*                  +       d d d        | S # 1 sw Y   xY w# 1 sw Y   | S xY w)-NX must not be NonezCy may be None only when X is an instance of catboost.Pool or stringr+  r   rY  rZ  r[  r^  r_  r,  rd  re  rf  rg  rh  r  r  r   r   rA  rB  r]  r   r   r   rY  r  r  r  r  r  r   r  r  zTraining plotsr  r   r  r  Fr  )rX  r   r  r   )re   rD   rg   rs   rD  r  rD  r7  
get_paramsr  r  _get_loss_function_nameis_groupwise_metricr_   r  _has_leaf_weights_in_modelr-  r   r  get_feature_importancer   r   )&r3   r+  r   rY  rZ  r[  r^  r_  r,  rd  re  rf  rg  rh  r  r  r   r   rA  rB  r]  r   r   r   rY  r  r  r  r  r  rW   rX   train_paramsr   r  r  lossr  s&                                         r   _fitzCatBoost._fitO	  s   
 x*y#$899yAzTG/C!D#$ijj5455 			'3	CP	ew		#(	8E	PX	gs	 (	 7C	 NV	 gu	 "		 ,3		 CP		 W[		 gp		
 $6	
 EQ	
 an	 	 6K	 [h	 ,	 @Q	 ^h	 $	L "(+F%l3J+,>?dI/?.QUQ`Q`QbBcAde -$ . f <<779D+D1d88:;a?||>>@'%6().!!)$('($.&
  "::os;sB,0,J,J:W]ns,J,tMJ//ZiFfFf/g//Y5U5U/Vy +| S fe+ +| s%   CG=G18C/G=1G:	6G==HFc                       | j                   g |||||||||	|
||||||||||||||||||||| S )a>  
        Fit the CatBoost model.

        Parameters
        ----------
        X : catboost.Pool or list or numpy.ndarray or pandas.DataFrame or pandas.Series
             or string.
            If not catboost.Pool or catboost.FeaturesData it must be 2 dimensional Feature matrix
             or string - file with dataset.

             Must be non-empty (contain > 0 objects)

        y : list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None)
            Labels of the training data.
            If not None, can be a single- or two- dimensional array with either:
              - numerical values - for regression (including multiregression), ranking and binary classification problems
              - class labels (boolean, integer or string) - for classification (including multiclassification) problems
            Use only if X is not catboost.Pool and does not point to a file.

        cat_features : list or numpy.ndarray, optional (default=None)
            If not None, giving the list of Categ columns indices.
            Use only if X is not catboost.Pool and not catboost.FeaturesData

        text_features: list or numpy.ndarray, optional (default=None)
            If not none, giving the list of Text columns indices.
            Use only if X is not catboost.Pool and not catboost.FeaturesData

        embedding_features: list or numpy.ndarray, optional (default=None)
            If not none, giving the list of Embedding columns indices.
            Use only if X is not catboost.Pool and not catboost.FeaturesData

        pairs : list or numpy.ndarray or pandas.DataFrame
            The pairs description.
            If list or numpy.ndarrays or pandas.DataFrame, giving 2 dimensional.
            The shape should be Nx2, where N is the pairs' count. The first element of the pair is
            the index of the winner object in the training set. The second element of the pair is
            the index of the loser object in the training set.

        graph : list or numpy.ndarray or pandas.DataFrame
            The graph edges list description.
            If list or numpy.ndarrays or pandas.DataFrame, giving 2 dimensional.

        sample_weight : list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None)
            Instance weights, 1 dimensional array like.

        group_id : list or numpy.ndarray, optional (default=None)
            group id for each instance.
            If not None, giving 1 dimensional array like data.
            Use only if X is not catboost.Pool.

        group_weight : list or numpy.ndarray, optional (default=None)
            Group weight for each instance.
            If not None, giving 1 dimensional array like data.

        subgroup_id : list or numpy.ndarray, optional (default=None)
            subgroup id for each instance.
            If not None, giving 1 dimensional array like data.
            Use only if X is not catboost.Pool.

        pairs_weight : list or numpy.ndarray, optional (default=None)
            Weight for each pair.
            If not None, giving 1 dimensional array like pairs.

        baseline : list or numpy.ndarray, optional (default=None)
            If not None, giving 2 dimensional array like data.
            Use only if X is not catboost.Pool.

        use_best_model : bool, optional (default=None)
            Flag to use best model

        eval_set : catboost.Pool or list of catboost.Pool or tuple (X, y) or list [(X, y)], optional (default=None)
            Validation dataset or datasets for metrics calculation and possibly early stopping.

        logging_level : string, optional (default=None)
            Possible values:
                - 'Silent'
                - 'Verbose'
                - 'Info'
                - 'Debug'

        metric_period : int
            Frequency of evaluating metrics.

        verbose : bool or int
            If verbose is bool, then if set to True, logging_level is set to Verbose,
            if set to False, logging_level is set to Silent.
            If verbose is int, it determines the frequency of writing metrics to output and
            logging_level is set to Verbose.

        silent : bool
            If silent is True, logging_level is set to Silent.
            If silent is False, logging_level is set to Verbose.

        verbose_eval : bool or int
            Synonym for verbose. Only one of these parameters should be set.

        plot : bool, optional (default=False)
            If True, draw train and eval error in Jupyter notebook

        plot_file : file-like or str, optional (default=None)
            If not None, save train and eval error graphs to file

        early_stopping_rounds : int
            Activates Iter overfitting detector with od_wait parameter set to early_stopping_rounds.

        save_snapshot : bool, [default=None]
            Enable progress snapshotting for restoring progress after crashes or interruptions

        snapshot_file : string or pathlib.Path, [default=None]
            Learn progress snapshot file path, if None will use default filename

        snapshot_interval: int, [default=600]
            Interval between saving snapshots (seconds)

        init_model : CatBoost class or string or pathlib.Path, [default=None]
            Continue training starting from the existing model.
            If this parameter is a string or pathlib.Path, load initial model from the path specified by this string.

        callbacks : list, optional (default=None)
            List of callback objects that are applied at end of each iteration.

        log_cout: output stream or callback for logging (default=None)
            If None is specified, sys.stdout is used

        log_cerr: error stream or callback for logging (default=None)
            If None is specified, sys.stderr is used

        Returns
        -------
        model : CatBoost
        )r  ) r3   r+  r   rY  rZ  r[  r^  r_  r,  rd  re  rf  rg  rh  r  r  r   r   rA  rB  r]  r   r   r   rY  r  r  r  r  r  rW   rX   s                                    r   fitzCatBoost.fit	  sL   R tyy u uA u| u] u<N uPU uW\ u^k umu u  xD u  FQ u%u'/u1?uAIuKRuTaucguiru+u-9u;HuJPuRgu 'u )6u 8Iu KUu W`u bju ltu 	ur   c           
         | j                         r| j                  t        dj                  |            t	        |      }t        |t              svt        |r|gn||t        |t              s| j                         nd t        |t              s| j                         nd t        |t              s| j                         nd |      }||fS )NzVThere is no trained model to use {}(). Use fit() to train model. Then use this method.)rX  r	  rY  rZ  r[  rk  )r  rG  rD   r   r  rg   rD  rO  r  r  r  )r3   rX  parent_method_namerk  r	  is_single_objects         r   _process_predict_input_dataz$CatBoost._process_predict_input_data"
  s    ~~4#3#3#; "STZTZ[mTnp p1$7$%/dVTDNtUaDbT::<hlFPQUWcFdd<<>jnPZ[_amPn4#F#F#Htx)D %%%r   c                     t        |t              s#t        dj                  t	        |                  ||vr*t        dj                  |dj                  |                  y )Nz/Invalid prediction_type type={}: must be str().z0Invalid value of prediction_type={}: must be {}.rF  )rg   rt   rD   r   r   r/  )r3   r  valid_prediction_typess      r   _validate_prediction_typez"CatBoost._validate_prediction_type2
  sp    /<8 Q X XY]^mYn opp"88 R Y YZikoktkt  vL  lM  !N  O  O 9r   c	           	          |xs | j                  d      }|d}| j                  |||      \  }}	| j                  |       | j                  |||||||      }
|	r|
d   S |
S Nr   Fr   )r  r  r  r  )r3   rX  r  r  r  rk  r   r  r   data_is_single_objectpredictionss              r   _predictzCatBoost._predict8
  sy    6T^^I6?G&*&F&FtM_am&n##&&7((YXdfmoxy!6{1~GKGr   rJ   c           
      2    | j                  ||||||d|      S )a
  
        Predict with data.

        Parameters
        ----------
        data : catboost.Pool or list of features or list of lists or numpy.ndarray or pandas.DataFrame or pandas.Series
                or catboost.FeaturesData
            Data to apply model on.
            If data is a simple list (not list of lists) or a one-dimensional numpy.ndarray it is interpreted
            as a list of features for a single object.

        prediction_type : string, optional (default='RawFormulaVal')
            Can be:
            - 'RawFormulaVal' : return raw value.
            - 'Class' : return class label.
            - 'Probability' : return probability for every class.
            - 'Exponent' : return Exponent of raw formula value.
            - 'RMSEWithUncertainty': return standard deviation for RMSEWithUncertainty loss function
              (logarithm of the standard deviation is returned by default).

        ntree_start: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) (zero-based indexing).

        ntree_end: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) (zero-based indexing).
            If value equals to 0 this parameter is ignored and ntree_end equal to tree_count_.

        thread_count : int (default=-1)
            The number of threads to use when applying the model.
            Allows you to optimize the speed of execution. This parameter doesn't affect results.
            If -1, then the number of threads is set to the number of CPU cores.

        verbose : bool, optional (default=False)
            If True, writes the evaluation metric measured set to stderr.

        task_type : string, [default=None]
            The evaluator type.
            Possible values:
                - 'CPU'
                - 'GPU' (models with only numerical features are supported for now)

        Returns
        -------
        prediction :
            If data is for a single object, the return value depends on prediction_type value:
                - 'RawFormulaVal' : return raw formula value.
                - 'Class' : return class label.
                - 'Probability' : return one-dimensional numpy.ndarray with probability for every class.
            otherwise numpy.ndarray, with values that depend on prediction_type value:
                - 'RawFormulaVal' : one-dimensional array of raw formula value for each object.
                - 'Class' : one-dimensional array of class label for each object.
                - 'Probability' : two-dimensional numpy.ndarray with shape (number_of_objects x number_of_classes)
                  with probability for every class for each object.
        predictr  r3   rX  r  r  r  rk  r   r   s           r   r  zCatBoost.predictB
  s%    n }}T?KLZaclnwxxr   c                 T   |xs | j                  d      }|d}| j                  |||      \  }}| j                  |ddg       |dk(  r| j                  }| j	                  ||||||      }	|dk(  r1|	j
                  }
|	j                  |
d   |t        |
d   |z              }	|r|	d   S |	S )Nr   FVirtEnsemblesTotalUncertaintyr   r   )r  r  r  rG  r  r  r  rn   )r3   rX  r  r  r  rk  r   r  r  r  r  s              r   _virtual_ensembles_predictz#CatBoost._virtual_ensembles_predict{
  s    6T^^I6?G&*&F&FtM_am&n##&&J\8]^>((I::4R[]t  wC  EL  Mo-%%E%--eAh8OQTUZ[\U]`wUwQxyK!6{1~GKGr   c           	      0    | j                  ||||||d      S )a
  
        Predict with data.

        Parameters
        ----------
        data : catboost.Pool or list of features or list of lists or numpy.ndarray or pandas.DataFrame or pandas.Series
                or catboost.FeaturesData
            Data to apply model on.
            If data is a simple list (not list of lists) or a one-dimensional numpy.ndarray it is interpreted
            as a list of features for a single object.

        prediction_type : string, optional (default='RawFormulaVal')
            Can be:
            - 'VirtEnsembles': return V (virtual_ensembles_count) predictions.
                k-th virtEnsemle consists of trees [0, T/2] + [T/2 + T/(2V) * k, T/2 + T/(2V) * (k + 1)]  * constant.
            - 'TotalUncertainty': see returned predictions format in 'Returns' part

        ntree_end: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) (zero-based indexing).
            If value equals to 0 this parameter is ignored and ntree_end equal to tree_count_.

        virtual_ensembles_count: int, optional (default=10)
            virtual ensembles count for 'TotalUncertainty' and 'VirtEnsembles' prediction types.

        thread_count : int (default=-1)
            The number of threads to use when applying the model.
            Allows you to optimize the speed of execution. This parameter doesn't affect results.
            If -1, then the number of threads is set to the number of CPU cores.

        verbose : bool, optional (default=False)
            If True, writes the evaluation metric measured set to stderr.

        Returns
        -------
        prediction :
            (with V as virtual_ensembles_count and T as trees count,
            k-th virtEnsemle consists of trees [0, T/2] + [T/2 + T/(2V) * k, T/2 + T/(2V) * (k + 1)]  * constant)
            If data is for a single object, return 1-dimensional array of predictions with size depends on prediction type,
            otherwise return 2-dimensional numpy.ndarray with shape (number_of_objects x size depends on prediction type);
            Returned predictions depends on prediction type:
            If loss-function was RMSEWithUncertainty:
                - 'VirtEnsembles': [mean0, var0, mean1, var1, ..., vark-1].
                - 'TotalUncertainty': [mean_predict, KnowledgeUnc, DataUnc].
            otherwise for regression:
                - 'VirtEnsembles':  [mean0, mean1, ...].
                - 'TotalUncertainty': [mean_predicts, KnowledgeUnc].
            otherwise for binary classification:
                - 'VirtEnsembles':  [ApproxRawFormulaVal0, ApproxRawFormulaVal1, ..., ApproxRawFormulaValk-1].
                - 'TotalUncertainty':  [DataUnc, TotalUnc].
        virtual_ensembles_predict)r  )r3   rX  r  r  r  rk  r   s          r   r  z"CatBoost.virtual_ensembles_predict
  s0    f ..t_iQhjvx  B]  ^  	^r   c	           	   #      K   |xs | j                  d      }|d}| j                  |||      \  }}	| j                  |       |dk(  r| j                  }| j	                  |||||||      }
|
D ]  }|	r|d   n|  y wr  )r  r  r  rG  r  )r3   rX  r  r  r  r  rk  r   r  r  staged_predict_iteratorr  s               r   _staged_predictzCatBoost._staged_predict
  s     6T^^I6?G&*&F&FtM_am&n##&&7>((I"&"?"?oWbdmoz  }I  KR  #S2K$9+a.{J 3s   A<A>c           
      2    | j                  |||||||d      S )a
  
        Predict target at each stage for data.

        Parameters
        ----------
        data : catboost.Pool or list of features or list of lists or numpy.ndarray or pandas.DataFrame or pandas.Series
                or catboost.FeaturesData
            Data to apply model on.
            If data is a simple list (not list of lists) or a one-dimensional numpy.ndarray it is interpreted
            as a list of features for a single object.

        prediction_type : string, optional (default='RawFormulaVal')
            Can be:
            - 'RawFormulaVal' : return raw formula value.
            - 'Class' : return class label.
            - 'Probability' : return probability for every class.
            - 'RMSEWithUncertainty': return standard deviation for RMSEWithUncertainty loss function
              (logarithm of the standard deviation is returned by default).

        ntree_start: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) with the step eval_period (zero-based indexing).

        ntree_end: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) with the step eval_period (zero-based indexing).
            If value equals to 0 this parameter is ignored and ntree_end equal to tree_count_.

        eval_period: int, optional (default=1)
            Model is applied on the interval [ntree_start, ntree_end) with the step eval_period (zero-based indexing).

        thread_count : int (default=-1)
            The number of threads to use when applying the model.
            Allows you to optimize the speed of execution. This parameter doesn't affect results.
            If -1, then the number of threads is set to the number of CPU cores.

        verbose : bool
            If True, writes the evaluation metric measured set to stderr.

        Returns
        -------
        prediction : generator for each iteration that generates:
            If data is for a single object, the return value depends on prediction_type value:
                - 'RawFormulaVal' : return raw formula value.
                - 'Class' : return class label.
                - 'Probability' : return one-dimensional numpy.ndarray with probability for every class.
            otherwise numpy.ndarray, with values that depend on prediction_type value:
                - 'RawFormulaVal' : one-dimensional array of raw formula value for each object.
                - 'Class' : one-dimensional array of class label for each object.
                - 'Probability' : two-dimensional numpy.ndarray with shape (number_of_objects x number_of_classes)
                  with probability for every class for each object.
        staged_predictr  r3   rX  r  r  r  r  rk  r   s           r   r  zCatBoost.staged_predict
  s2    f ##D/;	S^`lnu  xH  I  	Ir   c              #      K   |dk(  r| j                   }| j                  |dd      \  }}| j                  |||      }|D ]  }|  y w)Nr   iterate_leaf_indexesrJ   rk  )rG  r  r  )r3   rX  r  r  r  leaf_indexes_iterator
leaf_indexs          r   _iterate_leaf_indexeszCatBoost._iterate_leaf_indexes  sZ     >((I2249O^`2aa $ ; ;D+y Y/J 0s   AA
c                 (    | j                  |||      S )a  
        Returns indexes of leafs to which objects from pool are mapped by model trees.

        Parameters
        ----------
        data : catboost.Pool or list of features or list of lists or numpy.ndarray or pandas.DataFrame or pandas.Series
                or catboost.FeaturesData
            Data to apply model on.
            If data is a simple list (not list of lists) or a one-dimensional numpy.ndarray it is interpreted
            as a list of features for a single object.

        ntree_start: int, optional (default=0)
            Index of first tree for which leaf indexes will be calculated (zero-based indexing).

        ntree_end: int, optional (default=0)
            Index of the tree after last tree for which leaf indexes will be calculated (zero-based indexing).
            If value equals to 0 this parameter is ignored and ntree_end equal to tree_count_.

        Returns
        -------
        leaf_indexes : generator. For each object in pool yields one-dimensional numpy.ndarray of leaf indexes.
        )r  )r3   rX  r  r  s       r   r  zCatBoost.iterate_leaf_indexes
  s    . ))$YGGr   c                 z    |dk(  r| j                   }| j                  |d|      \  }}| j                  |||||      S )Nr   calc_leaf_indexes)rG  r  r  )r3   rX  r  r  rk  r   r  s          r   _calc_leaf_indexeszCatBoost._calc_leaf_indexes#  sG    >((I2249Ll[a++D+y,X_``r   c                 ,    | j                  |||||      S )a  
        Returns indexes of leafs to which objects from pool are mapped by model trees.

        Parameters
        ----------
        data : catboost.Pool or list of features or list of lists or numpy.ndarray or pandas.DataFrame or pandas.Series
                or catboost.FeaturesData
            Data to apply model on.
            If data is a simple list (not list of lists) or a one-dimensional numpy.ndarray it is interpreted
            as a list of features for a single object.

        ntree_start: int, optional (default=0)
            Index of first tree for which leaf indexes will be calculated (zero-based indexing).

        ntree_end: int, optional (default=0)
            Index of the tree after last tree for which leaf indexes will be calculated (zero-based indexing).
            If value equals to 0 this parameter is ignored and ntree_end equal to tree_count_.

        thread_count : int (default=-1)
            The number of threads to use when applying the model.
            Allows you to optimize the speed of execution. This parameter doesn't affect results.
            If -1, then the number of threads is set to the number of CPU cores.

        verbose : bool (default=False)
            Enable debug logging level.

        Returns
        -------
        leaf_indexes : 2-dimensional numpy.ndarray of numpy.uint32 with shape (object count, ntree_end - ntree_start).
            i-th row is an array of leaf indexes for i-th object.
        )r  )r3   rX  r  r  rk  r   s         r   r  zCatBoost.calc_leaf_indexes)  s    @ &&t[)\SZ[[r   c                 X    | j                         st        d      | j                         S NzModel is not fitted)r  rD   r  r@   s    r   r  z CatBoost.get_cat_feature_indicesK  s'    ~~ 566,,..r   c                 X    | j                         st        d      | j                         S r  )r  rD   r  r@   s    r   r  z!CatBoost.get_text_feature_indicesP  s'    ~~ 566--//r   c                 X    | j                         st        d      | j                         S r  )r  rD   r  r@   s    r   r  z&CatBoost.get_embedding_feature_indicesU  s'    ~~ 5662244r   c                 &   | j                         st        d      t        |t              s#t        dj	                  t        |                  |j                  rt        d      t        |t              sCt        |t              s3t        |t              s#t        dj	                  t        |                  t        t        d |            st        d      |t        j                         }t        |t              st        |t              r|g}t        |      }t        ||      5  t!        |	|
d|g      5  | j#                  ||||||||      \  }}d d d        d d d        t%        t'                    S # 1 sw Y   &xY w# 1 sw Y   *xY w)NbThere is no trained model to evaluate metrics on. Use fit() to train model. Then call this method.,Invalid data type={}, must be catboost.Pool.zData is empty.zdInvalid metrics type={}, must be list(), str() or one of builtin catboost.metrics.* class instances.c                 F    t        | t              xs t        | t              S r(   )rg   r   r#   )metrics    r   <lambda>z(CatBoost._eval_metrics.<locals>.<lambda>c  s    j&F&k*U[]jJk&kr   z[Invalid metric type: must be string() or one of builtin catboost.metrics.* class instances.zEval metrics plot)r  rD   rg   rD  r   r   r  rk   rt   r#   r  r  tempfilemkdtemprp  re   rD  r  r'  ro  )r3   rX  rt  r  r  r  rk  res_dirr  rA  rB  rW   rX   metrics_scoremetric_namess                  r   _eval_metricszCatBoost._eval_metricsZ  s   ~~  !E  F  F$% N U UVZ[_V` abb>> 011';/
7L8Yblmt  wD  cE  !G  !N  !N  OS  T[  O\  !]  ^  ^3kmtuv }~~?&&(Gg|,
7M0RiG09x*LyJ]`g_h,i*.*A*A$Q\^git  wC  EL  NU  +V'M< -j* Cm455 -j,i**s$   *F:E;F;F	 FFc                 h    | j                  ||||||t        | j                               |||	|
|      S )a]  
        Calculate metrics.

        Parameters
        ----------
        data : catboost.Pool
            Data to evaluate metrics on.

        metrics : list of strings or catboost.metrics.BuiltinMetric
            List of evaluated metrics.

        ntree_start: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) (zero-based indexing).

        ntree_end: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) (zero-based indexing).
            If value equals to 0 this parameter is ignored and ntree_end equal to tree_count_.

        eval_period: int, optional (default=1)
            Model is applied on the interval [ntree_start, ntree_end) with the step eval_period (zero-based indexing).

        thread_count : int (default=-1)
            The number of threads to use when applying the model.
            Allows you to optimize the speed of execution. This parameter doesn't affect results.
            If -1, then the number of threads is set to the number of CPU cores.

        tmp_dir : string or pathlib.Path (default=None)
            The name of the temporary directory for intermediate results.
            If None, then the name will be generated.

        plot : bool, optional (default=False)
            If True, draw train and eval error in Jupyter notebook

        plot_file : file-like or str, optional (default=None)
            If not None, save train and eval error graphs to file

        log_cout: output stream or callback for logging (default=None)
            If None is specified, sys.stdout is used

        log_cerr: error stream or callback for logging (default=None)
            If None is specified, sys.stderr is used

        Returns
        -------
        prediction : dict: metric -> array of shape [(ntree_end - ntree_start) / eval_period]
        )r  r7  r  )r3   rX  rt  r  r  r  rk  r  rA  rB  rW   rX   s               r   eval_metricszCatBoost.eval_metricsp  sl    ^ !!$iVbdrsw  tC  tC  tE  eF  HO  QU  W`  bj  lt  u  	ur   c                 8   |t        d      |t        d      |t        d      d}|d}t        j                         }t        j                  j                  |d      }t        j                  j                  |d      }t        |       t        |       t        |       t        d|	d	||g
      5  | j                  ||||||||dd|
|       |j                  ||||||||dd|
|       ddd       |rt        j                  |       yy# 1 sw Y   "xY w)a  
        Draw train and eval errors in Jupyter notebook for both models

        Parameters
        ----------
        model: CatBoost model
            Another model to draw metrics

        data : catboost.Pool
            Data to evaluate metrics on.

        metrics : list of strings or catboost.metrics.BuiltinMetric
            List of evaluated metrics.

        ntree_start: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) (zero-based indexing).

        ntree_end: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) (zero-based indexing).
            If value equals to 0 this parameter is ignored and ntree_end equal to tree_count_.

        eval_period: int, optional (default=1)
            Model is applied on the interval [ntree_start, ntree_end) with the step eval_period (zero-based indexing).

        thread_count : int (default=-1)
            The number of threads to use when applying the model.
            Allows you to optimize the speed of execution. This parameter doesn't affect results.
            If -1, then the number of threads is set to the number of CPU cores.

        tmp_dir : string or pathlib.Path (default=None)
            The name of the temporary directory for intermediate results.
            If None, then the name will be generated.

        plot_file : file-like or str, optional (default=None)
            If not None, save eval error graphs to file

        log_cout: output stream or callback for logging (default=None)
            If None is specified, sys.stdout is used

        log_cerr: error stream or callback for logging (default=None)
            If None is specified, sys.stderr is used
        Nz(You should provide model for comparison.z'You should provide data for comparison.z*You should provide metrics for comparison.FTfirst_modelsecond_modelzCompare modelsrB  rC  r;  )rA  rB  rW   rX   )rD   r
  r  r)   r*   r/  r-   rD  r  shutilrmtree)r3   r  rX  rt  r  r  r  rk  r  rB  rW   rX   need_to_remove	first_dir
second_dirs                  r   comparezCatBoost.compare  s3   X = JKK< IJJ? LMM?!N&&(GGGLL-8	WW\\'>:
(	*
+$)@P^gis]tutWk9kS_ajls$)THW_  ag{I{T`blnu%*dXX`   b v MM'"  vus   4;DDc           	      n    | j                         st        d      t        | j                  ||||||      S )a*  
        Create batch metric calcer. Could be used to aggregate metric on several pools
        Parameters
        ----------
            Same as in eval_metrics except data
        Returns
        -------
            BatchMetricCalcer object

        Usage example
        -------
        # Large dataset is partitioned into parts [part1, part2]
        model.fit(params)
        batch_calcer = model.create_metric_calcer(['Logloss'])
        batch_calcer.add(part1)
        batch_calcer.add(part2)
        metrics = batch_calcer.eval_metrics()
        r  )r  rD   BatchMetricCalcerr  )r3   rt  r  r  r  rk  r  s          r   create_metric_calcerzCatBoost.create_metric_calcer  sA    & ~~  !E  F  F wYP[]ikrssr   c                     | j                   j                         }|r+t        |      r t        j                  t        | dd             S t        j                  t        | dd             S )Nr  r  )r  r  r  r   r  r  )r3   r  s     r   feature_importances_zCatBoost.feature_importances_  sQ    ||335'-88GD*>EFF88GD*EtLMMr   AutoRawr      i   Tc                 $   t        ||      5  t        |t              st        |t              st	        d      t        |      }|dk  rt	        d      ||}t        t        |      }|t        j                  k(  rH| j                  j                         }|rt        |      rt        j                  }nt        j                  }|t        j                  k(  rR| j                  |d|      \  }}|j                         dk7  r^t	        dj!                  ||j                                     |3t        |t"              s#t	        dj!                  t%        |                  |t        j                  k(  }|xr | j                  j'                         xs |t        j(                  k(  }|s@|'|rt	        d	      t	        d
j!                  |            |j*                  rt	        d      t        t,        |
      j.                  }
| j1                  ||||||||	|
|||      \  }}|t        j                  t        j                  t        j                  t        j2                  fv r|D cg c]  }|d   	 }}d}|t        j                  k(  rd}|t        j                  k(  rd}|rt5        | ||       |r;t7        t9        ||      t;        d      d      }ddg}t=        ||      cddd       S t?        j@                  |      cddd       S |t        jB                  k(  rt        |d   d   tD              rtt?        j@                  |D cg c]H  }t?        j@                  |D cg c]%  }t?        j@                  |D cg c]  }| c}      ' c}}      J c}}}      cddd       S |D cg c]  }|D cg c]  }| c} }}}|rt=        |      cddd       S t?        j@                  |      cddd       S |t        jF                  k(  r%t        |d   d   tD              rtt?        j@                  |D cg c]H  }t?        j@                  |D cg c]%  }t?        j@                  |D cg c]  }| c}      ' c}}      J c}}}      cddd       S t?        j@                  |D cg c]m  }t?        j@                  |D cg c]H  }t?        j@                  |D cg c]%  }t?        j@                  |D cg c]  }| c}      ' c}}      J c}}}      o c}}}}      cddd       S |t        j(                  k(  rg|D  cg c]"  } t        | d         t        | d         | d   g$ }} |rg d}t=        ||      cddd       S t?        j@                  |      cddd       S 	 ddd       yc c}w c c}w c c}}w c c}}}w c c}w c c}}w c c}w c c}}w c c}}}w c c}w c c}}w c c}}}w c c}}}}w c c} w # 1 sw Y   yxY w)a  
        Parameters
        ----------
        data :
            Data to get feature importance.
            If type in ('LossFunctionChange', 'ShapValues', 'ShapInteractionValues') data must of Pool type.
                For every object in this dataset feature importances will be calculated.
            if type == 'SageValues' data must of Pool type.
                For every feature in this dataset importance will be calculated.
            If type == 'PredictionValuesChange', data is None or a dataset of Pool type
                Dataset specification is needed only in case if the model does not contain leaf weight information (trained with CatBoost v < 0.9).
            If type == 'PredictionDiff' data must contain a matrix of feature values of shape (2, n_features).
                Possible types are catboost.Pool or list of lists or numpy.ndarray or pandas.DataFrame or pandas.Series
                or catboost.FeaturesData or pandas.SparseDataFrame or scipy.sparse.spmatrix
            If type == 'FeatureImportance'
                See 'PredictionValuesChange' for non-ranking metrics and 'LossFunctionChange' for ranking metrics.
            If type == 'Interaction'
                This parameter is not used.

        type : EFstrType or string (converted to EFstrType), optional
                    (default=EFstrType.FeatureImportance)
            Possible values:
                - PredictionValuesChange
                    Calculate score for every feature.
                - LossFunctionChange
                    Calculate score for every feature by loss.
                - FeatureImportance
                    PredictionValuesChange for non-ranking metrics and LossFunctionChange for ranking metrics
                - ShapValues
                    Calculate SHAP Values for every object.
                - ShapInteractionValues
                    Calculate SHAP Interaction Values between each pair of features for every object
                - Interaction
                    Calculate pairwise score between every feature.
                - PredictionDiff
                    Calculate most important features explaining difference in predictions for a pair of documents.
                - SageValues
                    Calculate SAGE value for every feature

        prettified : bool, optional (default=False)
            change returned data format to the list of (feature_id, importance) pairs sorted by importance

        thread_count : int, optional (default=-1)
            Number of threads.
            If -1, then the number of threads is set to the number of CPU cores.

        verbose : bool or int
            If False, then evaluation is not logged. If True, then each possible iteration is logged.
            If a positive integer, then it stands for the size of batch N. After processing each batch, print progress
            and remaining time.

        fstr_type : string, deprecated, use type instead

        shap_mode : string, optional (default="Auto")
            used only for ShapValues type
            Possible values:
                - "Auto"
                    Use direct SHAP Values calculation only if data size is smaller than average leaves number
                    (the best of two strategies below is chosen).
                - "UsePreCalc"
                    Calculate SHAP Values for every leaf in preprocessing. Final complexity is
                    O(NT(D+F))+O(TL^2 D^2) where N is the number of documents(objects), T - number of trees,
                    D - average tree depth, F - average number of features in tree, L - average number of leaves in tree
                    This is much faster (because of a smaller constant) than direct calculation when N >> L
                - "NoPreCalc"
                    Use direct SHAP Values calculation calculation with complexity O(NTLD^2). Direct algorithm
                    is faster when N < L (algorithm from https://arxiv.org/abs/1802.03888)

        shap_calc_type : EShapCalcType or string, optional (default="Regular")
            used only for ShapValues type
            Possible values:
                - "Regular"
                    Calculate regular SHAP values
                - "Approximate"
                    Calculate approximate SHAP values
                - "Exact"
                    Calculate exact SHAP values

        interaction_indices : list of int or string (feature_idx_1, feature_idx_2), optional (default=None)
            used only for ShapInteractionValues type
            Calculate SHAP Interaction Values between pair of features feature_idx_1 and feature_idx_2 for every object

        reference_data: catboost.Pool or None
            Reference data for Independent Tree SHAP values from https://arxiv.org/abs/1905.04610v1
            if type == 'ShapValues' and reference_data is not None, then Independent Tree SHAP values are calculated

        sage_n_samples: int, optional (default=32)
            Number of outer samples used in SAGE values approximation algorithm
        sage_batch_size: int, optional (default=min(512, number of samples in dataset))
            Number of samples used on each step of SAGE values approximation algorithm
        sage_detect_convergence: bool, optional (default=False)
            If set True, sage values calculation will be stopped either when sage values converge
            or when sage_n_samples iterations of algorithm pass

        log_cout: output stream or callback for logging (default=None)
            If None is specified, sys.stdout is used

        log_cerr: error stream or callback for logging (default=None)
            If None is specified, sys.stderr is used

        Returns
        -------
        depends on type:
            - FeatureImportance
                See PredictionValuesChange for non-ranking metrics and LossFunctionChange for ranking metrics.
            - PredictionValuesChange, LossFunctionChange, PredictionDiff, SageValues with prettified=False (default)
                list of length [n_features] with feature_importance values (float) for feature
            - PredictionValuesChange, LossFunctionChange, PredictionDiff, SageValues with prettified=True
                list of length [n_features] with (feature_id (string), feature_importance (float)) pairs, sorted by feature_importance in descending order
            - ShapValues
                np.ndarray of shape (n_objects, n_features + 1) with Shap values (float) for (object, feature).
                In case of multiclass the returned value is np.ndarray of shape
                (n_objects, classes_count, n_features + 1). For each object it contains Shap values (float).
                Values are calculated for RawFormulaVal predictions.
            - ShapInteractionValues
                np.ndarray of shape (n_objects, n_features + 1, n_features + 1) with Shap interaction values (float) for (object, feature(i), feature(j)).
                In case of multiclass the returned value is np.ndarray of shape
                (n_objects, classes_count, n_features + 1, n_features + 1). For each object it contains Shap interaction values (float).
                Values are calculated for RawFormulaVal predictions.
            - Interaction
                list of length [n_features] of 3-element lists of (first_feature_index, second_feature_index, interaction_score (float))
        verbose should be bool or int.r   verbose should be non-negative.Nr  r   z){} requires a pair of documents, found {}r  zModel has no meta information needed to calculate feature importances.                             Pass training dataset to this function.zoFeature importance type {} requires training dataset                             to be passed to this function.zdata is empty.r  r  r   T)rz   reversez
Feature IdImportances)r  )zFirst Feature IndexzSecond Feature Indexr   )$re   rg   rl   rn   rD   r   r   r   r  r  r  r   r   r   r  r  r   rD  _typeofr  r   r  r   rv   r  r   r  sortedro  r   r   r   r  r   rk   r   )!r3   rX  r   
prettifiedrk  r   	fstr_typer  r  r  r  r  r  r   r  rW   rX   r  r  need_meta_infoempty_data_is_okfstrr   rv   feature_importancesattribute_namer  doc	dimensionrA  feature1feature2rows!                                    r   r  zCatBoost.get_feature_importance  s   @ x*gt,Z5M#$DEE'lG{#$EFF$ (D9Dy222||;;=/5$77D$;;Dy///::4AY[gha<<>Q&'(S(Z(Z[_aeamamao(pqq#JtT,B'(V(](]^efj^k(lmm!Y%E%EEN-[$,,2Y2Y2[|_cgpg|g|_|#<%+EF F ,<<BF4LJ J >>'(8992=.QWWN"&//$nl\ceqs|  R2@.Racz#|D-	88):V:V!00)2F2FH H=A&BTEuQxT#&B!%9;;;%@N9777%9N!&+ *0]DW1X^hij^kuy*z'+];G$%8'J} +*@ 88$78A +*B ---d1gaj+688Y]%_Y]RU &(XXHK/MHK9 02xx+496+4%996 07HK/M &NY]%_ `G +*L CGG$3#6#u#6$FG!(0Q +*T  "xx/U +*V 888d1gaj+688]a%c]aVY &(XXLO/QLO 02xx199;19X9; 0<LO/Q &R]a%c d[ +*` 88z~%@z~sv &(XXil/nil\e 02xxLU9WLU :<19C;19XC; :<LU9W 0Xil/n &oz~%@ Aa +*d ...HLM3s1v;CFSV<M\G$VW=m +*p 88F+q +*d /e +*\ 'C*96 /M %_ 7G9; /Q %cC; 9W /n %@ Ng +*s3  IX"V3.A0X(X>XWV=5	V8$>	V=
WX#X(	W1	W
:W?XX6?X5W W&	W$/	W8
W XX*W9W2W,*5	W'4>	W,*
W2W9	X/X'X-XX3X8V==WX
WXWW  X'W,*,W22W99XXc                    | j                         r%| j                  j                         st        d      t	        |t
              st	        |t              st        d      t        |      }|dk  rt        d      |	|	}t        j                  d       t        |
|      5  | j                  ||||||||      }ddd       |S # 1 sw Y   S xY w)a  
        This is the implementation of the LeafInfluence algorithm from the following paper:
        https://arxiv.org/pdf/1802.06640.pdf

        Parameters
        ----------
        pool : Pool
            The pool for which you want to evaluate the object importances.

        train_pool : Pool
            The pool on which the model has been trained.

        top_size : int (default=-1)
            Method returns the result of the top_size most important train objects.
            If -1, then the top size is not limited.

        type : string, optional (default='Average')
            Possible values:
                - Average (Method returns the mean train objects scores for all input objects)
                - PerObject (Method returns the train objects scores for every input object)

        importance_values_sign : string, optional (default='All')
            Method returns only Positive, Negative or All values.
            Possible values:
                - Positive
                - Negative
                - All

        update_method : string, optional (default='SinglePoint')
            Possible values:
                - SinglePoint
                - TopKLeaves (It is posible to set top size : TopKLeaves:top=2)
                - AllPoints
            Description of the update set methods are given in section 3.1.3 of the paper.

        thread_count : int, optional (default=-1)
            Number of threads.
            If -1, then the number of threads is set to the number of CPU cores.

        verbose : bool or int
            If False, then evaluation is not logged. If True, then each possible iteration is logged.
            If a positive integer, then it stands for the size of batch N. After processing each batch, print progress
            and remaining time.

        ostr_type : string, deprecated, use type instead

        log_cout: output stream or callback for logging (default=None)
            If None is specified, sys.stdout is used

        log_cerr: error stream or callback for logging (default=None)
            If None is specified, sys.stderr is used

        Returns
        -------
        object_importances : tuple of two arrays (indices and scores) of shape = [top_size]
        z:Object importance is not supported for non symmetric treesr%  r   r&  NzK'ostr_type' parameter will be deprecated soon, use 'type' parameter instead)r  r  _is_obliviousrD   rg   rl   rn   r  r  re   r  )r3   r  r  r  r   r  r  rk  r   r  rW   rX   rA  s                r   get_object_importancezCatBoost.get_object_importance  s    z >>DLL$>$>$@ \]]'4(GS1I @AAg,Q; ABB DMMghx*__Zx}Vlnz  }D  EF + +s   !CCc                 H    ||kD  rt        d      | j                  ||       y)af  
        Shrink the model.

        Parameters
        ----------
        ntree_end: int
            Leave the trees with indices from the interval [ntree_start, ntree_end) (zero-based indexing).
        ntree_start: int, optional (default=0)
            Leave the trees with indices from the interval [ntree_start, ntree_end) (zero-based indexing).
        z*ntree_start should be less than ntree_end.N)rD   r	  )r3   r  r  s      r   shrinkzCatBoost.shrink/  s(     " LMM+y1r   c                 $    | j                          y)z=
        Drop unused features information from model
        N)r  r@   s    r   drop_unused_featureszCatBoost.drop_unused_features>  s     	'')r   c           	         | j                         st        d      t        |t              s#t        dj	                  t        |                  |t        |t              sot        |t        |t              s| j                         ndt        |t              s| j                         ndt        |t              s| j                         nd      }| j                  ||||       y)a  
        Save the model to a file.

        Parameters
        ----------
        fname : string
            Output file name.
        format : string
            Possible values:
                * 'cbm' for catboost binary format,
                * 'coreml' to export into Apple CoreML format
                * 'onnx' to export into ONNX-ML format
                * 'pmml' to export into PMML format
                * 'cpp' to export as C++ code
                * 'python' to export as Python code.
        export_parameters : dict
            Parameters for CoreML export:
                * prediction_type : string - either 'probability' or 'raw'
                * coreml_description : string
                * coreml_model_version : string
                * coreml_model_author : string
                * coreml_model_license: string
            Parameters for PMML export:
                * pmml_copyright : string
                * pmml_description : string
                * pmml_model_version : string
        pool : catboost.Pool or list or numpy.ndarray or pandas.DataFrame or pandas.Series or catboost.FeaturesData
            Training pool.
        z^There is no trained model to use save_model(). Use fit() to train model. Then use this method.r  N)rX  rY  rZ  r[  )r  rD   rg   rs   r   r   rD  rO  r  r  r  r  )r3   r   r   r  r  s        r   
save_modelzCatBoost.save_modelD  s    < ~~  !A  B  B%, Y ` `aefkal mnnJtT$:DNtUaDbT::<hlFPQUWcFdd<<>jnPZ[_amPn4#F#F#Htx	D 	(94@r   c                     |du |du z   |du z   dk7  rt        d      || j                  ||       | S || j                  |       | S || j                  |       | S )z
        Load model from a file, stream or blob.

        Parameters
        ----------
        fname : string
            Input file name.
        Nr   z:Exactly one of fname/stream/blob arguments mustn't be None)rD   r  r   r  )r3   r   r   r!  blobs        r   r  zCatBoost.load_modelo  s     TMfn->!C \]]UF+
 	 ""6*  ""4(r   c                 L    | j                         }|i S |j                  |      S )a  
        Get param value from CatBoost model.

        Parameters
        ----------
        key : string
            The key to get param value from.

        Returns
        -------
        value :
            The param value of the key, returns None if param do not exist.
        )r  r   )r3   rz   r   s      r   r  zCatBoost.get_param  s(     ">Izz#r   c                 T    | j                   j                         }|rt        |      S |S )z
        Get all params from CatBoost model.

        Returns
        -------
        result : dict
            Dictionary of {param_key: param_value}.
        )r  r  r   )r3   deepr   s      r   r  zCatBoost.get_params  s+     ""'')F##Mr   c                 l    | j                         st        d      | j                  j                         S )ad  
        Get all params (specified by user and default params) that were set in training from CatBoost model.
        Full parameters documentation could be found here: https://catboost.ai/docs/concepts/python-reference_parameters-list.html

        Returns
        -------
        result : dict
            Dictionary of {param_key: param_value}.
        zbThere is no trained model to use get_all_params(). Use fit() to train model. Then use this method.)r  rD   r  _get_plain_paramsr@   s    r   get_all_paramszCatBoost.get_all_params  s6     ~~  !E  F  F||--//r   c                     t        |t              s#t        dj                  t	        |                  | j                  |       y)z
        Save the model borders to a file.

        Parameters
        ----------
        fname : string or pathlib.Path
            Output file name.
        r  N)rg   rs   rD   r   r   r)  r  s     r   save_borderszCatBoost.save_borders  s:     %, Y ` `aefkal mnn5!r   c                 "    | j                         S )zH
        Return map feature_index: borders for float features.

        )r+  r@   s    r   get_borderszCatBoost.get_borders  s    
   ""r   c                     | j                         rt        d      t        |      D ]  \  }}|| j                  |<    d| j                  v r-| j                  d   dk(  r| j                  j	                  d       | S )z
        Set parameters into CatBoost model.

        Parameters
        ----------
        **params : key=value format
            List of key=value paris. Example: model.set_params(iterations=500, thread_count=2).
        z(You can't change params of fitted model.rk  rJ   )r  rD   r   r  r`   )r3   r   rz   rv   s       r   
set_paramszCatBoost.set_params  sw     >> JKK#F+JC%*Dc" ,T...43D3D^3TXZ3Z!!.1r   c                      fd}d }	 ddl m}  j                         }	 j                  |dd      \  }}
g }i g|j                         z  } j                         }|D ]"  }t        |t              sP j                  | j                  vrt        d	j                  |             j                  j                  |      }n|} j                  |   }||	v sJ d
       |	|   }t!        |      dk(  r<|j"                  j%                  ddg      }||j'                  g  ||||            z  }|j"                  j%                  ddt)        t+        t!        |      dz               dj                  |d         gt-        |dd |dd       D cg c]  \  }}dj                  ||       c}}z   dj                  |d         gz   d      }g }t/        |j1                               D ]x  \  }} ||||||         \  }}|||   |<   |j3                  |j5                  |ddj                  |                   |j3                  |j5                  |g||   gd             z  ||||      }||j'                  ||      gz  }% |rt7        |       |rt9        |d|       ||fS # t        $ r.}t        j                  d       t        t        |            d}~ww xY wc c}}w )a*  
        To use this function, you should install plotly.

        data: numpy.ndarray or pandas.DataFrame or catboost.Pool
        features_to_change: list-like with int (for indices) or str (for names) elements
            Numerical features indices or names in `data` for which you want to vary prediction value.
        plot: bool
            Plot predictions.
        plot_file: str
            Output file for plot predictions.
        Returns
        -------
            List of list of predictions for all buckets for all samples in data
        c                    t        d|d   z  d      }t        d|d   z  d      }|g|z   |gz   }g }g }d }	t        j                  | |         r|dk(  rt	        |      nd}	t        t	        |      dz
        D ]P  }
|||
   ||
dz      z   dz  gz  }|	| |   ||
dz      k  r|
}	| |   }|d   | |<   |j                  |       gz  }|| |<   R |	t	        |      }	||	fS )Nr   r   rJ   r   AsTrueg       @)minmaxr   isnanr_   r  r  )r2  r  bordersnan_treatmentleft_extend_borderright_extend_borderextended_borderspointsr  
border_idxr  bufr3   s               r   r  z*CatBoost.plot_predictions.<locals>.predict  s-   !$Q^R!8"%a'"+oq"9 23g=AT@UUFKJxxK()-:h-FS\A
3/0145,Q/2B1q52IIROPP%#k*:=MaRSe=T*T!"J+&#)":K S 122#&K  6 ! \

**r   c                 P    | j                  dj                  |      dddd|      S )Nz%Prediction variation for feature '{}'
Predictionr  r   r   r   r   r   r   r   )Layoutr   )r/  r~  r   s      r   
get_layoutz-CatBoost.plot_predictions.<locals>.get_layout  s9    99=DDWM)""&
    r   r   Nr   vary_feature_value_and_applyrJ   r  No feature named "{}" in model)only float features indexes are supportedBinsr   tickvalsrX  r   r  r   (-inf, {:.4f}]({:.4f}, {:.4f}]({:.4f}, +inf)Fr   tickmoderg  ticktextr   r   zDocument {} predictions)r   r   r   )r   r   r  zPredictions for all buckets)r  r  r  r  r  r   r+  r  r  r-  rg   rn   rQ  rD   r   rw   r_   r   XAxisr   rh   r  ro  ri   get_featuresrV   r#  r    r   )r3   rX  features_to_changerA  rB  r  ra  r/  r0  model_bordersr  figsall_predictionsnan_treatmentsr~  r  rT  r   val_1val_2tracer5  r   r  rZ  r   s   `                         r   plot_predictionszCatBoost.plot_predictions  s    	+*			&*
 ))+2249Wfh2ia$/113)Ggs+&&.'ATAT2T'(H(O(OPW(XYY"1177@%--k:-/\1\\/#K0G7|q 		fsC		r*Re2T	UUIIOO eCL1$456*11'!*=>.1'#2,.LN.LleU -33E5A.LNN +11'"+>?@  % $ 	E E!*4+<+<+>!?X*1(KR`alRm*n'Z4?$[1JJ?IcIjIjknIoJp RZZ:,;z;R:S`eZfg "@  GU3FRYYE&Y9::DK *N T"9&CTJ$$o  	&MMDEc!f%%	&<Ns   J& K &	K/)KKc                     	 ddl m  fdfd}ddfd}fd}	 ||      }
|
D cg c]  } j                         |    }}t        |
      d	vr#t        d
j                  t        |
                  t        |
      dk(  } j                  |d|      \  }}t        j                   j                  j                  ||
|            }|r6|j                  |D cg c]  }t        |      dz    c}      } ||
||      }n |	|
d   |d   |      }|rt!        |       |rt#        |dj                  |      |       ||fS # t        $ r.}t        j                  d       t        t        |            d}~ww xY wc c}w c c}w )a  
        To use this function, you should install plotly.
        data: numpy.ndarray or pandas.DataFrame or catboost.Pool
        features: int, str, list<int>, tuple<int>, list<string>, tuple<string>
            Float features to calculate partial dependence for. Number of features should be 1 or 2.
        plot: bool
            Plot predictions.
        plot_file: str
            Output file for plot predictions.
        thread_count: int
            Number of threads to use. If -1 use maximum available number of threads.
        Returns
        -------
            If number of features is one - 1d numpy array and figure with line plot.
            If number of features is two - 2d numpy array and figure with 2d heatmap.
        r   Nr   c                 h   t        | t              sPj                  | j                  vrt        dj	                  |             j                  j                  |       }n| }|j                         v sJ d       t        j                         |         dkD  sJ dj	                  |             |S )Nrc  rd  r   z(feature with idx {} is not used in model)rg   rn   rQ  rD   r   rw   r+  r_   )r~  r  r3   s     r   getFeatureIdxz7CatBoost.plot_partial_dependence.<locals>.getFeatureIdxZ  s    gs+&&.'ATAT2T'(H(O(OPW(XYY"1177@%$"3"3"55b7bb5t((*;781<|>h>o>op{>||<r   c                     t        | t              st        | t              r| D cg c]
  } |       }}|S t        | t              st        | t              r |       g}|S t        d      c c}w )NzyUnsupported type for argument 'features'. Must be one of: int, string, list<string>, list<int>, tuple<int>, tuple<string>)rg   rh   rq   rn   r   rD   )r   r~  features_idxsr|  s      r   getFeatureIndicesz;CatBoost.plot_partial_dependence.<locals>.getFeatureIndicese  s~    (D)Z%-HGO PxGw!7x P
 ! 	 Hc*j3.G!.x!8 9 !  $  %b  c  c	 !Qs   A/c                 >   |dndj                  |      dt        t        t        |       dz               dj                  | d         gt	        | d d | dd        D cg c]  \  }}dj                  ||       c}}z   d	j                  | d         gz   d
dS c c}}w )Nre  zBins of feature '{}'r  r   ri  r   rJ   rj  rk  Frl  )r   rh   r  r_   ro  )rT  feature_namerv  rw  s       r   getAxisParamsz7CatBoost.plot_partial_dependence.<locals>.getAxisParamsn  s    #/#7=U=\=\]i=j# s7|a'7!89-44WQZ@A14WSb\712;1OQ1O 066ueD1OQQ .44WR[ABC #() )
Qs   Bc                     j                   j                  d
i  |d   | d         } j                   j                  d
i  |d   | d         }j                  dj	                  dj	                  dj                  t        t        |                         ||      }j                  j                  |      |	      }|S )Nr   )r  r   z'Partial dependence plot for features {}z'{}'z', 'r_  )zrh  r   )
r   ro  YAxisr`  r   r/  r  r   r   Heatmap)	r   rT  r  r   r   r   r3  r  r/  s	          r   plot2dz0CatBoost.plot_partial_dependence.<locals>.plot2dy  s    #BIIOO_mGAJ][\M]&^_E#BIIOO_mGAJ][\M]&^_EYY?FFxW_WdWdehiln{e|W}G~  F
 ))k!:6)JCJr   c                      j                   j                  d	i  |      }ddd}j                  dj                  |       ||      }j	                  j                  |d      |      }|S )
NzMean Predictionr  )r   r   z(Partial dependence plot for feature '{}'r_  r   )r   r   rh  r   )r   ro  r`  r   r   r#  )	r~  rT  r  r   r   r   r3  r  r/  s	          r   plot1dz0CatBoost.plot_partial_dependence.<locals>.plot1d  s|    #BIIOO=mG&<=E*E YY@GGP  F
 ))k!PY_)`CJr   )r   r   z-Number of 'features' should be 1 or 2, got {}r   plot_partial_dependencer  r   z)Partial dependence plot for features '{}'r(   )r  r  r  r  r  r   r+  r_   rD   r   r  r   r  r  _calc_partial_dependencer  r    r   )r3   rX  r   rA  rB  rk  r0  r  r  r  features_idxr5  rT  
is_2d_plotr  rt  r   r3  r  r|  r/  s   `                 @@@r   r  z CatBoost.plot_partial_dependenceB  s   $	&*
			!		)			 )27CD|4$$&s+|D|F* Q X XY\]iYj kll&!+
2249Ram2na((4<<#H#H|]i#jk-5576S7as1vz76STOwACa'!*oFCS!9&Q&X&XYa&bdgh##c  	&MMDEc!f%%	&z E 7Ts"   E
 F$F	
	F)E<<Fc
           
      `
   |du }
t        |t              sd|i}t        |t              sd|i}|
st        |      t        |      k(  sJ d       |
s(|j                         |j                         k(  sJ d       |j                         D ]/  }| j	                  ||   d||j                  |d            \  ||<   }1 t        |j                               t        |j                               }}|| j                  d      dv rdnd	}|d
vrt        dj                  |            || j                  }|i }n/t        |t              st        |t              rt        d      ||i}t        |t              st        |t              r|g}d}n|}d}g }g }g }g }i }|D ]  }t        |t              sP| j                  || j                  vrt        dj                  |            | j                  j                  |      }n|}| j                  |   }||v rz||v r||   ||<   |j                  |       |||<   | j                   j#                  |      \  }}|dk(  r#|j                  |       |j                  d       |j                  |       |j                  d        |D cg c]!  }| j                   j%                  |||||      # }}t'        t              }t        |      }d|}}t)        |      D ]z  \  }}||   } ||    }|dk(  rA| |vr/| j                   j+                  |d   |      }!|!D "cg c]  }"|" }!}"n||    }!t        |!t,              st        dj                  |            t               }#|!D ]$  }"| j                   j/                  |"||         |#|"<   & |#j1                         D "$ci c]  \  }"}$|$|"
 }%}"}$t)        |      D ]t  \  }}&t3        j4                  t7        |%j                               D cg c]  }|%|   	 c}      |&|   d<   |&|   j9                  dd       ||   j                  |&|          v |dz  }X|D ]  }&||   j                  |&|           |dz  }} |s|	5t;        ||| j                  |      }'|rt=        |'       |	t?        |	d|'g       |j                         D ]  }t        ||         dk(  s||   d   ||<   ! |r|||d         S i }(|D ]&  }t        |t              r	||   |(|<   |||      |(|<   ( |(S c c}w c c}"w c c}$}"w c c}w )a  
        Get statistics for the feature using the model, dataset and target.
        To use this function, you should install plotly.

        The catboost model has borders for the float features used in it. The borders divide
        feature values into bins, and the model's prediction depends on the number of the bin where the
        feature value falls in.

        For float features this function takes model's borders and computes
        1) Mean target value for every bin;
        2) Mean model prediction for every bin;
        3) The number of objects in dataset which fall into each bin;
        4) Predictions on varying feature. For every object, varies the feature value
        so that it falls into bin #0, bin #1, ... and counts model predictions.
        Then counts average prediction for each bin.

        For categorical features (only one-hot supported) does the same, but takes feature values
        provided in cat_feature_values instead of borders.

        Parameters
        ----------
        data: numpy.ndarray or pandas.DataFrame or catboost. Pool or dict {'pool_name': pool} if you want several pools
            Data to compute statistics on
        target: numpy.ndarray or pandas.Series or dict {'pool_name': target} if you want several pools or None
            Target corresponding to data
            Use only if data is not catboost.Pool.
        feature: None, int, string, or list of int or strings
            Features indexes or names in pd.DataFrame for which you want to get statistics.
            None, if you need statistics for all features.
        prediction_type: str
            Prediction type used for counting mean_prediction: 'Class', 'Probability' or 'RawFormulaVal'.
            If not specified, is derived from the model.
        cat_feature_values: list or numpy.ndarray or pandas.Series or
                            dict: int or string to list or numpy.ndarray or pandas.Series
            Contains categorical feature values you need to get statistics on.
            Use dict, when parameter 'feature' is a list to specify cat values for different features.
            When parameter 'feature' is int or str, you can just pass list of cat values.
        plot: bool
            Plot statistics.
        max_cat_features_on_plot: int
            If categorical feature takes more than max_cat_features_on_plot different unique values,
            output result on several plots, not more than max_cat_features_on_plot feature values on each.
            Used only if plot=True or plot_file is not None.
        thread_count: int
            Number of threads to use for getting statistics.
        plot_file: str
            Output file for plot statistics.

        Returns
        -------
        dict if parameter 'feature' is int or string, else dict of dicts:
            For each unique feature contain
            python dict with binarized feature statistics.
            For float feature, includes
                    'borders' -- borders for the specified feature in model
                    'binarized_feature' -- numbers of bins where feature values fall
                    'mean_target' -- mean value of target over each bin
                    'mean_prediction' -- mean value of model prediction over each bin
                    'objects_per_bin' -- number of objects per bin
                    'predictions_on_varying_feature' -- averaged over dataset predictions for
                    varying feature (see above)
            For one-hot feature, returns the same, but with 'cat_values' instead of 'borders'
        Nr   z$inconsistent size of data and targetz*inconsistent pool_names of data and targetget_binarized_statisticsrc  )CrossEntropyre  ProbabilityRawFormulaVal)Classr  r  ExponentzUnknown prediction type "{}"z9cat_feature_values should be dict when features is a listTFrc  r  catrp   r   zfFeature '{}' is categorical. Please provide values for which you need statistics in cat_feature_values
cat_valuesrT  r   zCatboost metrics graph) rg   r'  r_   rU  r  r   rh   r  r  rD   r   rQ  r   rn   rw   rV   r  $_get_feature_type_and_internal_index_get_binarized_statisticsr   ri   _get_cat_feature_valuesrk   _calc_cat_feature_perfect_hashr  r   r  r*  r`   _plot_feature_statisticsr    r   ))r3   rX  targetr~  r  cat_feature_valuesrA  max_cat_features_on_plotrk  rB  target_is_nonerz   r  
pool_namesr   is_for_one_featurecat_features_numsfloat_features_numsfeature_type_mapperr   feature_name_to_numfeature_numfeature_typefeature_internal_index	data_itemresultsstatistics_by_featureto_float_offset	cat_indexfloat_indexr  r   r  cat_feature_values_rq  val_to_hashhashhash_to_valresr3  return_statss)                                            r   calc_feature_statisticsz CatBoost.calc_feature_statistics  s   D  4$%:D&$'&\FTc&k!9a;aa9$))+!=k?kk=99;C;;DIGacoqwq{q{|  BF  rG  HLDIq .TYY[0Aj"/3~~o/NRm/mm$  "WW > E Eo VWW?))G%!#0$7gt,'(cdd*13E)F&gs#z'3'?yH!%H!&   Ggs+&&.'ATAT2T'(H(O(OPW(XYY"1177@%--k:-'00.@.M"7+  )+6(37<<3d3dep3q0L0},!(()?@#**51#**+AB#**73)  6  !  i <<99
   	 ! !,D 1/0!"O;	 !45GAt(+L-l;Ku}'99*.,,*N*NtTUwXc*d':M*N:M33:M'*N*<\*J'!"5{C'd 
 #f.C'+||'R'RSVXijsXt'uK$ /:E:K:K:MN:MYS$tSy:MN'0FAs3588U[\g\l\l\nUo<pUoPQ[^Uo<p3qC	N<0	N&&y$7)+6==c)nM 1 Q	"C)+6==c+>NO #q 7 6> 9(*%##(	*C
  %$y*BSEJ(--/C(-.!3-B3-G-J%c* 0 ()<]1=M)NOO G'3'(=g(FW%(=>QRY>Z([W%	   O!$ +O O<ps   %&T4	T 5T% T+c                    ddl m}  |       }d}d}t        t        |      dz
  dd      D ]  }t        |      D ]  }|dk\  r,||   j	                  ddd      j	                  ddd      }	d	}
d
}n	||   }	d}
d}	 |	j                  d      }	|j                  t        |      |	|
|       |dkD  r9|dz
  dz  }|dz  dk(  rdnd}|j                  t        |      t        |      |       |dz  } |dz  } |S # t        $ r Y ww xY w)Nr   Digraphr   r]   rJ   bin=value>border=blackellipseredrectutf-8r   r  r   YesNo)
graphvizr  r  r_   replacedecoder   noder   edge)r3   splitsleaf_valuesr  r_  
layer_sizecurrent_size	split_numnode_num
node_labelr   r  parent
edge_labels                 r   _plot_oblivious_treezCatBoost._plot_oblivious_trees  s3   $	
s6{QB7I!*->!'	!2!:!:68Q!O!W!WXackmn!oJ#E%E!,X!6J!E"E!+!2!27!;J 

3|,jU
S!#*Q.14F*6*:a*?TJJJs6{C,=zJ!- .0 !OJ3 86  ! s   7C44	D ?D c                 Z    ddl m}  |       }fdfd d|       |S )Nr   r  c                 j    dj                  |          }|       }|j                  ||dd       |S )Nzleaf_{}r  r  r  )r   r  )node_idxr_  cur_idr  r  node_to_leafs       r   	plot_leafz3CatBoost._plot_nonsymmetric_tree.<locals>.plot_leaf  s@    %%l8&<=F$\(%;<JJJvzfJEMr   c                    |    dk(  r	 | |      S dj                  |       }|    j                  ddd      j                  ddd      }|j                  ||dd	       |    d
   d
k(  r
 | |      }n | |    d
   z   |      }|j                  ||d       |    d   d
k(  r
 | |      }n | |    d   z   |      }|j                  ||d       |S )Nr   r   znode_{}r  r  r   r  r  r  r  r   r  r  )r   r  r  r  )	r  r_  r  r  child_idr  plot_subtreer  
step_nodess	        r   r  z6CatBoost._plot_nonsymmetric_tree.<locals>.plot_subtree  s   (#v- 511"))(3#H-55fhJRRS\^fhij


6:WI
Nh'*a/(59H+Hz(7KA7N,NPUVH

68T2h'*a/(59H+Hz(7KA7N,NPUVH

68U3Mr   )r  r  )	r3   r  r  r  r  r  r_  r  r  s	    ````  @@r   _plot_nonsymmetric_treez CatBoost._plot_nonsymmetric_tree  s(    $			* 	Qr   c                 B   || j                  |dd      nd\  }}| j                  ||      }| j                  |      }| j                  j	                         r| j                  ||      S | j                  |      }| j                  |      }| j                  ||||      S )N	plot_treerJ   r  ra   )	r  r\  r_  r  r8  r  rb  rd  r  )r3   r]  r  r  r  r  r  r  s           r   r  zCatBoost.plot_tree  s    Z^Zj$224SU2Vp|a&&x600:<<%%',,V[AA228<J66x@L//ZQ]^^r   c                    |	r| j                         rt        d      t        ||      5  h d}t        |t              r|g}t        |      D ]@  \  }}t        |       t        |      }|D ]   }||v st        dj                  |             B |t        d      |#t        |t        t        fz         st        d      t        |t        t        f      st        dj                  |            | j                  ||      }|d   }d }d	}t        |t              r4|}|j                  d
d       }|Et        |t               xr t#        |      }n't%        |d      st%        |d      st'        d      |}d}
|/|j                  d
d       }t        |t               xr t#        |      }t)        ||dt+        |      g      5  | j,                  j/                  ||d   |||||
||||||      }d d d        |	r;| j                         rJ  | j0                  di d    | j3                  ||d       d d d        S # 1 sw Y   PxY w# 1 sw Y   S xY w)NzaModel was fitted before hyperparameters tuning. You can't change hyperparameters of fitted model.>   ro  r   rc  r   zBParameter '{}' currently is not supported in hyperparaneter searchr  zNy may be None only when X is an instance of catboost.Pool, str or pathlib.Pathz-Parameter grid is not a dict or a list ({!r}))r+  r   r   r   rc  __iter__splitaM  cv should be one of possible things:
- None, to use the default 3-fold cross validation,
- integer, to specify the number of folds in a (Stratified)KFold
- one of the scikit-learn splitter classes (https://scikit-learn.org/stable/modules/classes.html#splitter-classes)
- An iterable yielding (train, test) splits as arrays of indicesFzHyperparameters search plotr  T)r   r   )r  rD   re   rg   r   ri   r`  r  r   rs   rD  r   	TypeErrorr  rm   r   rt   is_cv_stratified_objectiverC   AttributeErrorrD  r7  r  _tune_hyperparamsrM  r  )r3   
param_gridr+  r   cvn_iterpartition_random_seedcalc_cv_statisticssearch_by_train_test_splitrefitshuffle
stratified
train_sizer   rA  rB  rW   rX   currently_not_supported_paramsgrid_numgridparamr  r   custom_folds
fold_countrc  	cv_results                               r   r  zCatBoost._tune_hyperparams  sv   
 T^^%  !D  E  Ex*.* *g.(\
"+J"7$(.(.;E}+,p,w,wx},~ <	 #8 y#$899yAzTG/C!D#$tuuj7H*=> O V VWa bcc55Q5?L!(+FLJ"m,
 &

?D A%!+M<!H!vMghuMvJr:.wr77K(]   "! &

?D A'|DrIcdqIr
dI/L~^dOeNfg LL::\ :FF 5w
J.0BLRY	 h >>+++6)H"56Ad+A +B  hgm +B s,   AI3EI+H<-AI<I	IIc                 >   t        |t              r|g}|D ]d  }t        |t              st        dj                  |            |D ]3  }t        ||   t              rt        dj                  |||                f | j                  ||||d|||||	|
||||||      S )aW  
        Exhaustive search over specified parameter values for a model.
        After calling this method model is fitted and can be used, if not specified otherwise (refit=False).

        Parameters
        ----------
        param_grid: dict or list of dictionaries
            Dictionary with parameters names (string) as keys and lists of parameter settings
            to try as values, or a list of such dictionaries, in which case the grids spanned by each
            dictionary in the list are explored.
            This enables searching over any sequence of parameter settings.

        X: numpy.ndarray or pandas.DataFrame or catboost.Pool
            Data to compute statistics on

        y: list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None)
            Labels of the training data.
            If not None, can be a single- or two- dimensional array with either:
              - numerical values - for regression (including multiregression), ranking and binary classification problems
              - class labels (boolean, integer or string) - for classification (including multiclassification) problems
            Use only if X is not catboost.Pool and does not point to a file.

        cv: int, cross-validation generator or an iterable, optional (default=None)
            Determines the cross-validation splitting strategy. Possible inputs for cv are:
            - None, to use the default 3-fold cross validation,
            - integer, to specify the number of folds in a (Stratified)KFold
            - one of the scikit-learn splitter classes
                (https://scikit-learn.org/stable/modules/classes.html#splitter-classes)
            - An iterable yielding (train, test) splits as arrays of indices.

        partition_random_seed: int, optional (default=0)
            Use this as the seed value for random permutation of the data.
            Permutation is performed before splitting the data for cross validation.
            Each seed generates unique data splits.
            Used only when cv is None or int.

        search_by_train_test_split: bool, optional (default=True)
            If True, source dataset is splitted into train and test parts, models are trained
            on the train part and parameters are compared by loss function score on the test part.
            After that, if calc_cv_statistics=true, statistics on metrics are calculated
            using cross-validation using best parameters and the model is fitted with these parameters.

            If False, every iteration of grid search evaluates results on cross-validation.
            It is recommended to set parameter to True for large datasets, and to False for small datasets.

        calc_cv_statistics: bool, optional (default=True)
            The parameter determines whether quality should be estimated.
            using cross-validation with the found best parameters. Used only when search_by_train_test_split=True.

        refit: bool (default=True)
            Refit an estimator using the best found parameters on the whole dataset.

        shuffle: bool, optional (default=True)
            Shuffle the dataset objects before parameters searching.

        stratified: bool, optional (default=None)
            Perform stratified sampling. True for classification and False otherwise.
            Currently supported only for final cross-validation.

        train_size: float, optional (default=0.8)
            Should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the train split.

        verbose: bool or int, optional (default=True)
            If verbose is int, it determines the frequency of writing metrics to output
            verbose==True is equal to verbose==1
            When verbose==False, there is no messages

        plot : bool, optional (default=False)
            If True, draw train and eval error for every set of parameters in Jupyter notebook

        plot_file : file-like or str, optional (default=None)
            If not None, save train and eval error for every set of parameters to file

        log_cout: output stream or callback for logging (default=None)
            If None is specified, sys.stdout is used

        log_cerr: error stream or callback for logging (default=None)
            If None is specified, sys.stderr is used

        Returns
        -------
        dict with two fields:
            'params': dict of best found parameters
            'cv_results': dict or pandas.core.frame.DataFrame with cross-validation results
                columns are: test-error-mean  test-error-std  train-error-mean  train-error-std
        z#Parameter grid is not a dict ({!r})z;Parameter grid value is not iterable (key={!r}, value={!r})rJ   r  r+  r   r  r  r  r  r  r  r  r  r  r   rA  rB  rW   rX   )rg   r   r  r   r   r  )r3   r  r+  r   r  r  r  r  r  r  r  r  r   rA  rB  rW   rX   r  rz   s                      r   grid_searchzCatBoost.grid_search  s    t j'*$JDdG, E L LT RSS!$s)X6#$a$h$hilnrsvnw$xyy   %%!Q!2"7L^'AX_!j'PT`i & 
 	
r   c                 4   |dk  rt        d      sJ t        |t              st        d      sJ |D ]C  }t        ||   t              rt	        ||   d      r't        dj                  |||                | j                  |||||||||	|
|||||||      S )a  
        Randomized search on hyper parameters.
        After calling this method model is fitted and can be used, if not specified otherwise (refit=False).

        In contrast to grid_search, not all parameter values are tried out,
        but rather a fixed number of parameter settings is sampled from the specified distributions.
        The number of parameter settings that are tried is given by n_iter.

        Parameters
        ----------
        param_distributions: dict
            Dictionary with parameters names (string) as keys and distributions or lists of parameters to try.
            Distributions must provide a rvs method for sampling (such as those from scipy.stats.distributions).
            If a list is given, it is sampled uniformly.

        X: numpy.ndarray or pandas.DataFrame or catboost.Pool
            Data to compute statistics on

        y: list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None)
            Labels of the training data.
            If not None, can be a single- or two- dimensional array with either:
              - numerical values - for regression (including multiregression), ranking and binary classification problems
              - class labels (boolean, integer or string) - for classification (including multiclassification) problems
            Use only if X is not catboost.Pool and does not point to a file.

        cv: int, cross-validation generator or an iterable, optional (default=None)
            Determines the cross-validation splitting strategy. Possible inputs for cv are:
            - None, to use the default 3-fold cross validation,
            - integer, to specify the number of folds in a (Stratified)KFold
            - one of the scikit-learn splitter classes
                (https://scikit-learn.org/stable/modules/classes.html#splitter-classes)
            - An iterable yielding (train, test) splits as arrays of indices.

        n_iter: int
            Number of parameter settings that are sampled.
            n_iter trades off runtime vs quality of the solution.

        partition_random_seed: int, optional (default=0)
            Use this as the seed value for random permutation of the data.
            Permutation is performed before splitting the data for cross validation.
            Each seed generates unique data splits.
            Used only when cv is None or int.

        search_by_train_test_split: bool, optional (default=True)
            If True, source dataset is split into train and test parts, models are trained
            on the train part and parameters are compared by loss function score on the test part.
            After that, if calc_cv_statistics=true, statistics on metrics are calculated
            using cross-validation using best parameters and the model is fitted with these parameters.

            If False, every iteration of grid search evaluates results on cross-validation.
            It is recommended to set parameter to True for large datasets, and to False for small datasets.

        calc_cv_statistics: bool, optional (default=True)
            The parameter determines whether quality should be estimated
            using cross-validation with the found best parameters. Used only when search_by_train_test_split=True.

        refit: bool (default=True)
            Refit an estimator using the best found parameters on the whole dataset.

        shuffle: bool, optional (default=True)
            Shuffle the dataset objects before parameters searching.

        stratified: bool, optional (default=None)
            Perform stratified sampling. True for classification and False otherwise.
            Currently supported only for cross-validation.

        train_size: float, optional (default=0.8)
            Should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the train split.

        verbose: bool or int, optional (default=True)
            If verbose is int, it determines the frequency of writing metrics to output
            verbose==True is equal to verbose==1
            When verbose==False, there is no messages

        plot : bool, optional (default=False)
            If True, draw train and eval error for every set of parameters in Jupyter notebook

        plot_file : file-like or str, optional (default=None)
            If not None, save train and eval error for every set of parameters to file

        log_cout: output stream or callback for logging (default=None)
            If None is specified, sys.stdout is used

        log_cerr: error stream or callback for logging (default=None)
            If None is specified, sys.stderr is used

        Returns
        -------
        dict with two fields:
            'params': dict of best found parameters
            'cv_results': dict or pandas.core.frame.DataFrame with cross-validation results
                columns are: test-error-mean  test-error-std  train-error-mean  train-error-std
        r   z"n_iter should be a positive numberz*param_distributions should be a dictionaryrvszXParameter grid value is not iterable and do not have 'rvs' method (key={!r}, value={!r})r  )rD   rg   r   r   rC   r  r   r  )r3   param_distributionsr+  r   r  r  r  r  r  r  r  r  r  r   rA  rB  rW   rX   rz   s                      r   randomized_searchzCatBoost.randomized_search{  s    B Q; !EFFF-w7 !MNNN&C1#6A'RefiRjlqJr |  !D  !D  EH  J]  ^a  Jb  !c  d  d ' %%*a1F"7L^'AX_!j'PT`i & 
 	
r   c           
      Z   |	r| j                         rt        d      |t        d      |#t        |t        t        fz         st        d      t        ||      5  | j                  ||||
|      }|d   }|t        j                  }nt        t        |      j                  }||d<   |t        j                  k(  r~t        |t              r/t        |t              sdj                  t        t        |            }|t        d	      |t        d
      |t        d      |t        d      ||d<   ||d<   nY|t        d      t        |t               st        d      |t        d      |t        d      |t        d      ||d<   ||d<   |j#                  d      }|duxr t        |t$               }|rt        d      |t        t&        |      j                  |d<   |||d<   |t        t(        |      j                  |d<   |	rd|d<   |d   }d}t+        |d         dkD  rt        d       t+        |d         dk(  r|d   d!   }t-        | j/                               }t1        |       g }t3        |xs d      D ]@  }|j5                  t6        j8                  j                  |d"j;                  |                   B |	r/|j5                  t6        j8                  j                  |d#             |D ]  }t1        |        t=        ||d$|%      5  | j>                  jA                  |||      }ddd       |	r| jC                          |r5tE              }|d&   jG                          d'|v r|d'   jG                          ddd       S # 1 sw Y   \xY w# 1 sw Y   S xY w)(a  
        Select best features from pool according to loss value.

        Parameters
        ----------
        X : catboost.Pool or list or numpy.ndarray or pandas.DataFrame or pandas.Series
            If not catboost.Pool, 2 dimensional Feature matrix or string - file with dataset.

        y : list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None)
            Labels of the training data.
            If not None, can be a single- or two- dimensional array with either:
              - numerical values - for regression (including multiregression), ranking and binary classification problems
              - class labels (boolean, integer or string) - for classification (including multiclassification) problems
            Use only if X is not catboost.Pool and does not point to a file.

        eval_set : catboost.Pool or list of catboost.Pool or tuple (X, y) or list [(X, y)], optional (default=None)
            Validation dataset or datasets for metrics calculation and possibly early stopping.

        features_for_select : str or list of feature indices, names or ranges
            (for grouping = Individual)
            Which features should participate in the selection.
            Format examples:
                - [0, 2, 3, 4, 17]
                - [0, "2-4", 17] (both ends in ranges are inclusive)
                - "0,2-4,20"
                - ["Name0", "Name2", "Name3", "Name4", "Name20"]

        num_features_to_select : positive int
            (for grouping = Individual)
            How many features to select from features_for_select.

        algorithm : EFeaturesSelectionAlgorithm or string, optional (default=RecursiveByShapValues)
            Which algorithm to use for features selection.
            Possible values:
                - RecursiveByPredictionValuesChange
                    Use prediction values change as feature strength, eliminate batch of features at once.
                - RecursiveByLossFunctionChange
                    Use loss function change as feature strength, eliminate batch of features at each step.
                - RecursiveByShapValues
                    Use shap values to estimate loss function change, eliminate features one by one.

        steps : positive int, optional (default=1)
            How many steps should be performed. In other words, how many times a full model will be trained.
            More steps give more accurate results.

        shap_calc_type : EShapCalcType or string, optional (default=Regular)
            Which method to use for calculation of shap values.
            Possible values:
                - Regular
                    Calculate regular SHAP values
                - Approximate
                    Calculate approximate SHAP values
                - Exact
                    Calculate exact SHAP values

        train_final_model : bool, optional (default=True)
            Need to fit model with selected features.

        verbose : bool or int
            If verbose is bool, then if set to True, logging_level is set to Verbose,
            if set to False, logging_level is set to Silent.
            If verbose is int, it determines the frequency of writing metrics to output and
            logging_level is set to Verbose.

        logging_level : string, optional (default=None)
            Possible values:
                - 'Silent'
                - 'Verbose'
                - 'Info'
                - 'Debug'

        plot : bool, optional (default=False)
            If True, draw train and eval error in Jupyter notebook.

        plot_file : file-like or str, optional (default=None)
            If not None, save train and eval error graphs to file

        log_cout: output stream or callback for logging (default=None)
            If None is specified, sys.stdout is used

        log_cerr: error stream or callback for logging (default=None)
            If None is specified, sys.stderr is used

        grouping : EFeaturesSelectionGrouping or string, optional (default=Individual)
            Which grouping to use for features selection.
            Possible values:
                - Individual
                    Select individual features
                - ByTags
                    Select feature groups (marked by tags)

        features_tags_for_select : list of strings
            (for grouping = ByTags)
            Which features tags should participate in the selection.

        num_features_tags_to_select : positive int
            (for grouping = ByTags)
            How many features tags to select from features_tags_for_select.

        Returns
        -------
        dict with fields:
            'selected_features': list of selected features indices
            'eliminated_features': list of eliminated features indices
            'selected_features_tags': list of selected features tags (optional, present if grouping == ByTags)
            'eliminated_features_tags': list of selected features tags (optional, present if grouping == ByTags)
        zQModel was already fitted. Set train_final_model to False or use not fitted model.Nr  zOy may be None only when X is an instance of catboost.Pool, str or pathlib.Path.)r+  r   r  r   r   r   features_selection_grouping,z&You should specify features_for_selectzKYou should not specify features_tags_for_select when grouping is Individualz)You should specify num_features_to_selectzNYou should not specify num_features_tags_to_select when grouping is Individualfeatures_for_selectnum_features_to_selectz+You should specify features_tags_for_selectz2features_tags_for_select must be a list of stringszBYou should not specify features_for_select when grouping is ByTagsz.You should specify num_features_tags_to_selectzEYou should not specify num_features_to_select when grouping is ByTagsfeatures_tags_for_selectnum_features_tags_to_selectrc  z8Custom objective is not supported for features selectionfeatures_selection_algorithmfeatures_selection_stepsr  Ttrain_final_modelr  r  r   z;Multiple eval sets are not supported for features selectionr   zmodel-{}zmodel-finalzSelect features plotr  r   r=  )$r  rD   rg   rs   rD  re   r  r   r   r   rv   r   rt   r/  r  r   r   r   r   r   r   r_   r7  r  r-   r  rV   r)   r*   r   rD  r  _select_featuresr  rB  show)r3   r+  r   r  r  r  	algorithmstepsr  r	  r   r   rA  rB  rW   rX   groupingr  r  r  r   rb  is_custom_objectiver  r  r1  	plot_dirsstepplot_dirr@  figuress                                  r   select_featureszCatBoost.select_features  s   ^ !1 stt9 4559Z:+?@ qrrx*55Q[br5  AL!(+F5@@01KXV\\8@455@@@18<ZPceqEr*-((3s<O3P*Q'&.'(PQQ+7'(uvv)1'(STT.:'(xyy0C,-3I/0+3'(UVV!":HE'(\]]&2'(lmm.6'(XYY)5'(opp5M128S45

?3I"+4"7"c
9Vb@c<c"#$^__$9NOjlu9v9|9|56 5:12)+@P^+_+e+e'( .2*+%l3JI<,-1#$abb\+./14(5a8	&t'89I#I.Iejq)  i9J9J49P!QR *   i!GH%'1 & diDZgpq,,77
IvV r !224=gF
#((*"g-O,113] +`  rqI +` s&   J?N N9AN N	N  N*c                 8    | j                   j                          y r(   )r   _convert_oblivious_to_asymmetricr@   s    r   %_convert_to_asymmetric_representationz.CatBoost._convert_to_asymmetric_representation  s    557r   r(   )NNNNNNNNNNNNNNNNNNNNNNNNNNNNNra   NNNNNNNNNNNNNNNNFNNNNNNNNNNNNN))r  r  r  LogProbabilityr  r  )CPU)r  r   r   rJ   Nr  )r  r   r   rJ   Nr  r   r   r   rJ   Nr  )r   r   rJ   F)	r   r   r   rJ   NFNNN)r   r   r   rJ   NNNNr   r   r   rJ   N)	rJ   AverageSinglePointAllrJ   FNNN)r   )cbmNN)Nr   NNT)TN)TNrJ   )NNNNTr   rJ   N)Nr   r   r   TTTTN皙?r   FNNN)Nr   r   TTTTNr"  TFNNN)Nr   r   r   TTTTNr"  TFNNN)NNNNNNNTNNFNNNNNN)8r   r   r   r   r4   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r   r   r  r9  r;  r=  r?  r  r  r  rG  rI  rK  rM  ry  r  r  r  r  r  r  r  r  r  r  r&  r'  s   @r   r  r    s   /
  osx|lpswimei
H nrCJ SWbflpVZgk$(Lu\& OH7yrH 3^jK3IjH2a \D/
0
5
6,/ubE#Nt. N N +/Y5P5P]b,.Y_,1t\e.23X[7;dUY	X,v P]TX$(L\2*)AV(&0"#"h%Te$N X\]_;?HT"H D
_ _`NRvz26IV NOHLsw,0i
V hiZ^mq26o
b jnim`dbfFP8r   r  c                   ~    e Zd ZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z	 	 	 	 	 ddZddZddZddZ	ddZ
dd	Zdd
ZddZddZd Zed        Z xZS )CatBoostClassifierat_  
    Implementation of the scikit-learn API for CatBoost classification.

    Parameters
    ----------
    iterations : int, [default=500]
        Max count of trees.
        range: [1,+inf)
    learning_rate : float, [default value is selected automatically for binary classification with other parameters set to default. In all other cases default is 0.03]
        Step size shrinkage used in update to prevents overfitting.
        range: (0,1]
    depth : int, [default=6]
        Depth of a tree. All trees are the same depth.
        range: [1,16]
    l2_leaf_reg : float, [default=3.0]
        Coefficient at the L2 regularization term of the cost function.
        range: [0,+inf)
    model_size_reg : float, [default=None]
        Model size regularization coefficient.
        range: [0,+inf)
    rsm : float, [default=None]
        Subsample ratio of columns when constructing each tree.
        range: (0,1]
    loss_function : string or object, [default='Logloss']
        The metric to use in training and also selector of the machine learning
        problem to solve. If string, then the name of a supported metric,
        optionally suffixed with parameter description.
        If object, it shall provide methods 'calc_ders_range' or 'calc_ders_multi'.
    border_count : int, [default = 254 for training on CPU or 128 for training on GPU]
        The number of partitions in numeric features binarization. Used in the preliminary calculation.
        range: [1,65535] on CPU, [1,255] on GPU
    feature_border_type : string, [default='GreedyLogSum']
        The binarization mode in numeric features binarization. Used in the preliminary calculation.
        Possible values:
            - 'Median'
            - 'Uniform'
            - 'UniformAndQuantiles'
            - 'GreedyLogSum'
            - 'MaxLogSum'
            - 'MinEntropy'
    per_float_feature_quantization : list of strings, [default=None]
        List of float binarization descriptions.
        Format : described in documentation on catboost.ai
        Example 1: ['0:1024'] means that feature 0 will have 1024 borders.
        Example 2: ['0:border_count=1024', '1:border_count=1024', ...] means that two first features have 1024 borders.
        Example 3: ['0:nan_mode=Forbidden,border_count=32,border_type=GreedyLogSum',
                    '1:nan_mode=Forbidden,border_count=32,border_type=GreedyLogSum'] - defines more quantization properties for first two features.
    input_borders : string or pathlib.Path, [default=None]
        input file with borders used in numeric features binarization.
    output_borders : string, [default=None]
        output file for borders that were used in numeric features binarization.
    fold_permutation_block : int, [default=1]
        To accelerate the learning.
        The recommended value is within [1, 256]. On small samples, must be set to 1.
        range: [1,+inf)
    od_pval : float, [default=None]
        Use overfitting detector to stop training when reaching a specified threshold.
        Can be used only with eval_set.
        range: [0,1]
    od_wait : int, [default=None]
        Number of iterations which overfitting detector will wait after new best error.
    od_type : string, [default=None]
        Type of overfitting detector which will be used in program.
        Posible values:
            - 'IncToDec'
            - 'Iter'
        For 'Iter' type od_pval must not be set.
        If None, then od_type=IncToDec.
    nan_mode : string, [default=None]
        Way to process missing values for numeric features.
        Possible values:
            - 'Forbidden' - raises an exception if there is a missing value for a numeric feature in a dataset.
            - 'Min' - each missing value will be processed as the minimum numerical value.
            - 'Max' - each missing value will be processed as the maximum numerical value.
        If None, then nan_mode=Min.
    counter_calc_method : string, [default=None]
        The method used to calculate counters for dataset with Counter type.
        Possible values:
            - 'PrefixTest' - only objects up to current in the test dataset are considered
            - 'FullTest' - all objects are considered in the test dataset
            - 'SkipTest' - Objects from test dataset are not considered
            - 'Full' - all objects are considered for both learn and test dataset
        If None, then counter_calc_method=PrefixTest.
    leaf_estimation_iterations : int, [default=None]
        The number of steps in the gradient when calculating the values in the leaves.
        If None, then leaf_estimation_iterations=1.
        range: [1,+inf)
    leaf_estimation_method : string, [default=None]
        The method used to calculate the values in the leaves.
        Possible values:
            - 'Newton'
            - 'Gradient'
    thread_count : int, [default=None]
        Number of parallel threads used to run CatBoost.
        If None or -1, then the number of threads is set to the number of CPU cores.
        range: [1,+inf)
    random_seed : int, [default=None]
        Random number seed.
        If None, 0 is used.
        range: [0,+inf)
    use_best_model : bool, [default=None]
        To limit the number of trees in predict() using information about the optimal value of the error function.
        Can be used only with eval_set.
    best_model_min_trees : int, [default=None]
        The minimal number of trees the best model should have.
    verbose: bool
        When set to True, logging_level is set to 'Verbose'.
        When set to False, logging_level is set to 'Silent'.
    silent: bool, synonym for verbose
    logging_level : string, [default='Verbose']
        Possible values:
            - 'Silent'
            - 'Verbose'
            - 'Info'
            - 'Debug'
    metric_period : int, [default=1]
        The frequency of iterations to print the information to stdout. The value should be a positive integer.
    simple_ctr: list of strings, [default=None]
        Binarization settings for categorical features.
            Format : see documentation
            Example: ['Borders:CtrBorderCount=5:Prior=0:Prior=0.5', 'BinarizedTargetMeanValue:TargetBorderCount=10:TargetBorderType=MinEntropy', ...]
            CTR types:
                CPU and GPU
                - 'Borders'
                - 'Buckets'
                CPU only
                - 'BinarizedTargetMeanValue'
                - 'Counter'
                GPU only
                - 'FloatTargetMeanValue'
                - 'FeatureFreq'
                Number_of_borders, binarization type, target borders and binarizations, priors are optional parameters
    combinations_ctr: list of strings, [default=None]
    per_feature_ctr: list of strings, [default=None]
    ctr_target_border_count: int, [default=None]
        Maximum number of borders used in target binarization for categorical features that need it.
        If TargetBorderCount is specified in 'simple_ctr', 'combinations_ctr' or 'per_feature_ctr' option it
        overrides this value.
        range: [1, 255]
    ctr_leaf_count_limit : int, [default=None]
        The maximum number of leaves with categorical features.
        If the number of leaves exceeds the specified limit, some leaves are discarded.
        The leaves to be discarded are selected as follows:
            - The leaves are sorted by the frequency of the values.
            - The top N leaves are selected, where N is the value specified in the parameter.
            - All leaves starting from N+1 are discarded.
        This option reduces the resulting model size
        and the amount of memory required for training.
        Note that the resulting quality of the model can be affected.
        range: [1,+inf) (for zero limit use ignored_features)
    store_all_simple_ctr : bool, [default=None]
        Ignore categorical features, which are not used in feature combinations,
        when choosing candidates for exclusion.
        Use this parameter with ctr_leaf_count_limit only.
    max_ctr_complexity : int, [default=4]
        The maximum number of Categ features that can be combined.
        range: [0,+inf)
    has_time : bool, [default=False]
        To use the order in which objects are represented in the input data
        (do not perform a random permutation of the dataset at the preprocessing stage).
    allow_const_label : bool, [default=False]
        To allow the constant label value in dataset.
    target_border: float, [default=None]
        Border for target binarization.
    classes_count : int, [default=None]
        The upper limit for the numeric class label.
        Defines the number of classes for multiclassification.
        Only non-negative integers can be specified.
        The given integer should be greater than any of the target values.
        If this parameter is specified the labels for all classes in the input dataset
        should be smaller than the given value.
        If several of 'classes_count', 'class_weights', 'class_names' parameters are defined
        the numbers of classes specified by each of them must be equal.
    class_weights : list or dict, [default=None]
        Classes weights. The values are used as multipliers for the object weights.
        If None, all classes are supposed to have weight one.
        If list - class weights in order of class_names or sequential classes if class_names is undefined
        If dict - dict of class_name -> class_weight.
        If several of 'classes_count', 'class_weights', 'class_names' parameters are defined
        the numbers of classes specified by each of them must be equal.
    auto_class_weights : string [default=None]
        Enables automatic class weights calculation. Possible values:
            - Balanced  # weight = maxSummaryClassWeight / summaryClassWeight, statistics determined from train pool
            - SqrtBalanced  # weight = sqrt(maxSummaryClassWeight / summaryClassWeight)
    class_names: list of strings, [default=None]
        Class names. Allows to redefine the default values for class labels (integer numbers).
        If several of 'classes_count', 'class_weights', 'class_names' parameters are defined
        the numbers of classes specified by each of them must be equal.
    one_hot_max_size : int, [default=None]
        Convert the feature to float
        if the number of different values that it takes exceeds the specified value.
        Ctrs are not calculated for such features.
    random_strength : float, [default=1]
        Score standard deviation multiplier.
    random_score_type : string [default=None]
        Type of random noise added to scores.
        Possible values:
            - 'Gumbel' - Gumbel-distributed
            - 'NormalWithModelSizeDecrease' - Normally-distributed with deviation decreasing with model iteration count
        If None than 'NormalWithModelSizeDecrease' will be used by default.
    name : string, [default='experiment']
        The name that should be displayed in the visualization tools.
    ignored_features : list, [default=None]
        Indices or names of features that should be excluded when training.
    train_dir : string or pathlib.Path, [default=None]
        The directory in which you want to record generated in the process of learning files.
    custom_metric : string or list of strings, [default=None]
        To use your own metric function.
    custom_loss: alias to custom_metric
    eval_metric : string or object, [default=None]
        To optimize your custom metric in loss.
    bagging_temperature : float, [default=None]
        Controls intensity of Bayesian bagging. The higher the temperature the more aggressive bagging is.
        Typical values are in range [0, 1] (0 - no bagging, 1 - default).
    save_snapshot : bool, [default=None]
        Enable progress snapshotting for restoring progress after crashes or interruptions
    snapshot_file : string or pathlib.Path, [default=None]
        Learn progress snapshot file path, if None will use default filename
    snapshot_interval: int, [default=600]
        Interval between saving snapshots (seconds)
    fold_len_multiplier : float, [default=None]
        Fold length multiplier. Should be greater than 1
    used_ram_limit : string or number, [default=None]
        Set a limit on memory consumption (value like '1.2gb' or 1.2e9).
        WARNING: Currently this option affects CTR memory usage only.
    gpu_ram_part : float, [default=0.95]
        Fraction of the GPU RAM to use for training, a value from (0, 1].
    pinned_memory_size: int [default=None]
        Size of additional CPU pinned memory used for GPU learning,
        usually is estimated automatically, thus usually should not be set.
    allow_writing_files : bool, [default=True]
        If this flag is set to False, no files with different diagnostic info will be created during training.
        With this flag no snapshotting can be done. Plus visualisation will not
        work, because visualisation uses files that are created and updated during training.
    final_ctr_computation_mode : string, [default='Default']
        Possible values:
            - 'Default' - Compute final ctrs for all pools.
            - 'Skip' - Skip final ctr computation. WARNING: model without ctrs can't be applied.
    approx_on_full_history : bool, [default=False]
        If this flag is set to True, each approximated value is calculated using all the preceding rows in the fold (slower, more accurate).
        If this flag is set to False, each approximated value is calculated using only the beginning 1/fold_len_multiplier fraction of the fold (faster, slightly less accurate).
    boosting_type : string, default value depends on object count and feature count in train dataset and on learning mode.
        Boosting scheme.
        Possible values:
            - 'Ordered' - Gives better quality, but may slow down the training.
            - 'Plain' - The classic gradient boosting scheme. May result in quality degradation, but does not slow down the training.
    task_type : string, [default=None]
        The calcer type used to train the model.
        Possible values:
            - 'CPU'
            - 'GPU'
    device_config : string, [default=None], deprecated, use devices instead
    devices : list or string, [default=None], GPU devices to use.
        String format is: '0' for 1 device or '0:1:3' for multiple devices or '0-3' for range of devices.
        List format is : [0] for 1 device or [0,1,3] for multiple devices.

    bootstrap_type : string, Bayesian, Bernoulli, Poisson, MVS.
        Default bootstrap is Bayesian for GPU and MVS for CPU.
        Poisson bootstrap is supported only on GPU.
        MVS bootstrap is supported only on GPU.

    subsample : float, [default=None]
        Sample rate for bagging. This parameter can be used with Poisson or Bernoulli bootstrap types.

    mvs_reg : float, [default is set automatically at each iteration based on gradient distribution]
        Regularization parameter for MVS sampling algorithm

    monotone_constraints : list or numpy.ndarray or string or dict, [default=None]
        Monotone constraints for features.

    feature_weights : list or numpy.ndarray or string or dict, [default=None]
        Coefficient to multiply split gain with specific feature use. Should be non-negative.

    penalties_coefficient : float, [default=1]
        Common coefficient for all penalties. Should be non-negative.

    first_feature_use_penalties : list or numpy.ndarray or string or dict, [default=None]
        Penalties to first use of specific feature in model. Should be non-negative.

    per_object_feature_penalties : list or numpy.ndarray or string or dict, [default=None]
        Penalties for first use of feature for each object. Should be non-negative.

    sampling_frequency : string, [default=PerTree]
        Frequency to sample weights and objects when building trees.
        Possible values:
            - 'PerTree' - Before constructing each new tree
            - 'PerTreeLevel' - Before choosing each new split of a tree

    sampling_unit : string, [default='Object'].
        Possible values:
            - 'Object'
            - 'Group'
        The parameter allows to specify the sampling scheme:
        sample weights for each object individually or for an entire group of objects together.

    dev_score_calc_obj_block_size: int, [default=5000000]
        CPU only. Size of block of samples in score calculation. Should be > 0
        Used only for learning speed tuning.
        Changing this parameter can affect results due to numerical accuracy differences

    dev_efb_max_buckets : int, [default=1024]
        CPU only. Maximum bucket count in exclusive features bundle. Should be in an integer between 0 and 65536.
        Used only for learning speed tuning.

    sparse_features_conflict_fraction : float, [default=0.0]
        CPU only. Maximum allowed fraction of conflicting non-default values for features in exclusive features bundle.
        Should be a real value in [0, 1) interval.

    grow_policy : string, [SymmetricTree,Lossguide,Depthwise], [default=SymmetricTree]
        The tree growing policy. It describes how to perform greedy tree construction.

    min_data_in_leaf : int, [default=1].
        The minimum training samples count in leaf.
        CatBoost will not search for new splits in leaves with samples count less than min_data_in_leaf.
        This parameter is used only for Depthwise and Lossguide growing policies.

    max_leaves : int, [default=31],
        The maximum leaf count in resulting tree.
        This parameter is used only for Lossguide growing policy.

    score_function : string, possible values L2, Cosine, NewtonL2, NewtonCosine, [default=Cosine]
        For growing policy Lossguide default=NewtonL2.
        GPU only. Score that is used during tree construction to select the next tree split.

    max_depth : int, Synonym for depth.

    n_estimators : int, synonym for iterations.

    num_trees : int, synonym for iterations.

    num_boost_round : int, synonym for iterations.

    colsample_bylevel : float, synonym for rsm.

    random_state : int, synonym for random_seed.

    reg_lambda : float, synonym for l2_leaf_reg.

    objective : string, synonym for loss_function.

    num_leaves : int, synonym for max_leaves.

    min_child_samples : int, synonym for min_data_in_leaf

    eta : float, synonym for learning_rate.

    max_bin : float, synonym for border_count.

    scale_pos_weight : float, synonym for class_weights.
        Can be used only for binary classification. Sets weight multiplier for
        class 1 to scale_pos_weight value.

    metadata : dict, string to string key-value pairs to be stored in model metadata storage

    early_stopping_rounds : int
        Synonym for od_wait. Only one of these parameters should be set.

    cat_features : list or numpy.ndarray, [default=None]
        If not None, giving the list of categorical feature indices or names (names are represented as strings).
        If it contains feature names, feature names must be defined for the training dataset passed to 'fit'.

    text_features : list or numpy.ndarray, [default=None]
        If not None, giving the list of Text features indices or names (names are represented as strings).
        If it contains feature names, feature names must be defined for the training dataset passed to 'fit'.

    embedding_features : list or numpy.ndarray, [default=None]
        If not None, giving the list of Embedding features indices or names (names are represented as strings).
        If it contains feature names, feature names must be defined for the training dataset passed to 'fit'.

    leaf_estimation_backtracking : string, [default=None]
        Type of backtracking during gradient descent.
        Possible values:
            - 'No' - never backtrack; supported on CPU and GPU
            - 'AnyImprovement' - reduce the descent step until the value of loss function is less than before the step; supported on CPU and GPU
            - 'Armijo' - reduce the descent step until Armijo condition is satisfied; supported on GPU only

    model_shrink_rate : float, [default=0]
        This parameter enables shrinkage of model at the start of each iteration. CPU only.
        For Constant mode shrinkage coefficient is calculated as (1 - model_shrink_rate * learning_rate).
        For Decreasing mode shrinkage coefficient is calculated as (1 - model_shrink_rate / iteration).
        Shrinkage coefficient should be in [0, 1).

    model_shrink_mode : string, [default=None]
        Mode of shrinkage coefficient calculation. CPU only.
        Possible values:
            - 'Constant' - Shrinkage coefficient is constant at each iteration.
            - 'Decreasing' - Shrinkage coefficient decreases at each iteration.

    langevin : bool, [default=False]
        Enables the Stochastic Gradient Langevin Boosting. CPU only.

    diffusion_temperature : float, [default=0]
        Langevin boosting diffusion temperature. CPU only.

    posterior_sampling : bool, [default=False]
        Sets a group of parameters for further use in uncertainty prediction:
            - Langevin = True
            - Model Shrink Rate = 1/(2N), where N is dataset size
            - Model Shrink Mode = Constant
            - Diffusion-temperature = N, where N is dataset size. CPU only.

    boost_from_average : bool, [default=True for RMSE, False for other losses]
        Enables to initialize approx values by best constant value for specified loss function.
        Available for RMSE, Logloss, CrossEntropy, Quantile and MAE.

    tokenizers : list of dicts,
        Each dict is a tokenizer description. Example:
        ```
        [
            {
                'tokenizer_id': 'Tokenizer',  # Tokenizer identifier.
                'lowercasing': 'false',  # Possible values: 'true', 'false'.
                'number_process_policy': 'LeaveAsIs',  # Possible values: 'Skip', 'LeaveAsIs', 'Replace'.
                'number_token': '%',  # Rarely used character. Used in conjunction with Replace NumberProcessPolicy.
                'separator_type': 'ByDelimiter',  # Possible values: 'ByDelimiter', 'BySense'.
                'delimiter': ' ',  # Used in conjunction with ByDelimiter SeparatorType.
                'split_by_set': 'false',  # Each single character in delimiter used as individual delimiter.
                'skip_empty': 'true',  # Possible values: 'true', 'false'.
                'token_types': ['Word', 'Number', 'Unknown'],  # Used in conjunction with BySense SeparatorType.
                    # Possible values: 'Word', 'Number', 'Punctuation', 'SentenceBreak', 'ParagraphBreak', 'Unknown'.
                'subtokens_policy': 'SingleToken',  # Possible values:
                    # 'SingleToken' - All subtokens are interpreted as a single token.
                    # 'SeveralTokens' - All subtokens are interpreted as several tokens.
            },
            ...
        ]
        ```

    dictionaries : list of dicts,
        Each dict is a dictionary description. Example:
        ```
        [
            {
                'dictionary_id': 'Dictionary',  # Dictionary identifier.
                'token_level_type': 'Word',  # Possible values: 'Word', 'Letter'.
                'gram_order': '1',  # 1 for Unigram, 2 for Bigram, ...
                'skip_step': '0',  # 1 for 1-skip-gram, ...
                'end_of_word_token_policy': 'Insert',  # Possible values: 'Insert', 'Skip'.
                'end_of_sentence_token_policy': 'Skip',  # Possible values: 'Insert', 'Skip'.
                'occurrence_lower_bound': '3',  # The lower bound of token occurrences in the text to include it in the dictionary.
                'max_dictionary_size': '50000',  # The max dictionary size.
            },
            ...
        ]
        ```

    feature_calcers : list of strings,
        Each string is a calcer description. Example:
        ```
        [
            'NaiveBayes',
            'BM25',
            'BoW:top_tokens_count=2000',
        ]
        ```

    text_processing : dict,
        Text processing description.

    eval_fraction : float, [default=None]
        Fraction of the train dataset to be used as the evaluation dataset.
    rv  cx                     i }xg d}yt        t               j                               D ]  \  }z}{|zyvs{{xz<    t        t        |   x       y N)
not_paramsr3   r   rm  )r   r   r  rW  r$  r4   )}r3   rT  rK  rM  rR  model_size_regrO  rc  r   r   r   r   output_bordersfold_permutation_blockr  rX  r  r   counter_calc_methodleaf_estimation_iterationsleaf_estimation_methodrk  r   r  best_model_min_treesr   r   r   r   ctr_leaf_count_limitstore_all_simple_ctrmax_ctr_complexityr  allow_const_labelrw  classes_countrf  rg  rh  one_hot_max_sizerandom_strengthrandom_score_typer   r   r1  r[  rZ  ro  bagging_temperaturer  r  r  fold_len_multiplierr   gpu_ram_partpinned_memory_sizeallow_writing_filesfinal_ctr_computation_modeapprox_on_full_historyboosting_type
simple_ctrcombinations_ctrper_feature_ctrr  r  r   device_configdevicesbootstrap_type	subsamplemvs_regsampling_unitsampling_frequencydev_score_calc_obj_block_sizer   r   rN  rU  rV  rW  rP  rQ  rS  rb  rL  r  rd  gpu_cat_features_storagedata_partitionmetadatarY  rY  grow_policyr^  r_  r\  r]  score_functionleaf_estimation_backtrackingctr_history_unitr  r  penalties_coefficientr  r  model_shrink_ratemodel_shrink_modelangevindiffusion_temperatureposterior_samplingboost_from_averagerZ  
tokenizersdictionariesfeature_calcerstext_processingr[  callbackr  r  r   r'  rz   rv   rm  s}                                                                                                                               r   r4   zCatBoostClassifier.__init__  sS    t B
#FHMMO4JC*$):#s 5 	 $08r   c                 $   | j                   j                         }t        |       d|v rt        j	                  |d           | j
                  g |||||d||dddd||	|
||||||||||||||||  | S )a1  
        Fit the CatBoostClassifier model.

        Parameters
        ----------
        X : catboost.Pool or list or numpy.ndarray or pandas.DataFrame or pandas.Series
            If not catboost.Pool, 2 dimensional Feature matrix or string - file with dataset.

        y : list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None)
            Labels of the training data.
            If not None, can be a single- or two- dimensional array with either:
              - numerical values - for binary classification problems
              - class labels (boolean, integer or string)
            Use only if X is not catboost.Pool and does not point to a file.

        cat_features : list or numpy.ndarray, optional (default=None)
            If not None, giving the list of categorical column indices.
            Use only if X is not catboost.Pool.

        text_features : list or numpy.ndarray, optional (default=None)
            If not None, giving the list of Text columns indices.
            Use only if X is not catboost.Pool.

        embedding_features : list or numpy.ndarray, optional (default=None)
            If not None, giving the list of Embedding columns indices.
            Use only if X is not catboost.Pool.

        graph : list or numpy.ndarray or pandas.DataFrame
            The graph edges list description.
            If list, numpy.ndarray or pandas.DataFrame, it must be 2-dimensional.

        sample_weight : list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None)
            Instance weights, 1 dimensional array like.

        baseline : list or numpy.ndarray, optional (default=None)
            If not None, giving 2 dimensional array like data.
            Use only if X is not catboost.Pool.

        use_best_model : bool, optional (default=None)
            Flag to use best model

        eval_set : catboost.Pool or list of catboost.Pool or tuple (X, y) or list [(X, y)], optional (default=None)
            Validation dataset or datasets for metrics calculation and possibly early stopping.

        metric_period : int
            Frequency of evaluating metrics.

        verbose : bool or int
            If verbose is bool, then if set to True, logging_level is set to Verbose,
            if set to False, logging_level is set to Silent.
            If verbose is int, it determines the frequency of writing metrics to output and
            logging_level is set to Verbose.

        silent : bool
            If silent is True, logging_level is set to Silent.
            If silent is False, logging_level is set to Verbose.

        logging_level : string, optional (default=None)
            Possible values:
                - 'Silent'
                - 'Verbose'
                - 'Info'
                - 'Debug'

        plot : bool, optional (default=False)
            If True, draw train and eval error in Jupyter notebook

        plot_file : file-like or str, optional (default=None)
            If not None, save train and eval error graphs to file

        verbose_eval : bool or int
            Synonym for verbose. Only one of these parameters should be set.

        early_stopping_rounds : int
            Activates Iter overfitting detector with od_wait set to early_stopping_rounds.

        save_snapshot : bool, [default=None]
            Enable progress snapshotting for restoring progress after crashes or interruptions

        snapshot_file : string or pathlib.Path, [default=None]
            Learn progress snapshot file path, if None will use default filename

        snapshot_interval: int, [default=600]
            Interval between saving snapshots (seconds)

        init_model : CatBoost class or string or pathlib.Path, [default=None]
            Continue training starting from the existing model.
            If this parameter is a string or pathlib.Path, load initial model from the path specified by this string.

        callbacks : list, optional (default=None)
            List of callback objects that are applied at end of each iteration.

        log_cout: output stream or callback for logging (default=None)
            If None is specified, sys.stdout is used

        log_cerr: error stream or callback for logging (default=None)
            If None is specified, sys.stderr is used

        Returns
        -------
        model : CatBoost
        rc  N)r  r  r  r$  _check_is_compatible_lossr  r3   r+  r   rY  rZ  r[  r_  r,  rh  r  r  r   r   rA  rB  r]  r   r   r   rY  r  r  r  r  r  rW   rX   r   s                               r   r  zCatBoostClassifier.fit  ss   X ""'')&!f$889PQ		 	M! 	MQ 	M 	Mm 	M5G 	M 	Mu 	MVc 	Mei 	Mko 	Mqu 	Mw{ 	M  ~F 	M  HV 	M	M#	M%2	M48	M:C	MEW	MYe	Mgt	M	M/	M1>	M@M	MO`	Mbl	Mnw	M zB	M DL	M r   c           
      2    | j                  ||||||d|      S )aI  
        Predict with data.

        Parameters
        ----------
        data : catboost.Pool or list of features or list of lists or numpy.ndarray or pandas.DataFrame or pandas.Series
                or catboost.FeaturesData
            Data to apply model on.
            If data is a simple list (not list of lists) or a one-dimensional numpy.ndarray it is interpreted
            as a list of features for a single object.

        prediction_type : string, optional (default='Class')
            Can be:
            - 'RawFormulaVal' : return raw formula value.
            - 'Class' : return class label.
            - 'Probability' : return probability for every class.
            - 'LogProbability' : return log probability for every class.

        ntree_start: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) (zero-based indexing).

        ntree_end: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) (zero-based indexing).
            If value equals to 0 this parameter is ignored and ntree_end equal to tree_count_.

        thread_count : int (default=-1)
            The number of threads to use when applying the model.
            Allows you to optimize the speed of execution. This parameter doesn't affect results.
            If -1, then the number of threads is set to the number of CPU cores.

        verbose : bool, optional (default=False)
            If True, writes the measured evaluation metrics to stderr.

        task_type : string, [default=None]
            The evaluator type.
            Possible values:
                - 'CPU'
                - 'GPU' (models with only numerical features are supported for now)

        Returns
        -------
        prediction:
            If data is for a single object, the return value depends on prediction_type value:
                - 'RawFormulaVal' : return raw formula value.
                - 'Class' : return class label.
                - 'Probability' : return one-dimensional numpy.ndarray with probability for every class.
                - 'LogProbability' : return one-dimensional numpy.ndarray with
                  log probability for every class.
            otherwise numpy.ndarray, with values that depend on prediction_type value:
                - 'RawFormulaVal' : one-dimensional array of raw formula value for each object.
                - 'Class' : one-dimensional array of class label for each object.
                - 'Probability' : two-dimensional numpy.ndarray with shape (number_of_objects x number_of_classes)
                  with probability for every class for each object.
                - 'LogProbability' : two-dimensional numpy.ndarray with shape (number_of_objects x number_of_classes)
                  with log probability for every class for each object.
        r  r  r  s           r   r  zCatBoostClassifier.predict  s%    r }}T?KLZaclnwxxr   c           
      2    | j                  |d||||d|      S )a  
        Predict class probability with X.

        Parameters
        ----------
        X : catboost.Pool or list of features or list of lists or numpy.ndarray or pandas.DataFrame or pandas.Series
                or catboost.FeaturesData
            Data to apply model on.
            If X is a simple list (not list of lists) or a one-dimensional numpy.ndarray it is interpreted
            as a list of features for a single object.

        ntree_start: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) (zero-based indexing).

        ntree_end: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) (zero-based indexing).
            If value equals to 0 this parameter is ignored and ntree_end equal to tree_count_.

        thread_count : int (default=-1)
            The number of threads to use when applying the model.
            Allows you to optimize the speed of execution. This parameter doesn't affect results.
            If -1, then the number of threads is set to the number of CPU cores.

        verbose : bool
            If True, writes the measured evaluation metrics to stderr.

        task_type : string, [default=None]
            The evaluator type.
            Possible values:
                - 'CPU'
                - 'GPU' (models with only numerical features are supported for now)

        Returns
        -------
        prediction :
            If X is for a single object
                return one-dimensional numpy.ndarray with probability for every class.
            otherwise
                return two-dimensional numpy.ndarray with shape (number_of_objects x number_of_classes)
                with probability for every class for each object.
        r  predict_probar  )r3   r+  r  r  rk  r   r   s          r   rb  z CatBoostClassifier.predict_proba  s%    T }}Q{I|U\^moxyyr   c           
      2    | j                  |d||||d|      S )a  
        Predict class log probability with data.

        Parameters
        ----------
        data : catboost.Pool or list of features or list of lists or numpy.ndarray or pandas.DataFrame or pandas.Series
                or catboost.FeaturesData
            Data to apply model on.
            If data is a simple list (not list of lists) or a one-dimensional numpy.ndarray it is interpreted
            as a list of features for a single object.

        ntree_start: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) (zero-based indexing).

        ntree_end: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) (zero-based indexing).
            If value equals to 0 this parameter is ignored and ntree_end equal to tree_count_.

        thread_count : int (default=-1)
            The number of threads to use when applying the model.
            Allows you to optimize the speed of execution. This parameter doesn't affect results.
            If -1, then the number of threads is set to the number of CPU cores.

        verbose : bool
            If True, writes the measured evaluation metrics to stderr.

        task_type : string, [default=None]
            The evaluator type.
            Possible values:
                - 'CPU'
                - 'GPU' (models with only numerical features are supported for now)

        Returns
        -------
        prediction :
            If data is for a single object
                return one-dimensional numpy.ndarray with log probability for every class.
            otherwise
                return two-dimensional numpy.ndarray with shape (number_of_objects x number_of_classes)
                with log probability for every class for each object.
        r  predict_log_probar  )r3   rX  r  r  rk  r   r   s          r   rd  z$CatBoostClassifier.predict_log_proba  s0    T }}T#3[)\[bdw  zC  D  	Dr   c           
      2    | j                  |||||||d      S )a  
        Predict target at each stage for data.

        Parameters
        ----------
        data : catboost.Pool or list of features or list of lists or numpy.ndarray or pandas.DataFrame or pandas.Series
                or catboost.FeaturesData
            Data to apply model on.
            If data is a simple list (not list of lists) or a one-dimensional numpy.ndarray it is interpreted
            as a list of features for a single object.

        prediction_type : string, optional (default='Class')
            Can be:
            - 'RawFormulaVal' : return raw formula value.
            - 'Class' : return class label.
            - 'Probability' : return probability for every class.
            - 'LogProbability' : return log probability for every class.

        ntree_start: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) with the step eval_period (zero-based indexing).

        ntree_end: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) with the step eval_period (zero-based indexing).
            If value equals to 0 this parameter is ignored and ntree_end equal to tree_count_.

        eval_period: int, optional (default=1)
            Model is applied on the interval [ntree_start, ntree_end) with the step eval_period (zero-based indexing).

        thread_count : int (default=-1)
            The number of threads to use when applying the model.
            Allows you to optimize the speed of execution. This parameter doesn't affect results.
            If -1, then the number of threads is set to the number of CPU cores.

        verbose : bool
            If True, writes the measured evaluation metrics to stderr.

        Returns
        -------
        prediction : generator for each iteration that generates:
            If data is for a single object, the return value depends on prediction_type value:
                - 'RawFormulaVal' : return raw formula value.
                - 'Class' : return majority vote class.
                - 'Probability' : return one-dimensional numpy.ndarray with probability for every class.
                - 'LogProbability' : return one-dimensional numpy.ndarray with
                  log probability for every class.
            otherwise numpy.ndarray, with values that depend on prediction_type value:
                - 'RawFormulaVal' : one-dimensional array of raw formula value for each object.
                - 'Class' : one-dimensional array of class label for each object.
                - 'Probability' : two-dimensional numpy.ndarray with shape (number_of_objects x number_of_classes)
                  with probability for every class for each object.
                - 'LogProbability' : two-dimensional numpy.ndarray with shape (number_of_objects x number_of_classes)
                  with log probability for every class for each object.
        r  r  r  s           r   r  z!CatBoostClassifier.staged_predict  s2    l ##D/;	S^`lnu  xH  I  	Ir   c           
      2    | j                  |d|||||d      S )aO  
        Predict classification target at each stage for data.

        Parameters
        ----------
        data : catboost.Pool or list of features or list of lists or numpy.ndarray or pandas.DataFrame or pandas.Series
                or catboost.FeaturesData
            Data to apply model on.
            If data is a simple list (not list of lists) or a one-dimensional numpy.ndarray it is interpreted
            as a list of features for a single object.

        ntree_start: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) with the step eval_period (zero-based indexing).

        ntree_end: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) with the step eval_period (zero-based indexing).
            If value equals to 0 this parameter is ignored and ntree_end equal to tree_count_.

        eval_period: int, optional (default=1)
            Model is applied on the interval [ntree_start, ntree_end) with the step eval_period (zero-based indexing).

        thread_count : int (default=-1)
            The number of threads to use when applying the model.
            Allows you to optimize the speed of execution. This parameter doesn't affect results.
            If -1, then the number of threads is set to the number of CPU cores.

        verbose : bool
            If True, writes the measured evaluation metrics to stderr.

        Returns
        -------
        prediction : generator for each iteration that generates:
            If data is for a single object
                return one-dimensional numpy.ndarray with probability for every class.
            otherwise
                return two-dimensional numpy.ndarray with shape (number_of_objects x number_of_classes)
                with probability for every class for each object.
        r  staged_predict_probar  r3   rX  r  r  r  rk  r   s          r   rg  z'CatBoostClassifier.staged_predict_probaM  s2    N ##D-iQ\^jls  vL  M  	Mr   c           
      2    | j                  |d|||||d      S )aW  
        Predict classification target at each stage for data.

        Parameters
        ----------
        data : catboost.Pool or list of features or list of lists or numpy.ndarray or pandas.DataFrame or pandas.Series
                or catboost.FeaturesData
            Data to apply model on.
            If data is a simple list (not list of lists) or a one-dimensional numpy.ndarray it is interpreted
            as a list of features for a single object.

        ntree_start: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) with the step eval_period (zero-based indexing).

        ntree_end: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) with the step eval_period (zero-based indexing).
            If value equals to 0 this parameter is ignored and ntree_end equal to tree_count_.

        eval_period: int, optional (default=1)
            Model is applied on the interval [ntree_start, ntree_end) with the step eval_period (zero-based indexing).

        thread_count : int (default=-1)
            The number of threads to use when applying the model.
            Allows you to optimize the speed of execution. This parameter doesn't affect results.
            If -1, then the number of threads is set to the number of CPU cores.

        verbose : bool
            If True, writes the measured evaluation metrics to stderr.

        Returns
        -------
        prediction : generator for each iteration that generates:
            If data is for a single object
                return one-dimensional numpy.ndarray with log probability for every class.
            otherwise
                return two-dimensional numpy.ndarray with shape (number_of_objects x number_of_classes)
                with log probability for every class for each object.
        r  staged_predict_log_probar  rh  s          r   rj  z+CatBoostClassifier.staged_predict_log_probav  s3    N ##D*:KT_amov  yS  T  	Tr   c           	         t        |t              r*|t        d      |j                         }|t        d      t        |t              rXt        |j                        dk7  r-t        dj                  t        |j                                    ||j                  d      }n|t        d      t        j                  |      }| j                  |dddd	dd
      j                  d	      }t        j                  |j                  t        j                        r9t        j                  |j                  t        j                        rt        d      |j                  t        j                   k(  r9t        j                  |j                  t        j                        r}t        d      t        j                  |j                  t        j                        rt        d      t        j                  |j                  t        j                         rt        d      t        j"                  t        j                  |      t        j                  |      k(        S )a=  
        Calculate accuracy.

        Parameters
        ----------
        X : catboost.Pool or list or numpy.ndarray or pandas.DataFrame or pandas.Series
            Data to apply model on.
        y : list or numpy.ndarray
            True labels.

        Returns
        -------
        accuracy : float
        N\Wrong initializing y: X is catboost.Pool object, y must be initialized inside catboost.Pool.Label in X has not initialized.r   z=y is DataFrame and has {} columns, but must have exactly one.r   y should be specified.r  rJ   scorer  r  r  rk  r   r  zDpredicted classes have numeric type but specified y contains stringszDpredicted classes have boolean type but specified y contains stringsz=predicted classes have string type but specified y is numericz=predicted classes have string type but specified y is boolean)rg   rD  rD   r|  r   r_   r  r   r   r  r  r  
issubdtyperP  number	characterbool_mean)r3   r+  r   predicted_classess       r   ro  zCatBoostClassifier.score  s    a}#  %C  D  DAy#$EFFa#199~"#$c$j$jknopoxoxky$z{{!))A,AY 899HHQK MM#& * 
 '"+ 	 ==*00"))<}}QWWbll3#$jkk$$0}}QWWbll3#$jkk}}QWWbii0#$cddqww1#$cddwwrxx 12bhhqkABBr   c                 .   | j                         st        d      | j                         }|d|j                         v r|d= yyt	        |t
              st        d      d|cxk  rdk  sJ d        J d       t        |      | j                         d<   y)z
        Set a threshold for class separation in binary classification task for a trained model.
        :param binclass_probability_threshold: float number in [0, 1] or None to discard it
        z9You can't set probability threshold for not fitted model.Nbinclass_probability_thresholdz3binclass_probability_threshold must have float typer  r  z^Please provide correct probability for binclass_probability_threshold argument in [0, 1] range)r  rD   rD  rU  rg   ro   r   )r3   rx  rL  s      r   set_probability_thresholdz,CatBoostClassifier.set_probability_threshold  s    
 ~~ [\\$$&)1/8==?B=> C <kJ#$YZZ7=2= qpq= qpq=DGHfDgD @Ar   c                 l    | j                         st        d      | j                  j                         S )zT
        Get a threshold for class separation in binary classification task
        z5Not fitted models don't have a probability threshold.)r  rD   r  #_get_binclass_probability_thresholdr@   s    r   get_probability_thresholdz,CatBoostClassifier.get_probability_threshold  s-     ~~ WXX||??AAr   c                     t        | t              r0t        j                  |       st	        dj                  |             y y )NzInvalid loss_function='{}': for classifier use Logloss, CrossEntropy, MultiClass, MultiClassOneVsAll or custom objective object)rg   r   r  r2  rD   r   r1  s    r   r^  z,CatBoostClassifier._check_is_compatible_loss  sL    mS)(2W2WXe2f !ssysy  {H  tIJ J 3g)r   )wNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNFNNNNNNNNNNNNN)r  r   r   rJ   Nr  )r   r   rJ   Nr  )r  r   r   r   rJ   Nr  r(   )r   r   r   r   r  r4   r  r  rb  rd  r  rg  rj  ro  ry  r|  rb   r^  r&  r'  s   @r   r$  r$    s   M^ #O  '+# #'#!!!   #'# $&* *.!%"%)!"$(%)" q@9D \`lpVZgk$(	tl9yv*zX*DX6Ip'MR'TR0Cdh$B J Jr   r$  c                   J    e Zd ZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d
 fd	Z	 	 	 	 	 	 ddZddZddZddZ	e
d        Zd	 Z xZS )CatBoostRegressora  
    Implementation of the scikit-learn API for CatBoost regression.

    Parameters
    ----------
    Like in CatBoostClassifier, except loss_function, classes_count, class_names and class_weights

    loss_function : string, [default='RMSE']
        'RMSE'
        'MAE'
        'Quantile:alpha=value'
        'LogLinQuantile:alpha=value'
        'Poisson'
        'MAPE'
        'Lq:q=value'
        'SurvivalAft:dist=value;scale=value'
    	regressorcq                     i }qg d}rt        t               j                               D ]  \  }s}t|srvsttqs<    t        t        |   q       y r&  )r   r   r  rW  r  r4   vr3   rT  rK  rM  rR  r(  rO  rc  r   r   r   r   r)  r*  r  rX  r  r   r+  r,  r-  rk  r   r  r.  r   r   r   r   r/  r0  r1  r  r2  rw  r4  r5  r6  r   r   r1  rZ  ro  r7  r  r  r  r8  r   r9  r:  r;  r<  r=  r>  r?  r@  rA  r  r  r   rB  rC  rD  rE  rF  rH  rG  rI  r   r   rN  rU  rV  rW  rP  rQ  rS  rb  rL  r  rJ  rK  rL  rY  rY  rM  r^  r_  r\  r]  rN  rO  rP  r  r  rQ  r  r  rR  rS  rT  rU  rV  rW  rZ  rX  rY  rZ  r[  r[  r  r  r   r'  rz   rv   rm  sv                                                                                                                        r   r4   zCatBoostRegressor.__init__  sS    f B
#FHMMO4JC*$):#s 5 	/7r   c                    t        | j                        }t        |       d|v rt        j	                  |d           | j
                  g |||||d||dddd||	|
|||||||||||||||| S )a  
        Fit the CatBoost model.

        Parameters
        ----------
        X : catboost.Pool or list or numpy.ndarray or pandas.DataFrame or pandas.Series
            If not catboost.Pool, 2 dimensional Feature matrix or string - file with dataset.

        y : list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None)
            Labels of the training data.
            If not None, can be a single- or two- dimensional array with numerical values.
            Use only if X is not catboost.Pool and does not point to a file.

        cat_features : list or numpy.ndarray, optional (default=None)
            If not None, giving the list of Categ columns indices.
            Use only if X is not catboost.Pool.

        text_features : list or numpy.ndarray, optional (default=None)
            If not None, giving the list of Text columns indices.
            Use only if X is not catboost.Pool.

        embedding_features : list or numpy.ndarray, optional (default=None)
            If not None, giving the list of Embedding columns indices.
            Use only if X is not catboost.Pool.

        graph : list or numpy.ndarray or pandas.DataFrame
            The graph edges list description.
            If list or numpy.ndarrays or pandas.DataFrame, giving 2 dimensional.

        sample_weight : list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None)
            Instance weights, 1 dimensional array like.

        baseline : list or numpy.ndarray, optional (default=None)
            If not None, giving 2 dimensional array like data.
            Use only if X is not catboost.Pool.

        use_best_model : bool, optional (default=None)
            Flag to use best model

        eval_set : catboost.Pool or list of catboost.Pool or tuple (X, y) or list [(X, y)], optional (default=None)
            Validation dataset or datasets for metrics calculation and possibly early stopping.

        metric_period : int
            Frequency of evaluating metrics.

        verbose : bool or int
            If verbose is bool, then if set to True, logging_level is set to Verbose,
            if set to False, logging_level is set to Silent.
            If verbose is int, it determines the frequency of writing metrics to output and
            logging_level is set to Verbose.

        silent : bool
            If silent is True, logging_level is set to Silent.
            If silent is False, logging_level is set to Verbose.

        logging_level : string, optional (default=None)
            Possible values:
                - 'Silent'
                - 'Verbose'
                - 'Info'
                - 'Debug'

        plot : bool, optional (default=False)
            If True, draw train and eval error in Jupyter notebook

        plot_file : file-like or str, optional (default=None)
            If not None, save train and eval error graphs to file (requires installed plotly)

        verbose_eval : bool or int
            Synonym for verbose. Only one of these parameters should be set.

        early_stopping_rounds : int
            Activates Iter overfitting detector with od_wait set to early_stopping_rounds.

        save_snapshot : bool, [default=None]
            Enable progress snapshotting for restoring progress after crashes or interruptions

        snapshot_file : string or pathlib.Path, [default=None]
            Learn progress snapshot file path, if None will use default filename

        snapshot_interval: int, [default=600]
            Interval between saving snapshots (seconds)

        init_model : CatBoost class or string or pathlib.Path, [default=None]
            Continue training starting from the existing model.
            If this parameter is a string or pathlib.Path, load initial model from the path specified by this string.

        callbacks : list, optional (default=None)
            List of callback objects that are applied at end of each iteration.

        log_cout: output stream or callback for logging (default=None)
            If None is specified, sys.stdout is used

        log_cerr: error stream or callback for logging (default=None)
            If None is specified, sys.stderr is used

        Returns
        -------
        model : CatBoost
        rc  N)r   r  r  r  r^  r  r_  s                               r   r  zCatBoostRegressor.fit  s   V $++,&!f$778OPtyy u uA u| u] u<N uPT uV[ u]j ulp urv ux| u  C u  EM u'u)1u3:u<IuKOuQZu\nu%u'4u6<u>Su 'u )6u 8Iu KUu W`u bju ltu 	ur   c           
      V    || j                         }| j                  ||||||d|      S )aL  
        Predict with data.

        Parameters
        ----------
        data : catboost.Pool or list of features or list of lists or numpy.ndarray or pandas.DataFrame or pandas.Series
                or catboost.FeaturesData
            Data to apply model on.
            If data is a simple list (not list of lists) or a one-dimensional numpy.ndarray it is interpreted
            as a list of features for a single object.

        prediction_type : string, optional (default='RawFormulaVal')
            Can be:
            - 'RawFormulaVal' : return raw formula value.
            - 'Exponent' : return Exponent of raw formula value.

        ntree_start: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) (zero-based indexing).

        ntree_end: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) (zero-based indexing).
            If value equals to 0 this parameter is ignored and ntree_end equal to tree_count_.

        thread_count : int (default=-1)
            The number of threads to use when applying the model.
            Allows you to optimize the speed of execution. This parameter doesn't affect results.
            If -1, then the number of threads is set to the number of CPU cores.

        verbose : bool
            If True, writes the evaluation metric measured set to stderr.

        task_type : string, [default=None]
            The evaluator type.
            Possible values:
                - 'CPU'
                - 'GPU' (models with only numerical features are supported for now)

        Returns
        -------
        prediction :
            If data is for a single object, the return value is single float formula return value
            otherwise one-dimensional numpy.ndarray of formula return values for each object.
        r  )_get_default_prediction_typer  r  s           r   r  zCatBoostRegressor.predict  s9    X ""??AO}}T?KLZaclnwxxr   c           
      2    | j                  |||||||d      S )a  
        Predict target at each stage for data.

        Parameters
        ----------
        data : catboost.Pool or list of features or list of lists or numpy.ndarray or pandas.DataFrame or pandas.Series
                or catboost.FeaturesData
            Data to apply model on.
            If data is a simple list (not list of lists) or a one-dimensional numpy.ndarray it is interpreted
            as a list of features for a single object.

        ntree_start: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) with the step eval_period (zero-based indexing).

        ntree_end: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) with the step eval_period (zero-based indexing).
            If value equals to 0 this parameter is ignored and ntree_end equal to tree_count_.

        eval_period: int, optional (default=1)
            Model is applied on the interval [ntree_start, ntree_end) with the step eval_period (zero-based indexing).

        thread_count : int (default=-1)
            The number of threads to use when applying the model.
            Allows you to optimize the speed of execution. This parameter doesn't affect results.
            If -1, then the number of threads is set to the number of CPU cores.

        verbose : bool
            If True, writes the evaluation metric measured set to stderr.

        Returns
        -------
        prediction : generator for each iteration that generates:
            If data is for a single object, the return value is single float formula return value
            otherwise one-dimensional numpy.ndarray of formula return values for each object.
        r  r  r  s           r   r  z CatBoostRegressor.staged_predict&  s2    H ##D/;	S^`lnu  xH  I  	Ir   c           	         t        |t              r*|t        d      |j                         }|t        d      |t        d      t	        j
                  |t        j                        }| j                  || j                         ddddd	      }| j                  j                         }|d
k(  r	|dddf   }|j                  |j                  k7  r1d}t        |j                  |j                  |j                              |j                  |j                        }t	        j                  ||j!                  d      z
  dz        }t	        j                  ||z
  dz        }d||z  z
  S )a3  
        Calculate R^2.

        Parameters
        ----------
        X : catboost.Pool or list or numpy.ndarray or pandas.DataFrame or pandas.Series
            Data to apply model on.
        y : list or numpy.ndarray
            True labels.

        Returns
        -------
        R^2 : float
        Nrl  rm  rn  r  r   rJ   ro  rp  r  zLlabels and predictions should have same size. But y.size={} != preds.size={})axisr   r   )rg   rD  rD   r|  r   r  float64r  r  r  r  r   r   r  r  r   ru  )r3   r+  r   r  r  msgtotal_sum_of_squaresresidual_sum_of_squaress           r   ro  zCatBoostRegressor.scoreL  s]    a}#  %C  D  DAy#$EFFY 899HHQbjj)mm ==?& $ 
 ||335((%ad+Kqvv%`C

166;3C3C DEEIIk''(!vvq166q6>'9a&?@"$&&!k/a)?"@*-AAAAr   c                     t         j                  |       xs, t         j                  |       xs t         j                  |       }t	        | t
              r|st        dj                  |             y y )NzInvalid loss_function='{}': for regressor use RMSE, MultiRMSE, SurvivalAft, MAE, Quantile, LogLinQuantile, Poisson, MAPE, Lq or custom objective object)r  r5  r8  r>  rg   r   rD   r   )rc  is_regressions     r   r^  z+CatBoostRegressor._check_is_compatible_lossx  s     99-H  ~HLrLr  tA  MB  ~  FN  Fo  Fo  p}  F~mS)- !L MS  MS  Ta  Mbc c 3@)r   c                     t        | j                        }t        |       |j                  d      }|r9t	        |t
              r)|j                  d      s|j                  d      ry|dk(  ryy)Nrc  PoissonTweedier  r  r  )r   r  r  r   rg   r   
startswith)r3   r   rc  s      r   r  z.CatBoostRegressor._get_default_prediction_type  sa    $++,&!

?3Zs;''	2m6N6Ny6Y! 55,r   )pNNNNNNr{  NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNr~  )Nr   r   rJ   Nr  r  r(   )r   r   r   r   r  r4   r  r  r  ro  rb   r^  r  r&  r'  s   @r   r  r    s   $ "O  '+# #'#!!!   #'# $&* *.!%"%)!"$(%)" cy8v dh>BlpVZgk$(ruh.y`$IL*BX c c
r   r  c                   F    e Zd ZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d	 fd	Z	 	 	 	 	 	 	 d
dZddZddZddZ	e
d        Z xZS )CatBoostRankera  
    Implementation of the scikit-learn API for CatBoost ranking.
    Parameters
    ----------
    Like in CatBoostClassifier, except loss_function, classes_count, class_names and class_weights
    loss_function : string, [default='YetiRank']
        'YetiRank'
        'YetiRankPairwise'
        'StochasticFilter'
        'StochasticRank'
        'QueryCrossEntropy'
        'QueryRMSE'
        'GroupQuantile'
        'QuerySoftMax'
        'PairLogit'
        'PairLogitPairwise'
    ry  cq                     i }qg d}rt        t               j                               D ]  \  }s}t|srvsttqs<    t        t        |   q       y r&  )r   r   r  rW  r  r4   r  sv                                                                                                                        r   r4   zCatBoostRanker.__init__  sR    f B
#FHMMO4JC*$):#s 5 	nd,V4r   c                     t        | j                        } t        |        d| v rt        j	                  | d           | j
                  g ||||||||	||
|||||||||||||||||||||  | S )aC  
        Fit the CatBoostRanker model.
        Parameters
        ----------
        X : catboost.Pool or list or numpy.ndarray or pandas.DataFrame or pandas.Series
            If not catboost.Pool, 2 dimensional Feature matrix or string - file with dataset.
        y : list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None)
            Labels of the training data.
            If not None, can be a single-dimensional array with numerical values.
            Use only if X is not catboost.Pool and does not point to a file.
        group_id : numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None)
            Ranking groups, 1 dimensional array like.
            Use only if X is not catboost.Pool.
        cat_features : list or numpy.ndarray, optional (default=None)
            If not None, giving the list of Categ columns indices.
            Use only if X is not catboost.Pool.
        text_features : list or numpy.ndarray, optional (default=None)
            If not None, giving the list of Text columns indices.
            Use only if X is not catboost.Pool.
        embedding_features : list or numpy.ndarray, optional (default=None)
            If not None, giving the list of Embedding columns indices.
            Use only if X is not catboost.Pool.
        pairs : list or numpy.ndarray or pandas.DataFrame, optional (default=None)
            The pairs description in the form of a two-dimensional matrix of shape N by 2:
            N is the number of pairs.
            The first element of the pair is the zero-based index of the winner object from the input dataset for pairwise comparison.
            The second element of the pair is the zero-based index of the loser object from the input dataset for pairwise comparison.
        graph : list or numpy.ndarray or pandas.DataFrame
            The graph edges list description.
            If list or numpy.ndarrays or pandas.DataFrame, giving 2 dimensional.
        sample_weight : list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None)
            Instance weights, 1 dimensional array like.
        group_weight : list or numpy.ndarray (default=None)
            The weights of all objects within the defined groups from the input data in the form of one-dimensional array-like data.
            Used for calculating the final values of trees. By default, it is set to 1 for all objects in all groups.
            Only a weight or group_weight parameter can be used at a time
        subgroup_id : list or numpy.ndarray (default=None)
            Subgroup identifiers for all input objects. Supported identifier types are:
            int
            string types (string or unicode for Python 2 and bytes or string for Python 3).
        pairs_weight : list or numpy.ndarray (default=None)
            The weight of each input pair of objects in the form of one-dimensional array-like pairs.
            The number of given values must match the number of specified pairs.
            This information is used for calculation and optimization of Pairwise metrics .
            By default, it is set to 1 for all pairs.
        baseline : list or numpy.ndarray, optional (default=None)
            If not None, giving 2 dimensional array like data.
            Use only if X is not catboost.Pool.
        use_best_model : bool, optional (default=None)
            Flag to use best model
        eval_set : catboost.Pool or list of catboost.Pool or tuple (X, y) or list [(X, y)], optional (default=None)
            Validation dataset or datasets for metrics calculation and possibly early stopping.
        verbose : bool or int
            If verbose is bool, then if set to True, logging_level is set to Verbose,
            if set to False, logging_level is set to Silent.
            If verbose is int, it determines the frequency of writing metrics to output and
            logging_level is set to Verbose.
        logging_level : string, optional (default=None)
            Possible values:
                - 'Silent'
                - 'Verbose'
                - 'Info'
                - 'Debug'
        plot : bool, optional (default=False)
            If True, draw train and eval error in Jupyter notebook
        plot_file : file-like or str, optional (default=None)
            If not None, save train and eval error graphs to file
        verbose_eval : bool or int
            Synonym for verbose. Only one of these parameters should be set.
        metric_period : int
            Frequency of evaluating metrics.
        silent : bool
            If silent is True, logging_level is set to Silent.
            If silent is False, logging_level is set to Verbose.
        early_stopping_rounds : int
            Activates Iter overfitting detector with od_wait set to early_stopping_rounds.
        save_snapshot : bool, [default=None]
            Enable progress snapshotting for restoring progress after crashes or interruptions
        snapshot_file : string or pathlib.Path, [default=None]
            Learn progress snapshot file path, if None will use default filename
        snapshot_interval: int, [default=600]
            Interval between saving snapshots (seconds)
        init_model : CatBoost class or string or pathlib.Path, [default=None]
            Continue training starting from the existing model.
            If this parameter is a string or pathlib.Path, load initial model from the path specified by this string.
        callbacks : list, optional (default=None)
            List of callback objects that are applied at end of each iteration.

        log_cout: output stream or callback for logging (default=None)
            If None is specified, sys.stdout is used

        log_cerr: error stream or callback for logging (default=None)
            If None is specified, sys.stderr is used

        Returns
        -------
        model : CatBoost
        rc  )r   r  r  r  r^  r  )!r3   r+  r   rd  rY  rZ  r[  r^  r_  r,  re  rf  rg  rh  r  r  r   r   rA  rB  r]  r   r   r   rY  r  r  r  r  r  rW   rX   r   s!                                    r   r  zCatBoostRanker.fit  s~   T $++,&!f$44VO5LM		 	M! 	MQ 	M 	Mm 	M5G 	M 	MPU 	M	M!)	M+7	M9D	MFR	MT\	M^l	M	M#	M%2	M48	M:C	MEW	MYe	Mgt	M 	M 0	M 2?	M AN	M Pa	M cm	M ox	M zB	M DL	M r   c           	      0    | j                  |d||||d      S )a  
        Predict with data.
        Parameters
        ----------
        X : catboost.Pool or list of features or list of lists or numpy.ndarray or pandas.DataFrame or pandas.Series
                or catboost.FeaturesData
            Data to apply model on.
            If data is a simple list (not list of lists) or a one-dimensional numpy.ndarray it is interpreted
            as a list of features for a single object.
        ntree_start: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) (zero-based indexing).
        ntree_end: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) (zero-based indexing).
            If value equals to 0 this parameter is ignored and ntree_end equal to tree_count_.
        thread_count : int (default=-1)
            The number of threads to use when applying the model.
            Allows you to optimize the speed of execution. This parameter doesn't affect results.
            If -1, then the number of threads is set to the number of CPU cores.
        verbose : bool
            If True, writes the evaluation metric measured set to stderr.
        Returns
        -------
        prediction :
            If data is for a single object, the return value is single float formula return value
            otherwise one-dimensional numpy.ndarray of formula return values for each object.
        r  r  r  )r3   r+  r  r  rk  r   s         r   r  zCatBoostRanker.predict  s!    6 }}QiW^`ijjr   c           
      2    | j                  |d|||||d      S )a  
        Predict target at each stage for data.
        Parameters
        ----------
        X : catboost.Pool or list of features or list of lists or numpy.ndarray or pandas.DataFrame or pandas.Series
                or catboost.FeaturesData
            Data to apply model on.
            If data is a simple list (not list of lists) or a one-dimensional numpy.ndarray it is interpreted
            as a list of features for a single object.
        ntree_start: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) with the step eval_period (zero-based indexing).
        ntree_end: int, optional (default=0)
            Model is applied on the interval [ntree_start, ntree_end) with the step eval_period (zero-based indexing).
            If value equals to 0 this parameter is ignored and ntree_end equal to tree_count_.
        eval_period: int, optional (default=1)
            Model is applied on the interval [ntree_start, ntree_end) with the step eval_period (zero-based indexing).
        thread_count : int (default=-1)
            The number of threads to use when applying the model.
            Allows you to optimize the speed of execution. This parameter doesn't affect results.
            If -1, then the number of threads is set to the number of CPU cores.
        verbose : bool
            If True, writes the evaluation metric measured set to stderr.
        Returns
        -------
        prediction : generator for each iteration that generates:
            If data is for a single object, the return value is single float formula return value
            otherwise one-dimensional numpy.ndarray of formula return values for each object.
        r  r  r  )r3   r+  r  r  r  rk  r   s          r   r  zCatBoostRanker.staged_predict  s1    : ##AYP[]ikr  uE  F  	Fr   c	                 :   d }	t        |t              r:|t        d      |j                         }|t        d      |j	                         }|t        d      |t        d      | j                  |      }
t        |g|
g |	|||gg d      d||dd|	      d   S )	a  
        Calculate NDCG@top
        Parameters
        ----------
        X : catboost.Pool or list or numpy.ndarray or pandas.DataFrame or pandas.Series
            Data to apply model on.
        y : list or numpy.ndarrays or pandas.DataFrame or pandas.Series
            True labels.
        group_id : list or numpy.ndarray or pandas.DataFrame or pandas.Series
            Ranking groups. If X is a Pool, group_id must be defined into X
        top : unsigned integer, up to `pow(2, 32) / 2 - 1`
            NDCG, Number of top-ranked objects to calculate NDCG
        type : str
            NDCG, Metric_type: 'Base' or 'Exp'
        denominator : str
            NDCG, Denominator type: 'LogPosition' or 'Position'
        group_weight : list or numpy.ndarray or pandas.DataFrame or pandas.Series
            Group weights.
        thread_count : int, optional (default=-1)
            Number of threads to work with.
        Returns
        -------
        NDCG@top : float
                   higher is better
        c                     t        j                  t        j                  | d             ryddj                  t	        | |      D cg c]  \  }}|	dj                  ||       c}}      z   S c c}}w )NNDCGzNDCG:;z{}={})r   r  equalr/  ro  r   )r  namesrH  ns       r   get_ndcg_metric_namez2CatBoostRanker.score.<locals>.get_ndcg_metric_name  sa    vvbhhvt,-SXXFTYHZ&lHZ1^_^kw~~a';HZ&lmmm&ls   
A/A/Nrl  zjWrong initializing group_id: X is catboost.Pool object, group_id must be initialized inside catboost.Pool.zy must be initialized.zPgroup_id must be initialized. If groups are not expected, pass an array of zeros)r  r   denominatorr   )rg   rD  rD   r|  get_group_id_hashr  _eval_metric_util)r3   r+  r   rd  r  r   r  re  rk  r  r  s              r   ro  zCatBoostRanker.score  s    4	n
 a}#  %C  D  DA##  %Q  R  R**,H9 899 rssll1o !{m5I3PTVaJb  eC  6D  FJ  LT  Vb  dh  jn  p|  }  ~  @  	@r   c                     t         j                  |       }t         j                  |       }|r)t        j                  dj                  |       t               |s|st        dj                  |             y y )NzHRegression loss ('{}') ignores an important ranking parameter 'group_id'zInvalid loss_function='{}': for ranker use YetiRank, YetiRankPairwise, StochasticFilter, StochasticRank, QueryCrossEntropy, QueryRMSE, GroupQuantile, QuerySoftMax, PairLogit, PairLogitPairwise. It's also possible to use a regression loss)r  rA  r5  r  r  r   RuntimeWarningrD   )rc  
is_rankingr  s      r   r^  z(CatBoostRanker._check_is_compatible_loss  sx    33MB
 99-HMMdkklyz  }K  Lm !N OUfUbNce e ,
r   )pNNNNNNrz  NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNr  )r   r   rJ   Nr  )NNNNNNrJ   )r   r   r   r   r  r4   r  r  r  ro  rb   r^  r&  r'  s   @r   r  r    s   $ O   '+# #'#!!!   #'# $&* *.!%"%)!"$(%)" cy5v NR^bOSlpVZgk$(sjk:F>-@^ 
e 
er   r  c                 ~   |t        d      || |} nt        d      |||}nt        d      |t        |      }|j                  d|i       |-|j                  ddi       d|v r|d= |j                  d|i       ||t        d	      |}t        |      }|j	                  | |||	|
||||||||||
       |S )ai  
    Train CatBoost model.

    Parameters
    ----------
    params : dict
        Parameters for CatBoost.
        If  None, all params are set to their defaults.
        If  dict, overriding parameters present in the dict.

    pool : catboost.Pool or tuple (X, y)
        Data to train on.

    iterations : int
        Number of boosting iterations. Can be set in params dict.

    evals : catboost.Pool or list of catboost.Pool or tuple (X, y) or list [(X, y)]
        Synonym for eval_set. Only one of these parameters should be set.

    dtrain : catboost.Pool or tuple (X, y)
        Synonym for pool parameter. Only one of these parameters should be set.

    logging_level : string, optional (default=None)
        Possible values:
            - 'Silent'
            - 'Verbose'
            - 'Info'
            - 'Debug'

    metric_period : int
        Frequency of evaluating metrics.

    verbose : bool or int
        If verbose is bool, then if set to True, logging_level is set to Verbose,
        if set to False, logging_level is set to Silent.
        If verbose is int, it determines the frequency of writing metrics to output and
        logging_level is set to Verbose.

    verbose_eval : bool or int
        Synonym for verbose. Only one of these parameters should be set.

    iterations : int
        Number of boosting iterations. Can be set in params dict.

    num_boost_round : int
        Synonym for iterations. Only one of these parameters should be set.

    eval_set : catboost.Pool or list of catboost.Pool or tuple (X, y) or list [(X, y)]
        Validation dataset or datasets for metrics calculation and possibly early stopping.

    plot : bool, optional (default=False)
        If True, draw train and eval error in Jupyter notebook
`
    plot_file : file-like or str, optional (default=None)
        If not None, save train and eval error graphs to file

    early_stopping_rounds : int
        Activates Iter overfitting detector with od_wait set to early_stopping_rounds.

    save_snapshot : bool, [default=None]
        Enable progress snapshotting for restoring progress after crashes or interruptions

    snapshot_file : string or pathlib.Path, [default=None]
        Learn progress snapshot file path, if None will use default filename

    snapshot_interval: int, [default=600]
        Interval between saving snapshots (seconds)

    init_model : CatBoost class or string or pathlib.Path, [default=None]
        Continue training starting from the existing model.
        If this parameter is a string or pathlib.Path, load initial model from the path specified by this string.

    log_cout: output stream or callback for logging (default=None)
        If None is specified, sys.stdout is used

    log_cerr: error stream or callback for logging (default=None)
        If None is specified, sys.stderr is used

    Returns
    -------
    model : CatBoost class
    params should be set.9Only one of the parameters pool and dtrain should be set.HOnly one of the parameters iterations and num_boost_round should be set.rT  r  r  r  rX  z9Only one of the parameters evals, eval_set should be set.)r+  r  r   rA  rB  r   r   r   rY  r  r  r  r  rW   rX   )rD   r   r   r  r  )r  r   dtrainr   r   rT  rV  evalsr  rA  rB  r   r   rY  r  r  r  r  rW   rX   r  s                        r   trainr  
  s#   n ~344<D [\\"(J jkk&!*
 	 (v
 	 y!,
 	  [\\VE	IIx}4[dnu'}$9)=N[e(	  4
 Lr   c                 |    g }| D ]4  }t               }||_        |j                          |j                  |       6 |S )z6
    Convert _Catboost instances to Catboost ones
    )r  r  r  rV   )modelsoutput_modelsr  cb_models       r   _convert_to_catboostr    sD     M: ..0X&	 
 r   c                    |dkD  sJ |dkD  sJ |dkD  sJ |dkD  sJ |d}t         j                  j                  |      }|j                  dd|      }|j                  dd|      }t	        |       }||z  dz  |z  }g }t        j                         }|j                  }t        |      D ]  }|j                  ||      }t        ||   |	|dd|d	dd	d
|z  |ddd      }|j                  | ||||d	       |j                  |d|        t        |dd      5 }t        j                  |      }ddd       d   D ]s  }t!        t#        |d   |d               D ]S  \  } \  }!}"|j                  t        j$                  |t        j$                  t'        d
|"            z              |d   | <   U u t        |d      5 }t        j(                  ||       ddd       |j+                  |d       |j-                         \  }#}$|j/                  |#|z  t        j$                  |	      z  |$|z  t        j$                  |	      z         ||j1                  |       z
  |j                  ||      z   }%t        ||   ||
|dd||dd	||ddd      }&|&j                  | |%||||d	       |j3                  t5        ||&gd
d
g              |S # 1 sw Y   xY w# 1 sw Y   xY w)a  
    Implementation of Gaussian process sampling (Kernel Gradient Boosting/Algorithm 4) from "Gradient Boosting Performs Gaussian Process Inference" https://arxiv.org/abs/2206.05608
    Produces samples from posterior GP with prior assumption f ~ GP(0, sigma ** 2 K + delta ** 2 I)

    Parameters
    ----------
    X : list or numpy.ndarray or pandas.DataFrame or pandas.Series or catboost.FeaturesData
        If catboost.FeaturesData it must be 2 dimensional Feature matrix
        Must be non-empty (contain > 0 objects)
    y : list or numpy.ndarray or pandas.DataFrame or pandas.Series
        Labels of the training data.
        Must be a single-dimensional array with numerical values.
    eval_set : catboost.Pool or list of catboost.Pool or tuple (X, y) or list [(X, y)], optional (default=None)
        Validation dataset or datasets for metrics calculation and possibly early stopping in posterior training.
    cat_features : list or numpy.ndarray, optional (default=None)
        If not None, giving the list of Categ columns indices.
        Use only if X is not catboost.FeaturesData
    text_features : list or numpy.ndarray, optional (default=None)
        If not none, giving the list of Text columns indices.
        Use only if X is not catboost.FeaturesData
    embedding_features : list or numpy.ndarray, optional (default=None)
        If not none, giving the list of Embedding columns indices.
        Use only if X is not catboost.FeaturesData
    random_seed : int, [default=None]
        Random number seed.
        If None, 0 is used.
        range: [0,+inf)
    samples : int, [default=10]
        Number of Monte-Carlo samples from GP posterior. Controls how many models this function will return.
        range: [1,+inf)
    posterior_iterations : int, [default=900]
        Max count of trees for posterior sampling step.
        range: [1,+inf)
    prior_iterations : int, [default=100]
        Max count of trees for prior sampling step.
        range: [1,+inf]
    learning_rate : float, [default=0.1]
        Step size shrinkage used in update to prevent overfitting.
        range: (0,1]
    depth : int, [default=6]
        Depth of the trees in the models.
        range: [1,16]
    sigma : float, [default=0.1]
        Scale of GP kernel (lower values lead to lower posterior variance)
        range: (0,+inf)
    delta : float, [default=0]
        Scale of homogenious noise of GP kernel (adjust if target is noisy)
        range: [0,+inf)
    random_strength : float, [default=0.1]
        Corresponds to parameter beta in the paper. Higher values lead to faster convergence to GP posterior.
        range: (0,+inf)
    random_score_type : string [default='Gumbel']
        Type of random noise added to scores.
        Possible values:
            - 'Gumbel' - Gumbel-distributed (as in paper)
            - 'NormalWithModelSizeDecrease' - Normally-distributed with deviation decreasing with model iteration count (default in CatBoost)
    eps : float, [default=1e-4]
        Technical parameter that controls the precision of the prior estimation.
        range: (0, 1]
    verbose : bool or int
        Verbosity of posterior model training output
        If verbose is bool, then if set to True, logging_level is set to Verbose,
        if set to False, logging_level is set to Silent.
        If verbose is int, it determines the frequency of writing metrics to output and
        logging_level is set to Verbose.

    Returns
    -------
    models : list of trained CatBoostRegressor models (size = samples parameter value)
    r   Nl    )lowhighr   r   )r  r   r{  r  Fr   L2Plain)r   rT  rK  rc  rD  rM  r   rO  rW  r5  r6  rR  rN  r>  )rY  rZ  r[  r  r  )r   r  r  r  )encodingoblivious_treesr  leaf_weights)r  w)r   )r   rT  rK  rR  rc  rD  rM  r   rO  rW  r5  r6  rR  rN  r>  )r  rY  rZ  r[  r  )r%  )r   randomdefault_rngintegersr_   r
  NamedTemporaryFiler   r  normalr  r  r?  openr  loadri   ro  sqrtrR  dumpr  r  r  r  rV   
sum_models)'r+  r   r  rY  rZ  r[  r   samplesposterior_iterationsprior_iterationsrK  rM  sigmadeltar5  r6  epsr   random_generatorprior_seedsposterior_seedsNrR  r  tmp_fileprior_model_tmp_filesampleprior_yprior
prior_file
prior_jsontreeindrq  rc  r  r  posterior_y	posteriors'                                          r   sample_gaussian_processr    sH   T AIIaKKa !GGyy,,[9"++g+NK&//AG'/ROAA(50Q6:M**,H#==."))!)<!#F+' )-$cE/!
  			%'1  	 	
 	-f1E&g>*:.J ?01D&/D4GnI]0^&_"]c6+;+B+BQRUWU\U\]`abdj]kUlQlIm+B+n]#C( '` 2 &,
IIj*- --f=..0t  9I1J!JTTY\\^\c\cdt\uMuv%--**-=-D-D5WX-D-YY%'/+'/ )-$+/!
	" 	%'1  	 	
 	Z	(:QFKLI !L U ?>
 -,s   K
?K
K	K!	c                    |t        d      t        |      }t        |       t        |       t	        ||||      \  }}}d|vrt        d      t        d ||fD              r|t        d      ||dk(  rd}
d}||j                  d	|i       ||j                  d
|i       ||j                  d|i       |-|j                  ddi       d|v r|d= |j                  d|i       || |} nt        d      |||}nt        d      ||j                  d|i       |	|	}|||d<   |||d<   |||d<   |S|Q|d}nZt        |d      r|j                         }n=t        |d      rt        |      }n%t        |      }t        |      }n||}n	|||k(  sJ ||dk(  rd}n1|/|j                  dd      }t        |t              xr t        |      }d|v ryt        |d   | j!                               }t#        | j%                               t#        |      k7  r4t        dt'        |      z   dz   t'        | j%                               z         |d= d|v ryt        |d   | j!                               }t#        | j)                               t#        |      k7  r4t        dt'        |      z   dz   t'        | j)                               z         |d= d|v ryt        |d   | j!                               } t#        | j+                               t#        |       k7  r4t        dt'        |       z   dz   t'        | j+                               z         |d= t-        |      }!t/        |!       g }"t1        |      D ]@  }#|"j3                  t4        j6                  j9                  |!d j;                  |#                   B |"D ]  }$t/        |$        t=        ||      5  t?        ||d!|""      5  |s(tA        || ||||
||||||      cddd       cddd       S tA        || ||||
||||||      \  }%}&tC        |&      }'|%|'fcddd       cddd       S # 1 sw Y   nxY wddd       y# 1 sw Y   yxY w)#a2  
    Cross-validate the CatBoost model.

    Parameters
    ----------
    pool : catboost.Pool
        Data to cross-validate on.

    params : dict
        Parameters for CatBoost.
        CatBoost has many of parameters, all have default values.
        If  None, all params still defaults.
        If  dict, overriding some (or all) params.

    dtrain : catboost.Pool or tuple (X, y)
        Synonym for pool parameter. Only one of these parameters should be set.

    iterations : int
        Number of boosting iterations. Can be set in params dict.

    num_boost_round : int
        Synonym for iterations. Only one of these parameters should be set.

    fold_count : int, optional (default=3)
        The number of folds to split the dataset into.

    nfold : int
        Synonym for fold_count.

    type : string, optional (default='Classical')
        Type of cross-validation
        Possible values:
            - 'Classical'
            - 'Inverted'
            - 'TimeSeries'

    inverted : bool, optional (default=False)
        Train on the test fold and evaluate the model on the training folds.

    partition_random_seed : int, optional (default=0)
        Use this as the seed value for random permutation of the data.
        Permutation is performed before splitting the data for cross validation.
        Each seed generates unique data splits.

    seed : int, optional
        Synonym for partition_random_seed. This parameter is deprecated. Use
        partition_random_seed instead.
        If both parameters are initialised partition_random_seed parameter is
        ignored.

    shuffle : bool, optional (default=True)
        Shuffle the dataset objects before splitting into folds.

    logging_level : string, optional (default=None)
        Possible values:
            - 'Silent'
            - 'Verbose'
            - 'Info'
            - 'Debug'

    stratified : bool, optional (default=None)
        Perform stratified sampling. True for classification and False otherwise.

    as_pandas : bool, optional (default=True)
        Return pd.DataFrame when pandas is installed.
        If False or pandas is not installed, return dict.

    metric_period : int, [default=1]
        The frequency of iterations to print the information to stdout. The value should be a positive integer.

    verbose : bool or int
        If verbose is bool, then if set to True, logging_level is set to Verbose,
        if set to False, logging_level is set to Silent.
        If verbose is int, it determines the frequency of writing metrics to output and
        logging_level is set to Verbose.

    verbose_eval : bool or int
        Synonym for verbose. Only one of these parameters should be set.

    plot : bool, optional (default=False)
        If True, draw train and eval error in Jupyter notebook

    plot_file : file-like or str, optional (default=None)
        If not None, save train and eval error graphs to file

    early_stopping_rounds : int
        Activates Iter overfitting detector with od_wait set to early_stopping_rounds.

    save_snapshot : bool, [default=None]
        Enable progress snapshotting for restoring progress after crashes or interruptions

    snapshot_file : string or pathlib.Path, [default=None]
        Learn progress snapshot file path, if None will use default filename

    snapshot_interval: int, [default=600]
        Interval between saving snapshots (seconds)

    metric_update_interval: float, [default=0.5]
        Interval between updating metrics (seconds)

    folds: generator or iterator of (train_idx, test_idx) tuples, scikit-learn splitter object or None, optional (default=None)
        If generator or iterator, it should yield the train and test indices for each fold.
        If object, it should be one of the scikit-learn splitter classes
        (https://scikit-learn.org/stable/modules/classes.html#splitter-classes)
        and have ``split`` method.
        if folds is not None, then all of fold_count, shuffle, partition_random_seed, inverted are None

    return_models: bool, optional (default=False)
        if True, return a list of models fitted for each CV fold

    log_cout: output stream or callback for logging (default=None)
        If None is specified, sys.stdout is used

    log_cerr: error stream or callback for logging (default=None)
        If None is specified, sys.stderr is used

    Returns
    -------
    cv results : pandas.core.frame.DataFrame with cross-validation results
        columns are: test-error-mean  test-error-std  train-error-mean  train-error-std
    cv models : list of trained models, if return_models=True
    Nr  rc  z@Parameter loss_function should be specified for cross-validationc              3   $   K   | ]  }|d u 
 y wr(   r   rG  s     r   r   zcv.<locals>.<genexpr>  s     
6"5Q1D="5rI  z_if folds is not None, then all of fold_count, shuffle, partition_random_seed, inverted are None
TimeSeriesFr   r   r   r  r  r  rX  r  r  rT  r  r  r  r   get_n_splits__len__rY  zGcategorical features indices in params are different from ones in pool z vs rZ  z@text features indices in params are different from ones in pool r[  zEembedding features indices in params are different from ones in pool zfold-{}zCross-validation plotr  )"rD   r   r  rr  r   r  r   rC   r  r_   rh   r   rg   rt   r  r   r  rr   r  r   r  r  r7  r-   r  rV   r)   r*   r/  r   re   rD  _cvr  )(r  r   r  rT  rV  r  nfoldinvertedr  seedr  r   r  	as_pandasr   r   r   rA  rB  rY  r  r  r  metric_update_intervalfoldsr   return_modelsrW   rX   rc  cat_feature_indices_from_params text_feature_indices_from_params%embedding_feature_indices_from_paramsr1  r  r  r  r  	cv_modelsoutput_cv_modelss(                                           r   r  r  @  s\   @ ~344fFff%,<w|-=)M7M f$^__

6:u"5
665;Lm
 	
 DL0w
 	  ]
 	  ]
 	 (v
 	 y!,
 	 <D [\\"(J jkk*
 	  $ "/ "/$&7"#}+=Jun-"//1
	* Z
U Z
		
} 333DL0
		

?D9|<jA[\iAj
*?~@VX\XnXnXp*q't++-.#6U2VV i"%&E"F!G &!'),T-I-I-K)L!M N N >"& +@AXZ^ZpZpZr+s(t,,./37W3XX b"%&F"G!H &!'),T-J-J-L)M!N O O ?#v%0EfMaFbdhdzdzd|0}-t1134<a8bb g"%&K"L!M &!'),T-O-O-Q)R!S T T '(v&II&Ij!i1A1A$1GHI ")  
8X	&TY[r  @I  )J%& )J  )J	&	&" "%%&"GY  4I>,,A )J  )J	&	&  )J  )J	&	&	&s0   Q%Q
=	Q'Q
7	Q
Q	QQ(c                        e Zd Z fdZ xZS )r  c           	          t         t        |   |       |t        j                         }d}nd}t        |t              st        |t              r|g}t        |      }| j                  |||||||       y )NTF)
rW  r  r4   r
  r  rg   rt   r#   rp  _create_calcer)
r3   catboostrt  r  r  r  rk  r  delete_temp_dir_flagrm  s
            r   r4   zBatchMetricCalcer.__init__g  sr    /9?&&(G#' #( g|,
7M0RiG09G[)[,X_auvr   )r   r   r   r4   r&  r'  s   @r   r  r  e  s    w wr   r  c                 @    t               }|j                  | ||       |S r(   )r  r#  )r  r%  r&  rA  s       r   r  r  u  s!    ZF
vw(89Mr   c                 J    | j                  dddd|rdndddd	|d
dd      S )NzPrediction and targetr  r   r^  Objects per binz% pool objects in binr   r  )r   r   positionzrgba(0,0,0,0)gQ?)bgcolorr   )r   r   r   legend)r`  )r/  r   single_pools      r   _calc_feature_statistics_layoutr  {  sL    99,
 +6&;R

 &
   r   c                    	 dd l m} t        |       }g }| d   }d|j                         v r t        |d         dk(  r?|j                  j                  ddg      }|j                  g t        |||dk(              S t        j                  t        |d               }d	}	|j                  j                  dd
t        t        t        |d         dz               dj!                  |d   d         gt#        |d   d d |d   dd        D 
cg c]  \  }
}dj!                  |
|       c}}
z   dj!                  |d   d         gz   d      }nd|j                         v ret        j$                  |d         d d d   }d}	|j                  j                  dd
t        t        t        |d                     |d   |   d      }nt'        d      t)        |       D ]  \  }}|dk(  rd}ndj!                  ||         }|j+                  |d   |   dd|z   dd      }|j+                  |d   |   dddid |z   dd!      }t        |d"         dk7  r"|j+                  |d"   |   ddd#id$|z   dd!      }|dkD  r|d   j-                         }t        j.                  g d%      }t        j.                  g d&      }||z  ||dz
  |z
  z  z   t1        |dz
        z  }|j3                  t4              }|j7                  |d   |   t1        |      z  |	|z  d'j!                  |      |z   d(dd) d*j                   | i+      }n"|j7                  |d   |   |	d,|z   d(dd)d-i+      }|j+                  |d.   |   ddd/id0|z   dd!      }t        |d"         dk7  r|||||gz  }|||||gz  } t        |||dk(        }|j                  ||      }|S # t        $ r.}t        j                  d       t        t        |            d }~ww xY wc c}}
w )1Nr   z?To draw binarized feature statistics you should install plotly.rT  re  rf  r   rh  objects_per_binr"  r  ri  rJ   rj  rk  Frl  r  g?z
Cat valuesTzHExpected field "borders" or "cat_values" in binarized feature statisticsr   z	, {} poolmean_targetr   zMean targety1r   )r   r   r   r   r   mean_predictiondashz1Mean prediction on each segment of feature values)r   r   r   r   r   r   mean_weighted_targetdotzMean weighted target)      r  )r  r  r  z % pool objects in bin (total {})r   r   zrgba({}, {}, {}, 0.4))r   widthr   r   r   r   r  zrgba(30, 150, 30, 0.4)predictions_on_varying_featuredashdotz(Mean prediction with substituted feature)r  r  r  r  r  r   r_   rU  r   ro  r   r  r   arangerh   r  r   ro  argsortrD   ri   r#  r   r  rp   astypern   Bar)statistics_listr  r/  r0  pools_countrX  
statisticsr   order	bar_widthrv  rw  r  name_suffixtrace_1trace_2trace_3objects_in_poolcolor_acolor_br   trace_4trace_5r   r3  s                            r   '_build_binarized_feature_statistics_figr    s   "&
 o&KD #JJOO%%z)$%*IIOO&A3O?E99"-LRQVXcghXh-i9jj		#j):;<=			%Jy$9 :Q >?@&--j.CA.FGH*-j.CCR.H*U^J_`a`bJc*df*d,% )//u=*dff '--j.CB.GHIJ !   	
 
*	*

:&789$B$?			%J|$< =>?-e4   
 fgg"?3:!K%,,Z];K**'. ,  
 ***+E2 6"D{R  
 
123q8jj34U;$u%+k9 ! G ?():;??AOhh}-Ghh}-Gq[7kAo.A#BBeKZ[OF\\ELL%Eff./69OO+-7>>OR]];4;;UC  	G ff./6&45  	G **9:5A 9%;kI  
 
123q8WgwAADWgw88DM 4P -Rq8HIF
))f)
-CJa  "WX#a&!!"&fs   O P	P)O>>Pc                    d| d   j                         v rt        | d   d         |kD  rg }t        dt        | d   d         |      D ]s  }||z   }g d}t        | D 	
cg c](  \  }}	|t        |D 
cg c]  }
|
|	|
   || f c}
      f* c}
}	}      }t	        ||      }dj                  |||      }|||fgz  }u |S t	        | |      }||fgS c c}
w c c}
}	}w )Nr  r   )r  r  r  r   r  r  z{}_parts[{}:{}])rU  r_   r  r'  r  r   )r  r  r  r  rs  beginendstatistics_keyskstatsrz   sub_statisticsr3  feature_name_with_part_suffixs                 r   _plot_feature_statistics_unitsr$    s0   z!}))++JqM,4O0PSk0k1c*Q-"=>@XYE22CTO!  {E  #F  {EnvnoqvAtWf,gWfPSc5:ec3J-KWf,g'h#i  {E  #F  GN9.*UC,=,D,D\SXZ],^)c89::D Z 5j*Ml#$$ -h  #Fs   C1C
CCc                 4   g }| D ]e  }||   }| |   }d|d   j                         v}d|d   j                         v r|D ]  }	t        |	d         dkD  sd} |rU|t        ||||      z  }g |d   d   }
g }|D ]^  \  }}|j                  t	        |dd|j
                  D cg c]  }|j                   c}id|j                  j                  ig             ` |
j                  t	        d	d
d
dddddd|      gt	        ddddddd      g       |
S c c}w )Nr  r   rT  Fr   r   r   r  downr   r  Tg      ?r  gq=
ףp?r  )	directionr  r  r   r  r   r  r  zStatistics for featurepaperg?)r   	showarrowr   xrefr   yrefalign)r  annotations)
rU  r_   r$  rV   r'  rX  r   r   r   r(  )r  r  r   r  figs_with_namesr  r  r  	need_skipr!  main_figr  r3  rX  s                 r   r  r    so   O,$[1*;7
 
1(:(:(<<	
1**,,#uY'(1, %I $ 9*jR^`xyy - q!!$HG,\"9TVV9:WcjjFVFV<WX	
 -  2&	
 .%7dH
  $ O+ :s    Dc           	      :   t        | |      r| S t        | t              st        d      |j                  |      }t	        | j
                        }t        |       d|v r|j                  |d          | j                  D ]  }t        ||t        | |              |S )a  
    Convert a CatBoost model to a sklearn-compatible model.

    Parameters
    ----------
    model : CatBoost model
        a model to convert from

    subclass : an sklearn-compatible class
        a class to convert to : CatBoostClassifier, CatBoostRegressor or CatBoostRanker

    Returns
    -------
    a converted model : `subclass` type
        a model converted from the initial CatBoost `model` to a sklearn-compatible `subclass` model
    z&model should be a subclass of CatBoostrc  )rg   r  rD   __new__r   r  r  r^  r  r  r  )r  subclassconverted_modelr   r  s        r   _to_subclassr5  C  s    " %"eX&DEE&&x0O e(()Ff& **6/+BCwud';< r   c                 "    t        | t              S r(   )r5  r  r  s    r   to_regressorr8  f  s    011r   c                 "    t        | t              S r(   )r5  r$  r7  s    r   to_classifierr:  j  s    122r   c                 "    t        | t              S r(   )r5  r  r7  s    r   	to_rankerr<  n  s    ~..r   c                       e Zd Zd Zd Zy)r  c                     || _         y r(   )
_callbacks)r3   r  s     r   r4   z_TrainCallbacksWrapper.__init__s  s	    #r   c                 L    | j                   D ]  }|j                  |      r y y)NFT)r?  after_iteration)r3   infocbs      r   rA  z&_TrainCallbacksWrapper.after_iterationv  s&    //B%%d+ " r   N)r   r   r   r4   rA  r   r   r   r  r  r  s    $r   r  ra   r!  )NNNNNr(   )NNNNNNNNNNNNNNNNNNNN)NNNNNr   i  d   皙?r$   rE  r   rE  Gumbelg-C6?F)NNNNNNNFr   NTNNTNNNFNNNNNg      ?N	ClassicalFNNr  )
contextlibr   r  r   loggingrQ   r)   version_infocollections.abcr   r   r   r	   collectionsr
   r   sixr   r   r   r  numpyr   ctypesplatformr
  r  r  enumr   operatorr   r;   systemCDLLr   pandasr   r   r  r  scipy.sparserR  plot_helpersr   r    r!   r   r"   rt  r#   r   r)  	_PoolBaser  _MetricCalcerBaser  rU   rT   _configure_mallocrD   r|   r0  r  r4  r7  r:  r=  r  r@  is_maximizable_metricis_minimizable_metric_PreprocessParamsr  _MetadataHashProxyr  rO  _have_equal_featuresr  MultiTargetCustomMetricMultiTargetCustomObjectiveMultiRegressionCustomMetricMultiRegressionCustomObjectiveru   r  	getLoggerr   logger_library_initintegerrm   rp   floatingro   rt   rh   r   rk   PathLikers   pathlibr&   r-   r/   r9   rd   re   rj   r~   r   r   r   r   r   r   r   r   r   r8  rB  rD  r-  r3  r7  r<  rD  rI  r`  r  rr  rp  r  r  r  r  r  r  r  r  r$  r  r  r  r  r  r  r  r  r  r  r$  r  r5  r8  r:  r<  r  r   r   r   <module>rk     s   %   
 	vKKGG 0 6 6          8??J(  T S  " 		// mm##''// ''$-$O$O !'CC &AA #;; (EE $== "+"K"K 33 // !77 !77 // 33 11 11 %% 55 33 #;; &AA '?? !*!E!E 			//  
		8	$   	   

+bkk"RZZF3v.J'JJ
& &8& 8v ,-  $ $4:n
30` &D 4$ 4 8@T l^0@K9 @KF.4" 
P 
P$OHAV#<a:F a:H@s4$$NI$8} I$8XHwJ wJt!W Wt{eX {e| ]aw{`d37@F
 ,0VZx{ }BcL NRRVX\Z^^aRVb-J	w) w (sl%"*Z F23/V ib  
  F  	s$   P 'P( P%$P%(QQ