Hyperparameter Tuning

GridSearchCV() performs an exhaustive search over specified parameter values for an estimator

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_labels)

grid_search.best_params_
# Out:{'max_features': 8, 'n_estimators': 30}

grid_search.best_estimator_
# Out:
# RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
#            max_features=8, max_leaf_nodes=None, min_impurity_decrease=0.0,
#            min_impurity_split=None, min_samples_leaf=1,
#            min_samples_split=2, min_weight_fraction_leaf=0.0,
#            n_estimators=30, n_jobs=1, oob_score=False, random_state=42,
#            verbose=0, warm_start=False)
  • estimator : estimator object. Each estimator must either provide a score function or have the scoring parameter set.
  • param_grid : dict or list of dictionaries. Keys are parameter names, values are lists of settings to try; or a list of such dictionaries.
  • scoring : string, callable, list/tuple, dict or None, default: None
  • cv : int, cross-validation generator or an iterable, optional. An integer is interpreted as the number of folds in a KFold.
  • refit : boolean, or string, default=True. Refit the estimator with the best found parameters on the whole dataset.
Methods
  • decision_function(X) : Call decision_function on the estimator with the best found parameters.
  • fit(X[, y, groups]) : Run fit with all sets of parameters.
  • get_params([deep]) : Get parameters for this estimator.
  • inverse_transform(Xt) : Call inverse_transform on the estimator with the best found params.
  • predict(X) : Call predict on the estimator with the best found parameters.
  • predict_log_proba(X) : Call predict_log_proba on the estimator with the best found parameters.
  • predict_proba(X) : Call predict_proba on the estimator with the best found parameters.
  • score(X[, y]) : Return the score on the given data, if the estimator has been refit.
  • set_params(**params) : Set the parameters of this estimator.
  • transform(X) : Call transform on the estimator with the best found parameters.
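Each tried combination and its score are exposed through cv_results_ after fitting; a minimal sketch reusing the grid_search object from above:

import numpy as np

# scoring='neg_mean_squared_error' returns negative MSE, so negate
# and take the square root to recover the RMSE of each combination
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)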

RandomizedSearchCV() samples a fixed number of parameter settings from specified distributions instead of trying every combination

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint

param_distribs = {
    'n_estimators': randint(low=1, high=200),
    'max_features': randint(low=1, high=8),
}

forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(housing_prepared, housing_labels)
  • estimator : estimator object. The estimator to tune.
  • param_distributions : dict. Keys are parameter names, values are either lists of settings to try or distributions to sample from; a distribution must provide an rvs method for sampling, such as those from scipy.stats.distributions.
  • n_iter : int, default=10. Number of parameter settings that are sampled.
  • scoring : string, callable, list/tuple, dict or None, default: None
  • cv : int, cross-validation generator or an iterable, optional
  • refit : boolean, or string, default=True
  • random_state : int, RandomState instance or None, optional, default=None
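After fitting, the best sampled combination can be read back just as with grid search; a small sketch (np is numpy):

import numpy as np

print(rnd_search.best_params_)
print(np.sqrt(-rnd_search.best_score_))  # RMSE of the best sampled model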

Models

Classification

RandomForestClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=50, n_jobs=-1)
scores = cross_val_score(rnd_clf, trainData, trainLabel,
                         scoring="recall", cv=10)
scores.mean()
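After fitting, a random forest also exposes per-feature importances, which are handy for feature selection; a sketch assuming trainData is a pandas DataFrame as above:

rnd_clf.fit(trainData, trainLabel)
# Importances sum to 1; larger values mean the feature was used more in splits
for name, score in zip(trainData.columns, rnd_clf.feature_importances_):
    print(name, score)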

ExtraTreesClassifier

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score
etr_clf = ExtraTreesClassifier(n_estimators=500, max_leaf_nodes=50, n_jobs=-1)
etr_score = cross_val_score(etr_clf, trainData, trainLabel, scoring='recall', cv=10)
etr_score.mean()

GradientBoostingClassifier

from sklearn.ensemble import GradientBoostingClassifier
gbdt = GradientBoostingClassifier(learning_rate=0.1, min_samples_leaf=2, max_depth=6, n_estimators=100)

XGBClassifier

from xgboost.sklearn import XGBClassifier
from sklearn import metrics
xgb1 = XGBClassifier(learning_rate=0.05,
                     n_estimators=1000,
                     max_depth=3,
                     min_child_weight=1,
                     gamma=0.1,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective='binary:logistic',
                     nthread=4,
                     reg_alpha=0.001,
                     reg_lambda=0.001,
                     scale_pos_weight=1,
                     seed=27)
xgb1.fit(subTrain, subLabel)
xgb1_pred = xgb1.predict(trainData)
xgb1_pred_prob = xgb1.predict_proba(trainData)[:, 1]
print(metrics.accuracy_score(trainLabel, xgb1_pred))
print(metrics.roc_auc_score(trainLabel, xgb1_pred_prob))
xgb1_f2 = calc_f2(trainLabel, xgb1_pred)
print(xgb1_f2)
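calc_f2 above is a user-defined helper not shown in this file; presumably it computes the F2 score. A minimal sketch using sklearn's fbeta_score (an assumption, not the original implementation):

from sklearn.metrics import fbeta_score

def calc_f2(y_true, y_pred):
    # F2: like F1, but weights recall twice as heavily as precision
    return fbeta_score(y_true, y_pred, beta=2)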

LightGBM

import lightgbm as lgb
# sklearn-style parameter names are used below; passing a setting both under
# its sklearn name and its native LightGBM alias (e.g. subsample and
# bagging_fraction) is ambiguous, so each one is given exactly once
lgb_clf = lgb.LGBMClassifier(learning_rate=0.1,
                             boosting_type='gbdt',
                             objective='binary',
                             n_estimators=1000,
                             metric='auc',
                             max_depth=3,
                             num_leaves=5,
                             subsample=0.7,          # alias: bagging_fraction
                             subsample_freq=6,       # alias: bagging_freq
                             colsample_bytree=0.7,   # alias: feature_fraction
                             min_child_samples=450,  # alias: min_data_in_leaf
                             reg_alpha=1,            # alias: lambda_l1
                             reg_lambda=0.001,       # alias: lambda_l2
                             min_split_gain=0.265,   # alias: min_gain_to_split
                             verbose=5,              # native LightGBM verbosity
                             is_unbalance=True)
lgb_clf.fit(subTrain, subLabel)
lgb_clf_pred = lgb_clf.predict(trainData)
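Because the data is imbalanced (is_unbalance=True), tuning the decision threshold on predicted probabilities can beat the default 0.5 cut-off; the 0.3 below is purely illustrative:

lgb_clf_prob = lgb_clf.predict_proba(trainData)[:, 1]   # probability of class 1
lgb_clf_pred_custom = (lgb_clf_prob > 0.3).astype(int)  # custom threshold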

Model Ensembling

BaggingClassifier

from sklearn.ensemble import BaggingClassifier
bag_rnd = BaggingClassifier(rnd_clf, n_estimators=10, max_samples=1000, bootstrap=True, n_jobs=-1)
bag_rnd.fit(subTrain, subLabel)
rnd_pred = bag_rnd.predict(trainData)
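With bootstrap=True, the out-of-bag samples give a validation estimate without a separate hold-out set; a sketch using BaggingClassifier's standard oob_score option:

bag_rnd_oob = BaggingClassifier(rnd_clf, n_estimators=10, max_samples=1000,
                                bootstrap=True, oob_score=True, n_jobs=-1)
bag_rnd_oob.fit(subTrain, subLabel)
print(bag_rnd_oob.oob_score_)  # accuracy estimated on out-of-bag samples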

VotingClassifier

import os
import pandas as pd
from sklearn.ensemble import VotingClassifier
from xgboost.sklearn import XGBClassifier
xgb = XGBClassifier(learning_rate=0.1,
                    n_estimators=1000,
                    max_depth=3,
                    min_child_weight=1,
                    gamma=0.1,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='binary:logistic',
                    nthread=4,
                    reg_alpha=0.001,
                    reg_lambda=0.001,
                    scale_pos_weight=1)

import lightgbm as lgb
lgb_clf = lgb.LGBMClassifier(learning_rate=0.1,
                             boosting_type='gbdt',
                             objective='binary',
                             n_estimators=1000,
                             metric='auc',
                             max_depth=3,
                             num_leaves=5,
                             subsample=0.7,
                             subsample_freq=6,
                             colsample_bytree=0.7,
                             min_child_samples=450,
                             reg_alpha=1,
                             reg_lambda=0.001,
                             min_split_gain=0.265,
                             verbose=5,
                             is_unbalance=True)

from sklearn.ensemble import GradientBoostingClassifier
gbdt = GradientBoostingClassifier(learning_rate=0.05, min_samples_split=320, min_samples_leaf=7, max_depth=7, 
                                 max_features='sqrt', subsample=0.7, random_state=10)

vot = VotingClassifier(estimators=[('xgb', xgb), ('lgb', lgb_clf)], voting='soft')
vot.fit(trainData, trainLabel)
vot_pred = vot.predict(testData)
vot_pred = pd.DataFrame(vot_pred)
submData['acc_now_delinq'] = vot_pred
submData.to_csv(os.path.join(homePath, 'submit.csv'), index=False)
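Note that voting='soft' averages the predicted class probabilities, so every base estimator must implement predict_proba; voting='hard' would instead take a majority vote over the predicted labels. The gbdt model defined above is not included in this particular ensemble.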

StackingClassifier

from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators=110, min_samples_split=90, min_samples_leaf=15,
                                 max_depth=10, oob_score=True, max_features=10)

from xgboost.sklearn import XGBClassifier
xgb = XGBClassifier(learning_rate=0.05,
                    n_estimators=1000,
                    max_depth=3,
                    min_child_weight=1,
                    gamma=0.1,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='binary:logistic',
                    nthread=4,
                    reg_alpha=0.001,
                    reg_lambda=0.001,
                    scale_pos_weight=1)

import lightgbm as lgb
lgb_clf = lgb.LGBMClassifier(learning_rate=0.1,
                             boosting_type='gbdt',
                             objective='binary',
                             n_estimators=1000,
                             metric='auc',
                             max_depth=3,
                             num_leaves=5,
                             subsample=0.7,
                             subsample_freq=6,
                             colsample_bytree=0.7,
                             min_child_samples=450,
                             reg_alpha=1,
                             reg_lambda=0.001,
                             min_split_gain=0.265,
                             verbose=5,
                             is_unbalance=True)

from sklearn.ensemble import GradientBoostingClassifier
gbdt = GradientBoostingClassifier(learning_rate=0.05, min_samples_split=320, min_samples_leaf=7, max_depth=7, 
                                 max_features='sqrt', subsample=0.7)


from mlxtend.classifier import StackingClassifier 
stack_clf = StackingClassifier(classifiers=[gbdt, lgb_clf, rnd_clf, xgb], meta_classifier=xgb)

stack_clf.fit(trainData, trainLabel)
stack_pred = stack_clf.predict(testData)
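By default, mlxtend's StackingClassifier feeds the base models' predicted class labels to the meta-classifier; stacking the predicted probabilities instead often gives the meta-model more signal, via the use_probas option:

stack_clf_proba = StackingClassifier(classifiers=[gbdt, lgb_clf, rnd_clf, xgb],
                                     meta_classifier=xgb, use_probas=True)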

Data Processing

Feature Scaling

MinMax scaling (normalization)

This method is more sensitive to outliers than standardization:
X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
X_scaled = X_std * (max - min) + min

>>> from sklearn.preprocessing import MinMaxScaler
>>>
>>> data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
>>> scaler = MinMaxScaler()
>>> print(scaler.fit(data))
MinMaxScaler(copy=True, feature_range=(0, 1))
>>> print(scaler.data_max_)
[  1.  18.]
>>> print(scaler.transform(data))
[[ 0.    0.  ]
 [ 0.25  0.25]
 [ 0.5   0.5 ]
 [ 1.    1.  ]]
>>> print(scaler.transform([[2, 2]]))
[[ 1.5  0. ]]
  • feature_range : tuple (min, max), default=(0, 1). The range of the transformed values.
  • copy : boolean, optional, default True. Whether to copy the data and scale the copy rather than scaling in place.

Zero-mean standardization

>>> from sklearn.preprocessing import StandardScaler
>>>
>>> data = [[0, 0], [0, 0], [1, 1], [1, 1]]
>>> scaler = StandardScaler()
>>> print(scaler.fit(data))
StandardScaler(copy=True, with_mean=True, with_std=True)
>>> print(scaler.mean_)
[ 0.5  0.5]
>>> print(scaler.transform(data))
[[-1. -1.]
 [-1. -1.]
 [ 1.  1.]
 [ 1.  1.]]
>>> print(scaler.transform([[2, 2]]))
[[ 3.  3.]]
  • copy : boolean, optional, default True. Whether to copy the data and transform the copy rather than transforming in place.
  • with_mean : boolean, True by default. If True, center the data before scaling. This does not work on sparse matrices.
  • with_std : boolean, True by default. If True, scale the data to unit variance (equivalently, unit standard deviation).

Handling Missing Data

Imputer() handles missing values

All attributes must be numeric.

from sklearn.preprocessing import Imputer
# Specify the value that replaces missing entries, here the median
imputer = Imputer(strategy="median")

# Fit the imputer instance to the data
imputer.fit(housing_num)

# The per-column fill values are stored in the statistics_ attribute
imputer.statistics_

# Replace the missing values
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                          index=list(housing.index.values))

# Preview
housing_tr.loc[sample_incomplete_rows.index.values]
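Note that Imputer was removed in scikit-learn 0.22; the same median strategy with its replacement, SimpleImputer, looks like this (a minimal sketch):

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")
housing_tr = pd.DataFrame(imputer.fit_transform(housing_num),
                          columns=housing_num.columns,
                          index=housing_num.index)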

Handling Text Data

pandas.factorize() encodes input values as an enumerated type or categorical variable

housing_cat = housing['ocean_proximity']
housing_cat.head(10)
# Output
# 17606     <1H OCEAN
# 18632     <1H OCEAN
# 14650    NEAR OCEAN
# 3230         INLAND
# 3555      <1H OCEAN
# 19480        INLAND
# 8879      <1H OCEAN
# 13685        INLAND
# 4937      <1H OCEAN
# 4861      <1H OCEAN
# Name: ocean_proximity, dtype: object

housing_cat_encoded, housing_categories = housing_cat.factorize()
housing_cat_encoded[:10]
# Output
# array([0, 0, 1, 2, 0, 2, 0, 2, 0, 0], dtype=int64)
Parameters
  • values : ndarray (1-d); the sequence to encode
  • sort : boolean, default False; sort uniques by value
  • na_sentinel : int, default -1; the value assigned to entries that are not found
  • size_hint : hint to the hashtable sizer
Returns
  • labels : the indexer to the original array
  • uniques : ndarray (1-d) or Index; an Index of unique values when the values passed are an Index or Series

OneHotEncoder encodes integer features as one-hot vectors

The return value is a SciPy sparse matrix.

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot

Note that fit_transform() expects a 2-D array, which is why the data is reshaped here.
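If a dense array is needed downstream, the sparse result converts explicitly:

housing_cat_1hot.toarray()  # dense NumPy array, one row per sample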

Example: processing text features

housing_cat = housing['ocean_proximity']
housing_cat.head(10)
# 17606     <1H OCEAN
# 18632     <1H OCEAN
# 14650    NEAR OCEAN
# 3230         INLAND
# 3555      <1H OCEAN
# 19480        INLAND
# 8879      <1H OCEAN
# 13685        INLAND
# 4937      <1H OCEAN
# 4861      <1H OCEAN
# Name: ocean_proximity, dtype: object

housing_cat_encoded, housing_categories = housing_cat.factorize()
housing_cat_encoded[:10]
# array([0, 0, 1, 2, 0, 2, 0, 2, 0, 0], dtype=int64)

housing_categories
# Index(['<1H OCEAN', 'NEAR OCEAN', 'INLAND', 'NEAR BAY', 'ISLAND'], dtype='object')

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
print(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot
# [[0]
#  [0]
#  [1]
#  ..., 
#  [2]
#  [0]
#  [3]]
# <16512x5 sparse matrix of type '<class 'numpy.float64'>'
# 	with 16512 stored elements in Compressed Sparse Row format>

LabelEncoder: label encoding

LabelEncoder is a utility class for normalizing labels: it maps them to integer codes in the range [0, n_classes-1]. In short, it assigns consecutive integer codes to non-consecutive numbers or to text.

>>> from sklearn import preprocessing
>>> le = preprocessing.LabelEncoder()
>>> le.fit([1, 2, 2, 6])
LabelEncoder()
>>> le.classes_
array([1, 2, 6])
>>> le.transform([1, 1, 2, 6])
array([0, 0, 1, 2])
>>> le.inverse_transform([0, 0, 1, 2])
array([1, 1, 2, 6])

It can also be used to convert non-numerical labels to numerical labels (as long as they are hashable and comparable):

>>> le = preprocessing.LabelEncoder()
>>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
LabelEncoder()
>>> list(le.classes_)
['amsterdam', 'paris', 'tokyo']
>>> le.transform(["tokyo", "tokyo", "paris"])
array([2, 2, 1])
>>> list(le.inverse_transform([2, 2, 1]))
['tokyo', 'tokyo', 'paris']

LabelBinarizer: label binarization

LabelBinarizer is a utility class for creating a label indicator matrix from a list of multiclass labels:

>>> from sklearn import preprocessing
>>> lb = preprocessing.LabelBinarizer()
>>> lb.fit([1, 2, 6, 4, 2])
LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)
>>> lb.classes_
array([1, 2, 4, 6])
>>> lb.transform([1, 6])
array([[1, 0, 0, 0],
       [0, 0, 0, 1]])

For instances with multiple labels, use MultiLabelBinarizer:

>>> lb = preprocessing.MultiLabelBinarizer()
>>> lb.fit_transform([(1, 2), (3,)])
array([[1, 1, 0],
       [0, 0, 1]])
>>> lb.classes_
array([1, 2, 3])