from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set to False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_labels)
grid_search.best_params_
# Out: {'max_features': 8, 'n_estimators': 30}
grid_search.best_estimator_
# Out:
# RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
# max_features=8, max_leaf_nodes=None, min_impurity_decrease=0.0,
# min_impurity_split=None, min_samples_leaf=1,
# min_samples_split=2, min_weight_fraction_leaf=0.0,
# n_estimators=30, n_jobs=1, oob_score=False, random_state=42,
# verbose=0, warm_start=False)
The main GridSearchCV parameters:
- estimator : estimator object. The estimator must either provide a score function, or scoring must be specified.
- param_grid : dict or list of dictionaries. A dict with parameter names as keys and lists of candidate settings as values, or a list of such dicts.
- scoring : string, callable, list/tuple, dict or None, default: None.
- cv : int, cross-validation generator or an iterable, optional. An int is interpreted as the number of folds of a KFold.
- refit : boolean, or string, default=True. Refit the estimator on the whole dataset with the best parameters found.
Method | Description
---|---
decision_function(X) | Call decision_function on the estimator with the best found parameters. |
fit(X[, y, groups]) | Run fit with all sets of parameters. |
get_params([deep]) | Get parameters for this estimator. |
inverse_transform(Xt) | Call inverse_transform on the estimator with the best found params. |
predict(X) | Call predict on the estimator with the best found parameters. |
predict_log_proba(X) | Call predict_log_proba on the estimator with the best found parameters. |
predict_proba(X) | Call predict_proba on the estimator with the best found parameters. |
score(X[, y]) | Returns the score on the given data, if the estimator has been refit. |
set_params(**params) | Set the parameters of this estimator. |
transform(X) | Call transform on the estimator with the best found parameters. |
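After fitting, the score of every tried combination is stored in the cv_results_ attribute; a minimal sketch of reading them back as RMSE values (the scores are negative MSE, so they are negated before the square root):
import numpy as np
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)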
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
param_distribs = {
    'n_estimators': randint(low=1, high=200),
    'max_features': randint(low=1, high=8),
}
forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5,
                                scoring='neg_mean_squared_error',
                                random_state=42)
rnd_search.fit(housing_prepared, housing_labels)
The main RandomizedSearchCV parameters:
- estimator : estimator object. The estimator to tune.
- param_distributions : dict. Keys are parameter names; values are either lists of settings to try or distributions to sample from. A distribution must provide an rvs method for sampling, such as those from scipy.stats.distributions.
- n_iter : int, default=10. Number of parameter settings that are sampled.
- scoring : string, callable, list/tuple, dict or None, default: None
- cv : int, cross-validation generator or an iterable, optional
- refit : boolean, or string, default=True
- random_state : int, RandomState instance or None, optional, default=None
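As with grid search, the best sampled combination can be read back afterwards; a short sketch (best_score_ holds the negative MSE of the best parameters):
import numpy as np
rnd_search.best_params_
np.sqrt(-rnd_search.best_score_)  # RMSE of the best model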
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=50, n_jobs=-1)
scores = cross_val_score(rnd_clf, trainData, trainLabel,
                         scoring="recall", cv=10)
scores.mean()
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score
etr_clf = ExtraTreesClassifier(n_estimators=500, max_leaf_nodes=50, n_jobs=-1)
etr_score = cross_val_score(etr_clf, trainData, trainLabel, scoring='recall', cv=10)
etr_score.mean()
from sklearn.ensemble import GradientBoostingClassifier
gbdt = GradientBoostingClassifier(learning_rate=0.1, min_samples_leaf=2,
                                  max_depth=6, n_estimators=100)
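The GBDT model is only constructed above. Evaluating it under the same 10-fold recall protocol as the other classifiers would look like this (a sketch, assuming the same trainData/trainLabel arrays):
from sklearn.model_selection import cross_val_score
gbdt_score = cross_val_score(gbdt, trainData, trainLabel, scoring='recall', cv=10)
gbdt_score.mean()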
from xgboost.sklearn import XGBClassifier
from sklearn import metrics
xgb1 = XGBClassifier(learning_rate=0.05,
                     n_estimators=1000,
                     max_depth=3,
                     min_child_weight=1,
                     gamma=0.1,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective='binary:logistic',
                     nthread=4,
                     reg_alpha=0.001,
                     reg_lambda=0.001,
                     scale_pos_weight=1,
                     seed=27)
xgb1.fit(subTrain, subLabel)
xgb1_pred = xgb1.predict(trainData)
xgb1_pred_prob = xgb1.predict_proba(trainData)[:, 1]
print(metrics.accuracy_score(trainLabel, xgb1_pred))
print(metrics.roc_auc_score(trainLabel, xgb1_pred_prob))
xgb1_f2 = calc_f2(trainLabel, xgb1_pred)
print(xgb1_f2)
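calc_f2 is a user-defined helper that is not shown here; a plausible minimal implementation based on sklearn's fbeta_score (an assumption, not the original code):
from sklearn.metrics import fbeta_score

def calc_f2(y_true, y_pred):
    # F-beta with beta=2 weights recall twice as heavily as precision
    return fbeta_score(y_true, y_pred, beta=2)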
import lightgbm as lgb
lgb_clf = lgb.LGBMClassifier(learning_rate=0.1,
                             boosting_type='gbdt',
                             objective='binary',
                             n_estimators=1000,
                             metric='auc',
                             max_depth=3,
                             num_leaves=5,
                             subsample=0.7,
                             colsample_bytree=0.7,
                             min_data_in_leaf=450,
                             feature_fraction=0.7,
                             bagging_fraction=0.7,
                             bagging_freq=6,
                             lambda_l1=1,
                             lambda_l2=0.001,
                             min_gain_to_split=0.265,
                             verbose=5,
                             is_unbalance=True)
lgb_clf.fit(subTrain, subLabel)
lgb_clf_pred = lgb_clf.predict(trainData)
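Evaluating the LightGBM model the same way as the XGBoost model above (a sketch, reusing sklearn's metrics):
lgb_clf_pred_prob = lgb_clf.predict_proba(trainData)[:, 1]
print(metrics.accuracy_score(trainLabel, lgb_clf_pred))
print(metrics.roc_auc_score(trainLabel, lgb_clf_pred_prob))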
from sklearn.ensemble import BaggingClassifier
bag_rnd = BaggingClassifier(rnd_clf, n_estimators=10, max_samples=1000, bootstrap=True, n_jobs=-1)
bag_rnd.fit(subTrain, subLabel)
rnd_pred = bag_rnd.predict(trainData)
from sklearn.ensemble import VotingClassifier
from xgboost.sklearn import XGBClassifier
xgb = XGBClassifier(learning_rate=0.1,
                    n_estimators=1000,
                    max_depth=3,
                    min_child_weight=1,
                    gamma=0.1,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='binary:logistic',
                    nthread=4,
                    reg_alpha=0.001,
                    reg_lambda=0.001,
                    scale_pos_weight=1)
import lightgbm as lgb
lgb_clf = lgb.LGBMClassifier(learning_rate=0.1,
                             boosting_type='gbdt',
                             objective='binary',
                             n_estimators=1000,
                             metric='auc',
                             max_depth=3,
                             num_leaves=5,
                             subsample=0.7,
                             colsample_bytree=0.7,
                             min_data_in_leaf=450,
                             feature_fraction=0.7,
                             bagging_fraction=0.7,
                             bagging_freq=6,
                             lambda_l1=1,
                             lambda_l2=0.001,
                             min_gain_to_split=0.265,
                             verbose=5,
                             is_unbalance=True)
from sklearn.ensemble import GradientBoostingClassifier
gbdt = GradientBoostingClassifier(learning_rate=0.05, min_samples_split=320,
                                  min_samples_leaf=7, max_depth=7,
                                  max_features='sqrt', subsample=0.7,
                                  random_state=10)
vot = VotingClassifier(estimators=[('xgb', xgb), ('lgb', lgb_clf)], voting='soft')
vot.fit(trainData, trainLabel)
vot_pred = vot.predict(testData)
vot_pred = pd.DataFrame(vot_pred)
submData['acc_now_delinq'] = vot_pred
submData.to_csv(os.path.join(homePath, 'submit.csv'), index=False)
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators=110, min_samples_split=90,
                                 min_samples_leaf=15, max_depth=10,
                                 oob_score=True, max_features=10)
from xgboost.sklearn import XGBClassifier
xgb = XGBClassifier(learning_rate=0.05,
                    n_estimators=1000,
                    max_depth=3,
                    min_child_weight=1,
                    gamma=0.1,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='binary:logistic',
                    nthread=4,
                    reg_alpha=0.001,
                    reg_lambda=0.001,
                    scale_pos_weight=1)
import lightgbm as lgb
lgb_clf = lgb.LGBMClassifier(learning_rate=0.1,
                             boosting_type='gbdt',
                             objective='binary',
                             n_estimators=1000,
                             metric='auc',
                             max_depth=3,
                             num_leaves=5,
                             subsample=0.7,
                             colsample_bytree=0.7,
                             min_data_in_leaf=450,
                             feature_fraction=0.7,
                             bagging_fraction=0.7,
                             bagging_freq=6,
                             lambda_l1=1,
                             lambda_l2=0.001,
                             min_gain_to_split=0.265,
                             verbose=5,
                             is_unbalance=True)
from sklearn.ensemble import GradientBoostingClassifier
gbdt = GradientBoostingClassifier(learning_rate=0.05, min_samples_split=320,
                                  min_samples_leaf=7, max_depth=7,
                                  max_features='sqrt', subsample=0.7)
from mlxtend.classifier import StackingClassifier
stack_clf = StackingClassifier(classifiers=[gbdt, lgb_clf, rnd_clf, xgb], meta_classifier=xgb)
stack_clf.fit(trainData, trainLabel)
stack_pred = stack_clf.predict(testData)
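mlxtend's StackingClassifier can also pass the base models' class probabilities (rather than their hard predictions) to the meta-classifier; a variant sketch using its use_probas option:
stack_proba = StackingClassifier(classifiers=[gbdt, lgb_clf, rnd_clf],
                                 meta_classifier=xgb,
                                 use_probas=True,
                                 average_probas=False)
stack_proba.fit(trainData, trainLabel)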
Min-max scaling (normalization) is more easily affected by outliers. It rescales each feature into a given range:
X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
X_scaled = X_std * (max - min) + min
where min and max are the bounds of feature_range.
>>> from sklearn.preprocessing import MinMaxScaler
>>>
>>> data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
>>> scaler = MinMaxScaler()
>>> print(scaler.fit(data))
MinMaxScaler(copy=True, feature_range=(0, 1))
>>> print(scaler.data_max_)
[ 1. 18.]
>>> print(scaler.transform(data))
[[ 0. 0. ]
[ 0.25 0.25]
[ 0.5 0.5 ]
[ 1. 1. ]]
>>> print(scaler.transform([[2, 2]]))
[[ 1.5 0. ]]
- feature_range : tuple (min, max), default=(0, 1). The range of the transformed values.
- copy : boolean, optional, default True. Whether to copy the data and scale the copy instead of scaling in place.
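feature_range allows scaling to a range other than (0, 1); with the data from the example above:
>>> scaler = MinMaxScaler(feature_range=(-1, 1))
>>> print(scaler.fit_transform(data))
[[-1.  -1. ]
 [-0.5 -0.5]
 [ 0.   0. ]
 [ 1.   1. ]]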
>>> from sklearn.preprocessing import StandardScaler
>>>
>>> data = [[0, 0], [0, 0], [1, 1], [1, 1]]
>>> scaler = StandardScaler()
>>> print(scaler.fit(data))
StandardScaler(copy=True, with_mean=True, with_std=True)
>>> print(scaler.mean_)
[ 0.5 0.5]
>>> print(scaler.transform(data))
[[-1. -1.]
[-1. -1.]
[ 1. 1.]
[ 1. 1.]]
>>> print(scaler.transform([[2, 2]]))
[[ 3. 3.]]
- copy : boolean, optional, default True. Whether to copy the data and operate on the copy.
- with_mean : boolean, True by default. If True, center the data before scaling. This does not work on sparse matrices.
- with_std : boolean, True by default. If True, scale the data to unit variance (equivalently, unit standard deviation).
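Continuing the example above, the fitted scaler exposes the learned statistics: scale_ holds the per-feature standard deviation, and with_mean=False is the variant to use for sparse input:
>>> print(scaler.scale_)
[ 0.5  0.5]
>>> sparse_scaler = StandardScaler(with_mean=False)  # centering is skipped for sparse data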
All attributes must be numeric.
from sklearn.preprocessing import Imputer
# specify the value used to replace missing values, here the median
imputer = Imputer(strategy="median")
# fit the instance to the data
imputer.fit(housing_num)
# the computed statistics are stored in the statistics_ attribute
imputer.statistics_
# replace the missing values
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                          index=list(housing.index.values))
# preview the rows that had missing values
housing_tr.loc[sample_incomplete_rows.index.values]
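Note that Imputer was removed in scikit-learn 0.22; in newer versions the equivalent is SimpleImputer from sklearn.impute (a sketch of the same median imputation):
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
imputer.fit(housing_num)
X = imputer.transform(housing_num)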
housing_cat = housing['ocean_proximity']
housing_cat.head(10)
# Output:
# 17606 <1H OCEAN
# 18632 <1H OCEAN
# 14650 NEAR OCEAN
# 3230 INLAND
# 3555 <1H OCEAN
# 19480 INLAND
# 8879 <1H OCEAN
# 13685 INLAND
# 4937 <1H OCEAN
# 4861 <1H OCEAN
# Name: ocean_proximity, dtype: object
housing_cat_encoded, housing_categories = housing_cat.factorize()
housing_cat_encoded[:10]
# Output:
# array([0, 0, 1, 2, 0, 2, 0, 2, 0, 0], dtype=int64)
housing_categories
# Index(['<1H OCEAN', 'NEAR OCEAN', 'INLAND', 'NEAR BAY', 'ISLAND'], dtype='object')
The parameters of pandas' factorize():
- values : ndarray (1-d); the sequence to encode
- sort : boolean, default False; sort uniques by value
- na_sentinel : int, default -1; the value assigned to entries that are not found
- size_hint : hint to the hashtable sizer
Returns:
- labels : the indexer into the original array
- uniques : ndarray (1-d) or Index; when an Index or Series is passed, the unique values are returned as an Index.
OneHotEncoder returns a SciPy sparse matrix:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
print(housing_cat_encoded.reshape(-1, 1))
# [[0]
#  [0]
#  [1]
#  ...,
#  [2]
#  [0]
#  [3]]
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1, 1))
housing_cat_1hot
# <16512x5 sparse matrix of type '<class 'numpy.float64'>'
#  with 16512 stored elements in Compressed Sparse Row format>
Note that fit_transform() expects a 2-D array, which is why the data is reshaped here.
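Since scikit-learn 0.20, OneHotEncoder handles string categories directly, so the factorize step can be skipped (a sketch, assuming a recent scikit-learn version):
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat.values.reshape(-1, 1))
cat_encoder.categories_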
LabelEncoder is a utility class that normalizes labels, encoding them as values in the range [0, n_classes-1]. In short, it numbers non-consecutive integers or text labels.
>>> from sklearn import preprocessing
>>> le = preprocessing.LabelEncoder()
>>> le.fit([1, 2, 2, 6])
LabelEncoder()
>>> le.classes_
array([1, 2, 6])
>>> le.transform([1, 1, 2, 6])
array([0, 0, 1, 2])
>>> le.inverse_transform([0, 0, 1, 2])
array([1, 1, 2, 6])
It can also be used to transform non-numeric labels into numeric labels (as long as they are hashable and comparable):
>>> le = preprocessing.LabelEncoder()
>>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
LabelEncoder()
>>> list(le.classes_)
['amsterdam', 'paris', 'tokyo']
>>> le.transform(["tokyo", "tokyo", "paris"])
array([2, 2, 1])
>>> list(le.inverse_transform([2, 2, 1]))
['tokyo', 'tokyo', 'paris']
LabelBinarizer is a utility class that creates a label indicator matrix from a list of multi-class labels:
>>> from sklearn import preprocessing
>>> lb = preprocessing.LabelBinarizer()
>>> lb.fit([1, 2, 6, 4, 2])
LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)
>>> lb.classes_
array([1, 2, 4, 6])
>>> lb.transform([1, 6])
array([[1, 0, 0, 0],
[0, 0, 0, 1]])
For multi-label instances, use MultiLabelBinarizer:
>>> lb = preprocessing.MultiLabelBinarizer()
>>> lb.fit_transform([(1, 2), (3,)])
array([[1, 1, 0],
[0, 0, 1]])
>>> lb.classes_
array([1, 2, 3])