label y 训练集测试集x_训练集、测试集 train_test_split
訓練集 & 測試集
如果拿所有原始數據來訓練,存在的問題:
若訓練出的模型很差,無法事先發現並調整;
真實環境難以拿到真實 label;
所以將數據區分為 訓練數據 和 測試數據(train test split);
用訓練數據來訓練模型,然後用測試數據測試模型;
使用這種方式也存在問題;
用 Python(NumPy)手動分離 iris 數據集
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
# Load the iris dataset that ships with scikit-learn.
iris = datasets.load_iris()
X = iris.data  # feature matrix, one row per sample
y = iris.target  # integer class label for each row of X
# Notebook-style echo of X; the string literal below is the recorded output.
X
'''
(array([[5.1, 3.5, 1.4, 0.2],
[4.9, 3. , 1.4, 0.2], ...
[6.2, 3.4, 5.4, 2.3],
[5.9, 3. , 5.1, 1.8]])
'''
# Echo of y. The recorded output shows the labels grouped by class
# (all 0s, then all 1s, then all 2s), which is why the data has to be
# shuffled before it is split.
y
'''
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
'''
X.shape, y.shape  # recorded output: ((150, 4), (150,))
# Shuffle the sample indices so the split does not follow the stored order.
shuffle_indexes = np.random.permutation(len(X))  # random ordering of 0 .. len(X) - 1
shuffle_indexes
# array([ 22, 4, 142, 24, 7, 146, ... 9, 95, 130, 29, 124])

# Reserve the leading 20% of the shuffled indices for the test set.
test_ratio = 0.2
test_size = int(test_ratio * len(X))
test_size # 30

# The first test_size shuffled indices are the test set; the rest are training.
test_indexes, train_indexes = shuffle_indexes[:test_size], shuffle_indexes[test_size:]
test_indexes
'''
array([ 22, 4, 142, 24, 7, 146, 70, 77, 144, 14, 40, 119, 46, 85, 74, 87, 86, 60, 91, 120, 78, 45, 65, 105, 113, 39, 83, 80, 134, 16])
'''

# Index X and y with the same index arrays so samples stay paired with labels.
X_train, y_train = X[train_indexes], y[train_indexes]
X_test, y_test = X[test_indexes], y[test_indexes]
X_test.shape, X_train.shape # ((30, 4), (120, 4))
封裝 train_test_split 函數
def train_test_split(X, y, test_ratio=0.2, seed=None):
    """Randomly split X and y into training and test subsets.

    Args:
        X: Samples; must expose ``.shape`` and accept integer-array
            indexing along its first axis (e.g. a NumPy array).
        y: Labels, with the same first-dimension length as X.
        test_ratio: Fraction of samples placed in the test set, in
            [0.0, 1.0]. The test count is ``int(len(X) * test_ratio)``,
            i.e. truncated toward zero.
        seed: Optional seed. When it is not None (0 included), NumPy's
            global random generator is reseeded with it so the split is
            reproducible.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)``. Note that this
        order differs from ``sklearn.model_selection.train_test_split``,
        which returns ``(X_train, X_test, y_train, y_test)``.

    Raises:
        AssertionError: If X and y differ in length or test_ratio is
            outside [0.0, 1.0].
    """
    assert X.shape[0] == y.shape[0], "the size of X must be equal to the size of y"
    assert 0.0 <= test_ratio <= 1.0, "test_ ration must be valid"

    # Compare against None explicitly: a plain truthiness test would
    # silently ignore seed=0.
    if seed is not None:
        np.random.seed(seed)

    shuffle_indexes = np.random.permutation(len(X))
    test_size = int(len(X) * test_ratio)

    # The first test_size shuffled indices form the test set; the rest train.
    test_indexes = shuffle_indexes[:test_size]
    train_indexes = shuffle_indexes[test_size:]

    # Index X and y with the same arrays so samples stay paired with labels.
    X_train = X[train_indexes]
    y_train = y[train_indexes]
    X_test = X[test_indexes]
    y_test = y[test_indexes]
    return X_train, y_train, X_test, y_test
sklearn 中的 train_test_split
# NOTE: this import rebinds the name train_test_split, replacing any
# earlier definition of that name in this module.
from sklearn.model_selection import train_test_split

# scikit-learn returns the pieces as (X_train, X_test, y_train, y_test).
X_train, X_test, y_train, y_test = train_test_split(X, y)

# General signature (for reference only, not executable as written):
#     train_test_split(*arrays, **options)

# test_size sets the fraction of samples held out for testing;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
總結
以上是生活随笔為你收集整理的label y 训练集测试集x_训练集、测试集 train_test_split的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 小森生活显示无可用服务器,《小森生活》无
- 下一篇: mysql group_concat去重