In this midterm you will be asked to:
Nearly everything we have done so far is important for your success on the midterm, but the midterm focuses on classification and on modeling with a train/test split.
Assignments to definitely study: Day-09, Day-10, Day-11, and Day-11.5
There are many reasons for this; two major ones are shown below.
## Imports
import numpy as np
import scipy.linalg
import sklearn.decomposition as dec
import sklearn.datasets as ds
import matplotlib.pyplot as plt
import pandas as pd
# Load the iris dataset and place the measurements and the class labels
# into two separate DataFrames (features in `data`, labels in `target`).
iris = ds.load_iris()
feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
data = pd.DataFrame(iris.data, columns=feature_columns)
target = pd.DataFrame(iris.target, columns=['species'])
# Scatter the first two measured features, colouring each point by species.
fig = plt.figure(figsize=(8, 5))
ax = fig.gca()
ax.scatter(data['sepal_length'], data['sepal_width'],
           c=target['species'], s=30, cmap=plt.cm.rainbow)
ax.set_xlabel('feature 0')
ax.set_ylabel('feature 1')
# Fix the viewing window; this expression also echoes the limits as a tuple.
ax.axis([4, 8, 2, 4.5])
(4.0, 8.0, 2.0, 4.5)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
# 75/25 train/test split on all four features, then fit a 3-nearest-neighbour
# classifier and report its confusion matrix and accuracy on the held-out set.
split = train_test_split(
    data, target['species'], train_size=0.75, random_state=3)
train_features, test_features, train_labels, test_labels = split
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(train_features, train_labels)
y_predict = neigh.predict(test_features)
# Rows of the confusion matrix are true classes; columns are predictions.
print(confusion_matrix(test_labels, y_predict))
print(neigh.score(test_features, test_labels))
[[15 0 0] [ 0 10 2] [ 0 0 11]] 0.9473684210526315
# Repeat the experiment using only the two sepal features (same split seed),
# to show how much accuracy depends on the petal measurements.
sepal_only = data[['sepal_length', 'sepal_width']]
split = train_test_split(
    sepal_only, target['species'], train_size=0.75, random_state=3)
train_features, test_features, train_labels, test_labels = split
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(train_features, train_labels)
y_predict = neigh.predict(test_features)
print(confusion_matrix(test_labels, y_predict))
print(neigh.score(test_features, test_labels))
[[14 1 0] [ 0 7 5] [ 0 7 4]] 0.6578947368421053
# Project the four features onto their principal components and show how well
# the first two components separate the species.
pca = dec.PCA()
pca_data = pd.DataFrame(pca.fit_transform(data),
                        columns=['PC1', 'PC2', 'PC3', 'PC4'])
# Variance captured by each component, largest first.
print(pca.explained_variance_)
fig = plt.figure(figsize=(8, 3))
ax = fig.gca()
ax.scatter(pca_data['PC1'], pca_data['PC2'],
           c=target['species'], s=30, cmap=plt.cm.rainbow)
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.axis([-4, 4, -1.5, 1.5])
[4.22824171 0.24267075 0.0782095 0.02383509]
(-4.0, 4.0, -1.5, 1.5)
# Same KNN experiment on all four principal components; PCA is just a rotation,
# so the score should match the full-feature run.
split = train_test_split(
    pca_data, target['species'], train_size=0.75, random_state=3)
train_features, test_features, train_labels, test_labels = split
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(train_features, train_labels)
y_predict = neigh.predict(test_features)
print(confusion_matrix(test_labels, y_predict))
print(neigh.score(test_features, test_labels))
[[15 0 0] [ 0 10 2] [ 0 0 11]] 0.9473684210526315
# Finally, keep only the first two principal components: two columns now retain
# nearly all the variance, unlike the raw sepal-only run above.
leading_pcs = pca_data[['PC1', 'PC2']]
split = train_test_split(
    leading_pcs, target['species'], train_size=0.75, random_state=3)
train_features, test_features, train_labels, test_labels = split
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(train_features, train_labels)
y_predict = neigh.predict(test_features)
print(confusion_matrix(test_labels, y_predict))
print(neigh.score(test_features, test_labels))
[[15 0 0] [ 0 10 2] [ 0 0 11]] 0.9473684210526315