import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn.datasets import load_wine
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

# Fetch the wine dataset via the directly imported loader
wine = load_wine()

# Show the human-readable description bundled with the dataset
print(wine.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
    ============================= ==== ===== ======= =====
                                   Min   Max   Mean     SD
    ============================= ==== ===== ======= =====
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0.98  3.88    2.29  0.63
    Flavanoids:                   0.34  5.08    2.03  1.00
    Nonflavanoid Phenols:         0.13  0.66    0.36  0.12
    Proanthocyanins:              0.41  3.58    1.59  0.57
    Colour Intensity:              1.3  13.0     5.1   2.3
    Hue:                          0.48  1.71    0.96  0.23
    OD280/OD315 of diluted wines: 1.27  4.00    2.61  0.71
    Proline:                       278  1680     746   315
    ============================= ==== ===== ======= =====

    :Missing Attribute Values: None
    :Class Distribution: class_0 (59), class_1 (71), class_2 (48)
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :Date: July, 1988

This is a copy of UCI ML Wine recognition datasets.
https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data

The data is the results of a chemical analysis of wines grown in the same
region in Italy by three different cultivators. There are thirteen different
measurements taken for different constituents found in the three types of
wine.

Original Owners: 

Forina, M. et al, PARVUS - 
An Extendible Package for Data Exploration, Classification and Correlation. 
Institute of Pharmaceutical and Food Analysis and Technologies,
Via Brigata Salerno, 16147 Genoa, Italy.

Citation:

Lichman, M. (2013). UCI Machine Learning Repository
[https://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
School of Information and Computer Science. 

.. topic:: References

  (1) S. Aeberhard, D. Coomans and O. de Vel, 
  Comparison of Classifiers in High Dimensional Settings, 
  Tech. Rep. no. 92-02, (1992), Dept. of Computer Science and Dept. of  
  Mathematics and Statistics, James Cook University of North Queensland. 
  (Also submitted to Technometrics). 

  The data was used with many others for comparing various 
  classifiers. The classes are separable, though only RDA 
  has achieved 100% correct classification. 
  (RDA : 100%, QDA 99.4%, LDA 98.9%, 1NN 96.1% (z-transformed data)) 
  (All results using the leave-one-out technique) 

  (2) S. Aeberhard, D. Coomans and O. de Vel, 
  "THE CLASSIFICATION PERFORMANCE OF RDA" 
  Tech. Rep. no. 92-01, (1992), Dept. of Computer Science and Dept. of 
  Mathematics and Statistics, James Cook University of North Queensland. 
  (Also submitted to Journal of Chemometrics).

# Load the wine dataset
wine = load_wine()
y = wine.target

# Count the samples per class directly from the labels rather than copying the
# figures quoted in wine.DESCR, so the chart always reflects the loaded data.
# minlength keeps one bar per named class even if a class had no samples.
class_distribution = np.bincount(y, minlength=len(wine.target_names))
class_labels = wine.target_names

# One distinct Seaborn colour per class
colors = sns.color_palette("bright", n_colors=len(class_distribution))

# Plot the class distribution using Matplotlib
plt.bar(range(len(class_distribution)), class_distribution, tick_label=class_labels, color=colors)
plt.title("Class Distribution in the Wine Dataset")
plt.xlabel("Class")
plt.ylabel("Frequency")
plt.show()

# Feature table: one row per sample, one column per wine attribute
df = pd.DataFrame(data=wine.data, columns=wine.feature_names)

# One box per feature on a wide canvas, with tick labels rotated 45 degrees
fig, ax = plt.subplots(figsize=(15, 8))
df.boxplot(rot=45, ax=ax)
ax.set_title("Feature Ranges (Boxplot)")
ax.set_ylabel("Value")
plt.show()

# Drop 'proline' and 'magnesium' so the remaining features are drawn without
# those two columns on the shared value axis
df_no_proline_magnesium = df.drop(columns=['proline', 'magnesium'])

# Plot boxplot excluding both 'proline' and 'magnesium'
plt.figure(figsize=(15, 8))
df_no_proline_magnesium.boxplot(rot=45)
plt.title("Feature Ranges (Boxplot Excluding 'Proline' and 'Magnesium')")
# Boxes are laid out along the x-axis (one per feature); values are on the y-axis
plt.xlabel("Features")
plt.ylabel("Value")
plt.show()

# Rebuild the feature table from a fresh copy of the dataset
wine = load_wine()
df = pd.DataFrame(data=wine.data, columns=wine.feature_names)

# Pairwise correlation between every two features
corr_matrix = df.corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Matrix of Wine Dataset Features")
plt.show()

# Number of samples in each class, derived from the dataset labels
counts = np.bincount(wine.target)

# Find the count of the majority class
majority_class_count = counts.max()

# Calculate the total number of samples
total_samples = counts.sum()

# Baseline accuracy: the score of a classifier that always predicts the
# majority class
baseline_accuracy = majority_class_count / total_samples

# Print the baseline accuracy as a percentage
print(f"Baseline Accuracy: {baseline_accuracy * 100:.2f}%")

Baseline Accuracy: 39.89%

# Start again from the raw dataset: feature table plus label vector
wine = load_wine()
df = pd.DataFrame(data=wine.data, columns=wine.feature_names)
y = wine.target

# Per-feature quartiles and interquartile range
Q1, Q3 = df.quantile(0.25), df.quantile(0.75)
IQR = Q3 - Q1

# Fences sit 1.2 * IQR beyond the quartiles
# NOTE(review): the usual IQR rule uses 1.5 — confirm 1.2 is intentional
lower_bound = Q1 - 1.2 * IQR
upper_bound = Q3 + 1.2 * IQR

# Cells outside their feature's fences become NaN; dropna() then discards
# every row that contains at least one such cell
inside_fences = df.ge(lower_bound) & df.le(upper_bound)
X_cleaned = df[inside_fences].dropna()

# Select the labels of the surviving rows. The index labels are used as
# positions into y, which works because df carries the default 0..n-1 index.
y_cleaned = y[X_cleaned.index]

# Report how many rows survived
print("Original data shape:", df.shape)
print("Cleaned data shape:", X_cleaned.shape)

Original data shape: (178, 13)
Cleaned data shape: (138, 13)

# Boxplot of every feature after outlier removal
fig, ax = plt.subplots(figsize=(15, 8))
X_cleaned.boxplot(rot=45, ax=ax)
ax.set_title("Feature Ranges After Removing Outliers")
ax.set_xlabel("Features")
ax.set_ylabel("Value")
plt.show()

# Cleaned feature table without the 'proline' and 'magnesium' columns
X_cleaned_no_proline_magnesium = X_cleaned.drop(columns=['proline', 'magnesium'])

# Boxplot of the remaining cleaned features
fig, ax = plt.subplots(figsize=(15, 8))
X_cleaned_no_proline_magnesium.boxplot(rot=45, ax=ax)
ax.set_title("Feature Ranges After Removing Outliers ('Proline' and 'Magnesium' Excluded)")
ax.set_xlabel("Features")
ax.set_ylabel("Value")
plt.show()

# Hold out 20% of the cleaned samples for testing; stratify keeps the class
# proportions the same in both parts, and the fixed seed makes the split
# reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_cleaned,
    y_cleaned,
    test_size=0.2,
    random_state=36,
    stratify=y_cleaned,
)

# Learn the per-feature min/max on the training data only, so no information
# from the test set leaks into the scaling
scaler = MinMaxScaler()
scaler.fit(X_train)

# Apply the same learned scaling to both parts
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Report the resulting shapes
print("Training set shape:", X_train_scaled.shape)
print("Test set shape:", X_test_scaled.shape)

Training set shape: (110, 13)
Test set shape: (28, 13)

# Feature table with the class label appended as a 'target' column
df = pd.DataFrame(wine.data, columns=wine.feature_names).assign(target=wine.target)

# Correlation of every column with the label, strongest positive first
target_correlations = df.corr()['target']
corr_with_target = target_correlations.sort_values(ascending=False)
print("Correlation with Target:\n", corr_with_target)

Correlation with Target:
 target                          1.000000
alcalinity_of_ash               0.517859
nonflavanoid_phenols            0.489109
malic_acid                      0.437776
color_intensity                 0.265668
ash                            -0.049643
magnesium                      -0.209179
alcohol                        -0.328222
proanthocyanins                -0.499130
hue                            -0.617369
proline                        -0.633717
total_phenols                  -0.719163
od280/od315_of_diluted_wines   -0.788230
flavanoids                     -0.847498
Name: target, dtype: float64

# A fractional n_components asks PCA to keep as many components as needed to
# explain that share (95%) of the variance
pca = PCA(n_components=0.95)

# Learn the projection from the scaled training data only ...
X_train_pca = pca.fit_transform(X_train_scaled)
# ... and reuse it unchanged for the scaled test data
X_test_pca = pca.transform(X_test_scaled)

# Report the reduced dimensionality
print("Training set after PCA shape:", X_train_pca.shape)
print("Test set after PCA shape:", X_test_pca.shape)

Training set after PCA shape: (110, 10)
Test set after PCA shape: (28, 10)

def knn_algorithm(X_train, y_train, X_test, k=3):
    """Classify each row of ``X_test`` by majority vote of its k nearest neighbours.

    Distances are Euclidean. When several labels are tied for the highest vote
    count, the smallest label (in ``np.unique`` sort order) wins.

    Args:
        X_train: Array-like of shape (n_train, n_features) with training points.
        y_train: Array-like of length n_train with the training labels.
        X_test: Array-like of shape (n_test, n_features) with points to classify.
        k: Number of neighbours that vote. If it exceeds n_train, all training
            points vote.

    Returns:
        NumPy array of length n_test holding the predicted labels.

    Raises:
        ValueError: If ``k`` is smaller than 1, the training set is empty, or
            ``X_train`` and ``y_train`` differ in length.
    """
    # Accept lists and other array-likes, not only ndarrays; positional
    # indexing of the labels below requires a real array.
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train)
    X_test = np.asarray(X_test, dtype=float)

    if k < 1:
        raise ValueError("k must be at least 1")
    if len(X_train) == 0:
        raise ValueError("X_train must contain at least one sample")
    if len(X_train) != len(y_train):
        raise ValueError("X_train and y_train must have the same length")

    predictions = []
    for test_point in X_test:
        # Euclidean distance from this test point to every training point
        distances = np.sqrt(np.sum((X_train - test_point) ** 2, axis=1))

        # Labels of the k closest training points
        k_indices = np.argsort(distances)[:k]
        k_labels = y_train[k_indices]

        # Majority vote; argmax returns the first maximum, which is the
        # smallest label because np.unique returns labels sorted
        unique_labels, counts = np.unique(k_labels, return_counts=True)
        predictions.append(unique_labels[np.argmax(counts)])

    # Use the label dtype so an empty X_test still yields a label-typed array
    return np.array(predictions, dtype=y_train.dtype)

def calculate_accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_true: Array-like of ground-truth labels.
        y_pred: Array-like of predicted labels with the same shape as ``y_true``.

    Returns:
        Accuracy between 0 and 1 as a NumPy float.

    Raises:
        ValueError: If the inputs are empty or their shapes differ.
    """
    # Convert first: `==` on two plain lists yields a single bool instead of an
    # element-wise comparison, which would silently give a wrong accuracy.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must have the same shape")
    if y_true.size == 0:
        raise ValueError("accuracy is undefined for empty inputs")

    # Mean of the element-wise matches == correct predictions / total
    return np.mean(y_true == y_pred)
    
def cross_validate_knn(X, y, k_neighbors=3, k_folds=5, random_seed=36):
    """Estimate kNN accuracy with shuffled k-fold cross-validation.

    The samples are shuffled once with a seeded generator and partitioned into
    ``k_folds`` folds. Each fold serves once as the validation set while the
    remaining folds form the training set for ``knn_algorithm``.

    Args:
        X: Array-like of shape (n_samples, n_features).
        y: Array-like of length n_samples with the labels.
        k_neighbors: Number of neighbours passed to ``knn_algorithm``.
        k_folds: Number of folds; must be between 2 and n_samples.
        random_seed: Seed for the shuffle, making the folds reproducible.

    Returns:
        Mean validation accuracy across the folds.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length or ``k_folds`` is not
            between 2 and the number of samples.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = len(X)

    if len(y) != n_samples:
        raise ValueError("X and y must have the same length")
    if not 2 <= k_folds <= n_samples:
        raise ValueError("k_folds must be between 2 and the number of samples")

    # A private RandomState yields the same permutation as seeding the global
    # generator, without altering global random state for the rest of the program.
    rng = np.random.RandomState(random_seed)
    indices = np.arange(n_samples)
    rng.shuffle(indices)

    # array_split spreads any remainder over the first folds, so every sample
    # is validated exactly once even when n_samples % k_folds != 0.
    folds = np.array_split(indices, k_folds)

    accuracies = []
    for fold, val_idx in enumerate(folds):
        # All other folds, in shuffled order, form the training set
        train_idx = np.concatenate(folds[:fold] + folds[fold + 1:])

        # Predict the held-out fold with the custom kNN function
        y_pred = knn_algorithm(X[train_idx], y[train_idx], X[val_idx], k=k_neighbors)

        accuracies.append(calculate_accuracy(y[val_idx], y_pred))

    # Average accuracy across folds
    return np.mean(accuracies)

# Candidate neighbour counts to compare
k_values = range(1, 6)

# Cross-validated accuracy for every candidate, stored as (k, accuracy) pairs
# NOTE(review): tuning runs on X_train_scaled, while the final kNN prediction
# further down uses the PCA-transformed features — confirm this is intended
results = [
    (k, cross_validate_knn(X_train_scaled, y_train, k_neighbors=k, k_folds=5))
    for k in k_values
]

# Report each candidate's score
for k, avg_accuracy in results:
    print(f"k={k}, Cross-Validation Accuracy: {avg_accuracy * 100:.2f}%")

# Pick the k with the highest accuracy; on ties max() keeps the first, i.e.
# the smallest k
best_k, _ = max(results, key=lambda pair: pair[1])
print(f"Best k: {best_k}")

k=1, Cross-Validation Accuracy: 96.36%
k=2, Cross-Validation Accuracy: 97.27%
k=3, Cross-Validation Accuracy: 98.18%
k=4, Cross-Validation Accuracy: 98.18%
k=5, Cross-Validation Accuracy: 97.27%
Best k: 3

# Candidate tree depths to compare
max_depth_values = range(1, 6)
mean_accuracies = []

# 5-fold cross-validated accuracy on the PCA features for each depth
for max_depth in max_depth_values:
    dt_model = DecisionTreeClassifier(random_state=36, max_depth=max_depth)
    cv_scores = cross_val_score(dt_model, X_train_pca, y_train, scoring='accuracy', cv=5)
    mean_accuracy = cv_scores.mean()
    mean_accuracies.append(mean_accuracy)
    print(f"max_depth={max_depth}, Cross-Validation Accuracy: {mean_accuracy * 100:.2f}%")

# argmax returns the first maximum, so ties resolve to the shallowest tree
best_position = np.argmax(mean_accuracies)
best_max_depth = max_depth_values[best_position]
print(f"Best max_depth: {best_max_depth}")

max_depth=1, Cross-Validation Accuracy: 68.18%
max_depth=2, Cross-Validation Accuracy: 93.64%
max_depth=3, Cross-Validation Accuracy: 92.73%
max_depth=4, Cross-Validation Accuracy: 92.73%
max_depth=5, Cross-Validation Accuracy: 92.73%
Best max_depth: 2

# Build the final decision tree with the selected depth and fit it on the
# PCA-transformed training data; fit() returns the estimator itself
dt_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=best_max_depth,
    random_state=36,
).fit(X_train_pca, y_train)

DecisionTreeClassifier(max_depth=2, random_state=36)

DecisionTreeClassifier(max_depth=2, random_state=36)

# Classify the PCA-transformed test set with kNN using the selected k
final_predictions = knn_algorithm(X_train_pca, y_train, X_test_pca, best_k)

# Share of test samples kNN labelled correctly
accuracy = calculate_accuracy(y_true=y_test, y_pred=final_predictions)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 96.43%

# Project the PCA features onto 2 components purely for plotting. Dedicated
# names are used so that X_train_pca / X_test_pca and the fitted `pca` keep the
# dimensionality the classifiers were trained on and can still be used for
# prediction afterwards.
pca_2d = PCA(n_components=2)
X_train_2d = pca_2d.fit_transform(X_train_pca)
X_test_2d = pca_2d.transform(X_test_pca)

# Plot training data, coloured by true class
plt.scatter(X_train_2d[:, 0], X_train_2d[:, 1], c=y_train, cmap='viridis', alpha=0.6, label='Training Data')

# Plot test data, coloured by the kNN prediction
plt.scatter(X_test_2d[:, 0], X_test_2d[:, 1], c=final_predictions, cmap='coolwarm', edgecolors='k', label='Test Predictions')

plt.title('kNN Predictions (PCA Reduced Data)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()

# Step 1: Confusion matrix of the kNN test predictions
# (rows: true class, columns: predicted class)
kkn_cm = confusion_matrix(y_test, final_predictions)

# Step 2: Draw the matrix as an image with Matplotlib
plt.figure(figsize=(8, 6))
plt.imshow(kkn_cm, cmap='Blues', interpolation='nearest')
plt.colorbar()
plt.title('kNN Confusion Matrix')

# Label both axes with the class names
tick_marks = np.arange(len(wine.target_names))
plt.xticks(tick_marks, wine.target_names)
plt.yticks(tick_marks, wine.target_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')

# Write each count into its cell; the column index is the x position and the
# row index is the y position
for (i, j), cell_count in np.ndenumerate(kkn_cm):
    plt.text(j, i, str(cell_count), ha='center', va='center', color='black')
plt.tight_layout()
plt.show()

# Correct predictions per class sit on the diagonal of the confusion matrix
tp_kkn = kkn_cm.diagonal()

# Column sums: how often each class was predicted (TP + FP)
predicted_totals = kkn_cm.sum(axis=0)

# Macro average: unweighted mean of the per-class precisions, as a percentage
per_class_precision = tp_kkn / predicted_totals
macro_average_precision = per_class_precision.mean() * 100

# Print overall precision
print(f"Overall Macro Average Precision: {macro_average_precision:.2f}%")

Overall Macro Average Precision: 96.97%

# Run the fitted decision tree on the PCA-transformed test set
y_pred = dt_model.predict(X_test_pca)

# Share of test samples the decision tree labelled correctly
accuracy = calculate_accuracy(y_true=y_test, y_pred=y_pred)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 89.29%

# The tree was fitted on PCA components rather than the raw wine attributes,
# so label the split features as principal components; using
# wine.feature_names here would attach raw attribute names to the wrong inputs.
component_names = [f"PC{i + 1}" for i in range(dt_model.n_features_in_)]

plt.figure(figsize=(20, 10))
plot_tree(dt_model, feature_names=component_names, class_names=list(wine.target_names), filled=True, rounded=True)
plt.title("Decision Tree Visualization")
plt.show()

# Confusion matrix of the decision tree test predictions
# (rows: true class, columns: predicted class)
dt_cm = confusion_matrix(y_test, y_pred)

# Draw the matrix as an image with Matplotlib
plt.figure(figsize=(8, 6))
plt.imshow(dt_cm, cmap='Blues', interpolation='nearest')
plt.colorbar()
plt.title('Confusion Matrix - Matplotlib')

# Label both axes with the class names
tick_marks = np.arange(len(wine.target_names))
plt.xticks(tick_marks, wine.target_names)
plt.yticks(tick_marks, wine.target_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')

# Write each count into its cell; the column index is the x position and the
# row index is the y position
for (i, j), cell_count in np.ndenumerate(dt_cm):
    plt.text(j, i, str(cell_count), ha='center', va='center', color='black')
plt.tight_layout()
plt.show()

# Correct predictions per class sit on the diagonal of the confusion matrix
tp_dt = dt_cm.diagonal()

# Column sums: how often each class was predicted (TP + FP)
predicted_totals = dt_cm.sum(axis=0)

# Macro average: unweighted mean of the per-class precisions, as a percentage
per_class_precision = tp_dt / predicted_totals
macro_average_precision = per_class_precision.mean() * 100

# Print overall precision
print(f"Overall Macro Average Precision: {macro_average_precision:.2f}%")

Overall Macro Average Precision: 90.28%

# Per-class precision for kNN: TP / (TP + FP). Both terms are taken from the
# kNN confusion matrix itself, because the shared `predicted_totals` variable
# holds the decision tree's column sums by the time this runs.
knn_predicted_totals = np.sum(kkn_cm, axis=0)
precision_per_class = np.diagonal(kkn_cm) / knn_predicted_totals * 100  # Convert to percentage

# Print precision per class
print("Precision per class:")
for idx, class_name in enumerate(wine.target_names):
    print(f"Class {class_name}: {precision_per_class[idx]:.2f}%")

Precision per class:
Class class_0: 90.91%
Class class_1: 100.00%
Class class_2: 100.00%

# Per-class precision for the decision tree, as a percentage:
# true positives divided by the number of times each class was predicted
precision_per_class = 100 * tp_dt / predicted_totals

# One line per class, in class order
print("Precision per class:")
for class_name, class_precision in zip(wine.target_names, precision_per_class):
    print(f"Class {class_name}: {class_precision:.2f}%")

Precision per class:
Class class_0: 83.33%
Class class_1: 100.00%
Class class_2: 87.50%

A Comparative Analysis of kNN and Decision Tree for Multi-Class Classification

Understanding the Problem

Aim and Relevance

Dataset Overview

Challenges with the Dataset

Methodology

Dataset Exploration

Class Distribution Analysis

Feature Distribution Analysis

Correlation Analysis

Evaluation Metrics

Establishing a Baseline

Dataset Preprocessing

Outlier Removal

Train-Test Split & Feature Scaling

Handling Correlated Features

Algorithm Implementation

k-Nearest Neighbors (kNN)

Implementation

Cross-Validation

Model Training

Decision Trees

Cross-Validation

Implementation

Model Training

Results

k-Nearest Neighbors (kNN)

Decision Tree

Evaluation