Clustering with KMeans and PCA

  Example on Wine data

Posted by Haby on August 8, 2018


This Wine data set contains the results of a chemical analysis of wines grown in a specific area of Italy. I use the KMeans algorithm to cluster the wines, then check whether the clustering is correct by comparing it against the label variable.


1. Import Packages and Data

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

import os
print(os.listdir("../input"))
['Wine.csv']
df = pd.read_csv("../input/Wine.csv")
df.head()
Alcohol Malic_Acid Ash Ash_Alcanity Magnesium Total_Phenols Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity Hue OD280 Proline Customer_Segment
0 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065 1
1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050 1
2 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185 1
3 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480 1
4 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735 1
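
As an aside, if the Kaggle CSV is not at hand, a near-identical copy of this dataset ships with scikit-learn. A minimal sketch (note the built-in feature names differ from the CSV's, and the target column stands in for Customer_Segment):

from sklearn.datasets import load_wine

# load scikit-learn's built-in copy of the Wine data (an alternative source,
# not used in the rest of this post)
wine = load_wine()
df_alt = pd.DataFrame(wine.data, columns=wine.feature_names)
df_alt["Customer_Segment"] = wine.target + 1  # shift to 1-based to match the CSV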

2. Basic Data Information

# set the labels aside, then drop Customer_Segment so only features remain
label = df.Customer_Segment
df = df.drop("Customer_Segment",axis = 1)
df.describe()
Alcohol Malic_Acid Ash Ash_Alcanity Magnesium Total_Phenols Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity Hue OD280 Proline
count 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000
mean 13.000618 2.336348 2.366517 19.494944 99.741573 2.295112 2.029270 0.361854 1.590899 5.058090 0.957449 2.611685 746.893258
std 0.811827 1.117146 0.274344 3.339564 14.282484 0.625851 0.998859 0.124453 0.572359 2.318286 0.228572 0.709990 314.907474
min 11.030000 0.740000 1.360000 10.600000 70.000000 0.980000 0.340000 0.130000 0.410000 1.280000 0.480000 1.270000 278.000000
25% 12.362500 1.602500 2.210000 17.200000 88.000000 1.742500 1.205000 0.270000 1.250000 3.220000 0.782500 1.937500 500.500000
50% 13.050000 1.865000 2.360000 19.500000 98.000000 2.355000 2.135000 0.340000 1.555000 4.690000 0.965000 2.780000 673.500000
75% 13.677500 3.082500 2.557500 21.500000 107.000000 2.800000 2.875000 0.437500 1.950000 6.200000 1.120000 3.170000 985.000000
max 14.830000 5.800000 3.230000 30.000000 162.000000 3.880000 5.080000 0.660000 3.580000 13.000000 1.710000 4.000000 1680.000000
# check for missing values
df.isnull().sum()
Alcohol                 0
Malic_Acid              0
Ash                     0
Ash_Alcanity            0
Magnesium               0
Total_Phenols           0
Flavanoids              0
Nonflavanoid_Phenols    0
Proanthocyanins         0
Color_Intensity         0
Hue                     0
OD280                   0
Proline                 0
dtype: int64

3. Data Analysis

Skewness, Outliers, and Correlation

def plot_multi_variable(row, column):
    """
    Plot a histogram for every dataframe column on a row x column grid.
    """
    _, ax = plt.subplots(row, column, figsize=(12, 14))
    for i in range(row):
        for j in range(column):
            idx = column * i + j  # flat index into the feature columns
            if idx >= df.shape[1]:
                break
            # sns.distplot is deprecated in newer seaborn;
            # sns.histplot(..., kde=True) is the modern equivalent
            sns.distplot(df.iloc[:, idx], color="red", bins=20, ax=ax[i, j])

plot_multi_variable(4,4)

[Figure: distribution plots (histogram + KDE) for each of the 13 features]

def plot_boxplot_multi_variable(row, column):
    """
    Plot a boxplot for every dataframe column on a row x column grid.
    """
    _, ax = plt.subplots(row, column, figsize=(12, 14))
    for i in range(row):
        for j in range(column):
            idx = column * i + j  # flat index into the feature columns
            if idx >= df.shape[1]:
                break
            sns.boxplot(y=df.iloc[:, idx], color="lightblue", ax=ax[i, j])

plot_boxplot_multi_variable(4,4)

[Figure: boxplots for each of the 13 features]

plt.figure(figsize=(10, 10))
corr = df.corr()
mask = np.zeros_like(corr, dtype=bool)  # hide the redundant upper triangle
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr, mask=mask, cmap=plt.cm.PuOr, square=True, annot=True)

[Figure: lower-triangle correlation heatmap of the 13 features]

Flavanoids vs Total_Phenols : 0.86

OD280 vs Flavanoids : 0.79

OD280 vs Total_Phenols : 0.70

With correlations this strong, these three variables are clearly not independent of one another.
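
These pairs can also be pulled out programmatically rather than read off the heatmap; a small sketch using the corr matrix computed above:

# list feature pairs with |correlation| > 0.7, using only the upper triangle
upper = np.triu(np.ones(corr.shape, dtype=bool), k=1)
corr_pairs = corr.where(upper).stack().sort_values(ascending=False)
print(corr_pairs[corr_pairs.abs() > 0.7])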

_, ax = plt.subplots(1, 3, figsize=(18, 3))
sns.regplot(x=df["Flavanoids"], y=df["Total_Phenols"], ax=ax[0], color="red")
sns.regplot(x=df["Flavanoids"], y=df["OD280"], ax=ax[1], color="green")
sns.regplot(x=df["Total_Phenols"], y=df["OD280"], ax=ax[2], color="blue")

[Figure: pairwise regression plots for Flavanoids, Total_Phenols, and OD280]

4. Clustering

KMeans is sensitive to outliers and, because it relies on Euclidean distance, to differences in feature scale; the features here have wildly different ranges, so I standardize them first.
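
To see why, compare the raw spreads (the numbers match the describe() table above): Proline's standard deviation is about 315 while Hue's is about 0.23, so unscaled Euclidean distances would be driven almost entirely by Proline.

# largest and smallest feature spreads -- the motivation for StandardScaler
print(df.std().sort_values(ascending=False).head(3))  # Proline ~315 dominates
print(df.std().sort_values().head(3))                 # Nonflavanoid_Phenols ~0.12, Hue ~0.23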

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans

pl = Pipeline([
    ("scale",StandardScaler()),
    ("Kmeans", KMeans(n_clusters = 2,random_state = 13))
])

kmeans = pl.fit(df)
# centroids (in standardized units)
print(kmeans.named_steps["Kmeans"].cluster_centers_)
# inertia: total within-cluster sum of squares -- smaller is better
print(kmeans.named_steps["Kmeans"].inertia_)
[[-0.31148001  0.33837268 -0.0499309   0.46976489 -0.3074597  -0.75037054
  -0.789532    0.56770273 -0.61153123  0.0982258  -0.5400717  -0.68516469
  -0.58021779]
 [ 0.32580094 -0.35393004  0.05222657 -0.49136328  0.32159578  0.78487033
   0.82583232 -0.59380401  0.63964761 -0.10274193  0.56490258  0.71666652
   0.60689447]]
1658.7588524290954
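
The centroids above are in standardized units. To read them in the original feature units, one can undo the scaling; a small sketch using the already-fitted pipeline:

# map the centroids back to the original feature units
scaler = kmeans.named_steps["scale"]
centers = kmeans.named_steps["Kmeans"].cluster_centers_
centroids_original = pd.DataFrame(scaler.inverse_transform(centers), columns=df.columns)
print(centroids_original.round(2))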
# decide how many clusters fits best (elbow method)
withinss = []
for k in range(1, 10):
    pl = Pipeline([
        ("scale", StandardScaler()),
        ("Kmeans", KMeans(n_clusters=k, random_state=13))
    ])
    kmeans = pl.fit(df)
    withinss.append(kmeans.named_steps["Kmeans"].inertia_)
plt.plot(range(1, 10), withinss)  # plot against k, not the list index
plt.ylabel("Total within-cluster sum of squares")
plt.xlabel("Number of clusters")

[Figure: elbow plot of total within-cluster SSE against the number of clusters]

K = 3 is the best choice based on the elbow criterion.

At k = 3, the within-cluster SSE is 1277, down from 1658 at k = 2, and the curve flattens noticeably beyond that point.
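
The elbow is somewhat subjective, so a second criterion is worth a look. A sketch using the mean silhouette score (higher is better); the printed values should be checked rather than assumed:

from sklearn.metrics import silhouette_score

# silhouette score for a few candidate k values, on standardized features
X_scaled = StandardScaler().fit_transform(df)
for k in range(2, 6):
    labels_k = KMeans(n_clusters=k, random_state=13).fit_predict(X_scaled)
    print(k, round(silhouette_score(X_scaled, labels_k), 3))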

pl_opt = Pipeline([
    ("scale",StandardScaler()),
    ("Kmeans", KMeans(n_clusters = 3,random_state = 13))
])

kmeans_opt = pl_opt.fit(df)
print(kmeans_opt.named_steps["Kmeans"].inertia_)
1277.9284888446423
df["Cluster"] = kmeans_opt.named_steps["Kmeans"].labels_
# centroids of the k = 3 solution, still in standardized units
centroids = kmeans_opt.named_steps["Kmeans"].cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='r', zorder=10)

[Figure: the three cluster centroids plotted on the first two standardized features]
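
On its own, the centroid plot lacks context. A sketch that overlays the standardized data points (using the already-fitted scaler; plotting the first two features here is my choice, not the original post's):

# overlay standardized samples on the centroids for the first two features
scaled = kmeans_opt.named_steps["scale"].transform(df.drop("Cluster", axis=1))
plt.scatter(scaled[:, 0], scaled[:, 1], c=df["Cluster"], alpha=0.5)
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3, color='r', zorder=10)
plt.xlabel("Alcohol (standardized)")
plt.ylabel("Malic_Acid (standardized)")
plt.show()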

g = sns.PairGrid(df, hue="Cluster", hue_kws={"marker": ["o", "s", "D"]},
                 vars=df.columns.drop("Cluster"))
g = g.map_diag(sns.kdeplot)
g = g.map_upper(plt.scatter)
g = g.map_lower(sns.kdeplot)

[Figure: pair grid of all features, colored by cluster]

PCA analysis

from sklearn.decomposition import PCA
pipeline_pca = Pipeline([
    ("scale" , StandardScaler()),
    ("PCA" ,PCA(n_components = 13,random_state = 13))
])
pca_opt = pipeline_pca.fit_transform(df.drop("Cluster", axis=1))
color = ["red" if i == 0 else "blue" if i == 1 else "yellow" for i in df["Cluster"]]
plt.figure(figsize=(7, 7))
plt.scatter(pca_opt[:, 0], pca_opt[:, 1], c=color, alpha=0.8)  # first two PCs
plt.title("PCA Decomposition with 3 Clusters")
plt.show()

[Figure: samples projected onto the first two principal components, colored by cluster]
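
How much of the variance do the two plotted components actually carry? The fitted PCA step reports this directly; a quick check (an addition, not from the original post):

# explained-variance ratios of the leading principal components
pca_fitted = pipeline_pca.named_steps["PCA"]
print(pca_fitted.explained_variance_ratio_[:3].round(3))
print("first two PCs explain", round(pca_fitted.explained_variance_ratio_[:2].sum(), 3),
      "of the total variance")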

5. Error Check

from sklearn.metrics import classification_report
# map label values and cluster ids onto the same A/B/C names; the
# cluster -> label correspondence below was found by inspection
label_new = ["A" if i == 1 else "B" if i == 2 else "C" for i in label]
Cluster_new = ["A" if i == 2 else "B" if i == 0 else "C" for i in df["Cluster"]]
print(classification_report(label_new, Cluster_new))  # argument order: (y_true, y_pred)
              precision    recall  f1-score   support

           A       0.95      1.00      0.98        59
           B       1.00      0.92      0.96        71
           C       0.94      1.00      0.97        48

   micro avg       0.97      0.97      0.97       178
   macro avg       0.96      0.97      0.97       178
weighted avg       0.97      0.97      0.97       178
# Accuracy: fraction of samples whose cluster matches the true label
Correct = [1 if label_new[i] == Cluster_new[i] else 0 for i in range(len(label_new))]
print("Total Accuracy is:", np.mean(Correct))
Total Accuracy is: 0.9662921348314607
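
The A/B/C remapping above was found by inspecting the clusters by hand. A sketch of automating it with a majority-vote mapping (this assumes each cluster's most common true label is unique to it, which holds here):

# cross-tabulate clusters against true labels, map each cluster to its majority label
ct = pd.crosstab(df["Cluster"], label)
mapping = ct.idxmax(axis=1).to_dict()  # cluster id -> most frequent true label
pred = df["Cluster"].map(mapping)
print("Accuracy:", (pred == label).mean())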