import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
from zipfile import ZipFile
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MiniBatchKMeans, AffinityPropagation, MeanShift, SpectralClustering, AgglomerativeClustering, DBSCAN
%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 12)
Before running the cell below, upload your Kaggle API token (`kaggle.json`) so that the commands do not fail.
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c tabular-playground-series-jul-2022
# Unpack the competition archive into the current working directory.
# The archive is opened by a relative path: the download step writes it to the
# working directory, so the hard-coded '/content/' prefix only resolved when
# the working directory happened to be /content.
with ZipFile('tabular-playground-series-jul-2022.zip', 'r') as zf:
    zf.extractall('./')
# Load the competition table, using the `id` column as the row index.
data = pd.read_csv('data.csv', index_col='id')

# Quick look at the table: first rows, column summary, descriptive statistics.
data.head()
data.info()
data.describe()

# Pairwise feature correlations as an annotated heatmap; the colour scale is
# pinned to [-1, 1] so colours mean the same thing across the whole matrix.
corr_matrix = data.corr()
heatmap_style = {'annot': True, 'vmin': -1, 'vmax': 1, 'cmap': 'RdYlGn'}
sns.heatmap(corr_matrix, **heatmap_style)
Most of the features are essentially uncorrelated with each other.
There are no missing values in the data.
# Print the percentage of missing values per column, or say that there are none.
na_counts = data.isna().sum()
if na_counts.any():
    print(na_counts * 100 / data.shape[0])
else:
    print("No Missing values")
# Fit a PCA that keeps every component, to see how much variance each
# additional component explains.
pca = PCA()
pca.fit(data)

# Cumulative explained variance, in percent, after 1, 2, ..., n components.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_) * 100
# Plot against an explicit 1-based component count: without x values pyplot
# uses the 0-based array index, so the curve would read one component too low.
component_counts = np.arange(1, len(cumulative_variance) + 1)
plt.plot(component_counts, cumulative_variance)
plt.xlabel("Number of features")
plt.ylabel("Explained Variance")
plt.title("Variance vs. No. Features", fontsize=20)
# Project the data onto 12 principal components.
pca = PCA(n_components=12).fit(data)
reduced_data = pca.transform(data)
# Elbow analysis: KMeans inertia (within-cluster sum of squares) on the
# PCA-reduced data for 1 through 9 clusters.
n_clusters = range(1, 10)
wss = [KMeans(n_clusters=k).fit(reduced_data).inertia_ for k in n_clusters]
plt.plot(n_clusters, wss)
From the elbow plot above, we can see that the inertia changes very little across 7, 8, and 9 clusters.
So the optimal number of clusters is probably either 7 or 8.
It is actually 7: I tried 6 and 8, and both gave a very low Rand index value.
# Final KMeans model. It is fitted on the PCA-reduced data, the same
# representation the elbow analysis was run on, rather than on the raw table.
# It uses 7 clusters, the value the notes settle on after trying 6 and 8.
kmeans = KMeans(n_clusters=7)
kmeans.fit(reduced_data)
kmeans_values = kmeans.predict(reduced_data)
# Same elbow analysis with MiniBatchKMeans, over a wider range
# (1 through 19 clusters).
n_clusters = range(1, 20)
wss = [
    MiniBatchKMeans(n_clusters=k).fit(reduced_data).inertia_
    for k in n_clusters
]
plt.plot(n_clusters, wss)
# MiniBatchKMeans with 7 clusters on the PCA-reduced data.
minikmeans = MiniBatchKMeans(n_clusters=7).fit(reduced_data)
mini_values = minikmeans.predict(reduced_data)
# MeanShift on the PCA-reduced data; no cluster count is specified.
# n_jobs=-1 asks scikit-learn to use all available processors.
ms = MeanShift(n_jobs=-1).fit(reduced_data)
ms_values = ms.predict(reduced_data)
# DBSCAN with default parameters; n_jobs=-1 asks for all available processors.
# NOTE(review): this clusters the raw `data`, whereas MiniBatchKMeans and
# MeanShift use `reduced_data` — confirm whether that is intentional.
dbscan = DBSCAN(n_jobs=-1)
dbscan_values = dbscan.fit(data).labels_
values = ms_values
submission = pd.read_csv('/content/sample_submission.csv')
submission['Predicted'] = values
submission.to_csv('output.csv', index=False)
!kaggle competitions submit -c tabular-playground-series-jul-2022 -f output.csv -m "MeanShift + PCA12"
Rand index of this submission (MeanShift + PCA12): 0.23885