import tqdm
import numpy as np
import pandas as pd
import seaborn as sns

from zipfile import ZipFile
from matplotlib import pyplot as plt

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MiniBatchKMeans, AffinityPropagation, MeanShift, SpectralClustering, AgglomerativeClustering, DBSCAN
%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 12)

Before running the cell below, upload your Kaggle API token (kaggle.json) so the download does not fail with an authentication error.

!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c tabular-playground-series-jul-2022
Downloading tabular-playground-series-jul-2022.zip to /content
 87% 17.0M/19.6M [00:00<00:00, 53.2MB/s]
100% 19.6M/19.6M [00:00<00:00, 55.8MB/s]
with ZipFile('/content/tabular-playground-series-jul-2022.zip', 'r') as zf:
    zf.extractall('./')

Loading the data

data = pd.read_csv('data.csv', index_col='id')
data.head()
f_00 f_01 f_02 f_03 f_04 f_05 f_06 f_07 f_08 f_09 ... f_19 f_20 f_21 f_22 f_23 f_24 f_25 f_26 f_27 f_28
id
0 -0.389420 -0.912791 0.648951 0.589045 -0.830817 0.733624 2.258560 2 13 14 ... -0.478412 -0.757002 -0.763635 -1.090369 1.142641 -0.884274 1.137896 1.309073 1.463002 0.813527
1 -0.689249 -0.453954 0.654175 0.995248 -1.653020 0.863810 -0.090651 2 3 6 ... -0.428791 -0.089908 -1.784204 -0.839474 0.459685 1.759412 -0.275422 -0.852168 0.562457 -2.680541
2 0.809079 0.324568 -1.170602 -0.624491 0.105448 0.783948 1.988301 5 11 5 ... -0.413534 -1.602377 1.190984 3.267116 -0.088322 -2.168635 -0.974989 1.335763 -1.110655 -3.630723
3 -0.500923 0.229049 0.264109 0.231520 0.415012 -1.221269 0.138850 6 2 13 ... 0.619283 1.287801 0.532837 1.036631 -2.041828 1.440490 -1.900191 -0.630771 -0.050641 0.238333
4 -0.671268 -1.039533 -0.270155 -1.830264 -0.290108 -1.852809 0.781898 8 7 5 ... -1.628830 -0.434948 0.322505 0.284326 -2.438365 1.473930 -1.044684 1.602686 -0.405263 -1.987263

5 rows × 29 columns

data.info()
data.describe()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 98000 entries, 0 to 97999
Data columns (total 29 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   f_00    98000 non-null  float64
 1   f_01    98000 non-null  float64
 2   f_02    98000 non-null  float64
 3   f_03    98000 non-null  float64
 4   f_04    98000 non-null  float64
 5   f_05    98000 non-null  float64
 6   f_06    98000 non-null  float64
 7   f_07    98000 non-null  int64  
 8   f_08    98000 non-null  int64  
 9   f_09    98000 non-null  int64  
 10  f_10    98000 non-null  int64  
 11  f_11    98000 non-null  int64  
 12  f_12    98000 non-null  int64  
 13  f_13    98000 non-null  int64  
 14  f_14    98000 non-null  float64
 15  f_15    98000 non-null  float64
 16  f_16    98000 non-null  float64
 17  f_17    98000 non-null  float64
 18  f_18    98000 non-null  float64
 19  f_19    98000 non-null  float64
 20  f_20    98000 non-null  float64
 21  f_21    98000 non-null  float64
 22  f_22    98000 non-null  float64
 23  f_23    98000 non-null  float64
 24  f_24    98000 non-null  float64
 25  f_25    98000 non-null  float64
 26  f_26    98000 non-null  float64
 27  f_27    98000 non-null  float64
 28  f_28    98000 non-null  float64
dtypes: float64(22), int64(7)
memory usage: 22.4 MB
f_00 f_01 f_02 f_03 f_04 f_05 f_06 f_07 f_08 f_09 ... f_19 f_20 f_21 f_22 f_23 f_24 f_25 f_26 f_27 f_28
count 98000.000000 98000.000000 98000.000000 98000.000000 98000.000000 98000.000000 98000.000000 98000.000000 98000.000000 98000.000000 ... 98000.000000 98000.000000 98000.000000 98000.000000 98000.000000 98000.000000 98000.000000 98000.000000 98000.000000 98000.000000
mean 0.001220 0.005580 -0.001042 -0.000700 -0.003522 -0.001612 -0.003042 5.545918 6.763061 8.193163 ... -0.004513 -0.000515 -0.001670 -0.038752 -0.220002 0.166434 -0.064309 -0.062540 0.098472 -0.230910
std 1.002801 1.000742 1.001373 1.000422 1.003061 1.000532 0.997434 3.691840 4.152348 5.904919 ... 1.004372 1.002962 0.999703 1.477858 1.494836 1.543014 1.576086 1.428055 1.305407 1.528476
min -4.732235 -4.202795 -4.377021 -4.010826 -4.535903 -4.300767 -4.894525 0.000000 0.000000 0.000000 ... -4.894525 -4.732235 -4.438130 -6.873999 -8.234305 -7.792363 -6.593842 -7.375719 -7.335556 -6.954151
25% -0.675226 -0.670985 -0.672779 -0.672540 -0.682510 -0.675066 -0.680421 3.000000 4.000000 4.000000 ... -0.678773 -0.679777 -0.675147 -1.022964 -1.203204 -0.903385 -1.128966 -0.975680 -0.746489 -1.262606
50% 0.002022 0.006650 -0.000324 -0.003185 -0.003307 0.001024 -0.002053 5.000000 6.000000 7.000000 ... -0.000587 -0.000806 0.000819 -0.056687 -0.219046 0.167074 -0.099221 -0.070852 0.082230 -0.271319
75% 0.677271 0.677746 0.677086 0.672097 0.677589 0.673344 0.668112 8.000000 9.000000 11.000000 ... 0.672149 0.675437 0.676881 0.930158 0.764690 1.217432 0.987684 0.843212 0.925306 0.770516
max 4.490521 4.324974 4.560247 4.399373 4.050549 4.710316 3.998595 32.000000 30.000000 44.000000 ... 4.560247 4.399373 4.135419 6.517721 6.054831 7.527271 7.544731 7.005608 7.205971 6.977150

8 rows × 29 columns

sns.heatmap(data.corr(), annot=True, vmin=-1, vmax=1, cmap='RdYlGn')
<matplotlib.axes._subplots.AxesSubplot at 0x7fef3917c890>

Most of the features show little to no correlation with each other.
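To make that claim concrete, the following minimal sketch lists the feature pairs with non-negligible absolute correlation; the 0.3 threshold is an arbitrary illustrative choice.

# Minimal sketch: feature pairs with |correlation| above an arbitrary 0.3 threshold.
corr = data.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))  # keep each pair once
pairs = upper.stack().sort_values(ascending=False)
print(pairs[pairs > 0.3])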

There are no missing values in the data.

if data.isna().any().any():
    print(data.isna().sum()*100/data.shape[0])
else:
    print("No Missing values")
No Missing values

Dimensionality Reduction

Principal Component Analysis

pca = PCA()
pca.fit(data)
PCA()
plt.plot(np.cumsum(pca.explained_variance_ratio_)*100)
plt.xlabel("Number of features")
plt.ylabel("Explained Variance")
plt.title("Variance vs. No. Features", fontsize=20)
Text(0.5, 1.0, 'Variance vs. No. Features')
pca = PCA(n_components=12)
pca.fit(data)

reduced_data = pca.transform(data)
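As a quick sanity check (a minimal sketch), we can print how much of the total variance the 12 retained components explain.

# Sanity check: variance explained by the 12 retained components.
print(f"Variance retained: {pca.explained_variance_ratio_.sum()*100:.1f}%")
print("Reduced data shape:", reduced_data.shape)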

Try

  • Clustering with different algorithms (K-Means, DBSCAN)
  • Cluster with a GMM (see the sketch after this list)
  • Repeat the above with feature-engineered data
  • After that, apply PCA or an autoencoder and repeat
  • Visualize the results
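The GMM item could look like the following minimal sketch; n_components=7 is an assumption borrowed from the elbow analysis further down, not a tuned value.

from sklearn.mixture import GaussianMixture

# Minimal GMM sketch: soft clustering on the PCA-reduced data.
# n_components=7 is an assumption taken from the elbow analysis below.
gmm = GaussianMixture(n_components=7, random_state=0)
gmm_values = gmm.fit_predict(reduced_data)
print(np.bincount(gmm_values))  # cluster sizes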

Clustering

K-Means Clustering

wss = list()
n_clusters = range(1, 10)

for n in n_clusters:
    kmeans = KMeans(n_clusters=n)
    kmeans.fit(reduced_data)
    wss.append(kmeans.inertia_)
plt.plot(n_clusters, wss)
[<matplotlib.lines.Line2D at 0x7fef3424c7d0>]

From the elbow plot above, we can see that the decrease in inertia becomes very small around 7, 8, and 9 clusters.

So the optimal number of clusters is likely either 7 or 8.

It is actually 7: I tried 6 and 8 as well, and both gave a noticeably lower Rand index.
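The Rand index itself is only visible after submitting, but candidate cluster counts can be compared locally with the silhouette score. The sketch below uses a 10,000-row subsample to keep it fast; the subsample size is arbitrary and the silhouette score is only a rough proxy for the competition metric.

from sklearn.metrics import silhouette_score

# Minimal sketch: compare candidate k values with the silhouette score
# on a random 10,000-row subsample (scoring all 98k rows is slow).
rng = np.random.RandomState(0)
idx = rng.choice(len(reduced_data), 10000, replace=False)
for k in (6, 7, 8):
    labels = KMeans(n_clusters=k, random_state=0).fit_predict(reduced_data)
    print(f"k={k}: silhouette={silhouette_score(reduced_data[idx], labels[idx]):.3f}")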

kmeans = KMeans(n_clusters=8)
kmeans.fit(data)
kmeans_values = kmeans.predict(data)
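To cover the "visualize the results" item, a minimal sketch projects the data onto the first two principal components and colours the points by their K-Means label; two components capture only part of the variance, so this is purely illustrative.

# Minimal sketch: K-Means labels plotted in the space of the first two principal components.
plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=kmeans_values, s=2, cmap='tab10')
plt.xlabel("PC 1")
plt.ylabel("PC 2")
plt.title("K-Means clusters in PCA space")
plt.show()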

Mini Batch K-Means Clustering

wss = list()
n_clusters = range(1, 20)

for n in n_clusters:
    minikmeans = MiniBatchKMeans(n_clusters=n)
    minikmeans.fit(reduced_data)
    wss.append(minikmeans.inertia_)
plt.plot(n_clusters, wss)
[<matplotlib.lines.Line2D at 0x7fef31325990>]
minikmeans = MiniBatchKMeans(n_clusters=7)
minikmeans.fit(reduced_data)
mini_values = minikmeans.predict(reduced_data)
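One way to check how close the mini-batch solution gets to the full K-Means one is the adjusted Rand index between the two labelings (a minimal sketch; the score ignores label permutations). Since the K-Means model above used 8 clusters and MiniBatchKMeans uses 7, perfect agreement is not expected.

from sklearn.metrics import adjusted_rand_score

# Minimal sketch: agreement between the K-Means and MiniBatchKMeans labelings.
print("ARI(KMeans, MiniBatchKMeans):", adjusted_rand_score(kmeans_values, mini_values))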

Mean Shift Clustering

ms = MeanShift(n_jobs=-1)
ms.fit(reduced_data)
ms_values = ms.predict(reduced_data)
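MeanShift with the default bandwidth can be very slow and sensitive on ~98,000 rows. One option, sketched below, is to estimate a bandwidth from a subsample first with sklearn's estimate_bandwidth; the quantile, sample size, and new variable names (ms_bw, ms_bw_values) are illustrative assumptions and leave the original ms_values untouched.

from sklearn.cluster import estimate_bandwidth

# Minimal sketch: pick a bandwidth from a 5,000-row subsample, then rerun MeanShift.
bandwidth = estimate_bandwidth(reduced_data, quantile=0.2, n_samples=5000, random_state=0)
ms_bw = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=-1)
ms_bw_values = ms_bw.fit_predict(reduced_data)
print("Clusters found:", len(np.unique(ms_bw_values)))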

DBSCAN Clustering

dbscan = DBSCAN(n_jobs=-1)
dbscan_values = dbscan.fit_predict(data)
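DBSCAN's default eps=0.5 is unlikely to suit this 29-dimensional data and may label most points as noise. A common heuristic, sketched below, is to plot the sorted distance to the k-th nearest neighbour and place eps near the knee of the curve; k=10 is an arbitrary illustrative choice.

from sklearn.neighbors import NearestNeighbors

# Minimal sketch: k-distance plot to help choose DBSCAN's eps (k=10 is arbitrary).
nn = NearestNeighbors(n_neighbors=10, n_jobs=-1).fit(reduced_data)
distances, _ = nn.kneighbors(reduced_data)
plt.plot(np.sort(distances[:, -1]))
plt.xlabel("Points sorted by 10th-NN distance")
plt.ylabel("Distance to 10th nearest neighbour")
plt.title("k-distance plot for choosing eps")
plt.show()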

Submission

values = ms_values
submission = pd.read_csv('/content/sample_submission.csv')

submission['Predicted'] = values
submission.to_csv('output.csv', index=False)
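Before submitting, a quick sanity check (a minimal sketch) is to look at how many points each predicted cluster received.

# Minimal sketch: size of each predicted cluster in the submission.
print(submission['Predicted'].value_counts())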
!kaggle competitions submit -c tabular-playground-series-jul-2022 -f output.csv -m "MeanShift + PCA12"

Rand Index is: 0.23885