import tqdm
import numpy as np
import pandas as pd
import seaborn as sns

from zipfile import ZipFile
from matplotlib import pyplot as plt

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MiniBatchKMeans, AffinityPropagation, MeanShift, SpectralClustering, AgglomerativeClustering, DBSCAN
%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 12)

Before running the cell below, upload your Kaggle API token (kaggle.json) so the download does not fail with an authentication error.

!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c tabular-playground-series-jul-2022
Downloading tabular-playground-series-jul-2022.zip to /content
 87% 17.0M/19.6M [00:00<00:00, 53.2MB/s]
100% 19.6M/19.6M [00:00<00:00, 55.8MB/s]
with ZipFile('/content/tabular-playground-series-jul-2022.zip', 'r') as zf:
    zf.extractall('./')

Loading the data

data = pd.read_csv('data.csv', index_col='id')
data.head()
f_00 f_01 f_02 f_03 f_04 f_05 f_06 f_07 f_08 f_09 ... f_19 f_20 f_21 f_22 f_23 f_24 f_25 f_26 f_27 f_28
id
0 -0.389420 -0.912791 0.648951 0.589045 -0.830817 0.733624 2.258560 2 13 14 ... -0.478412 -0.757002 -0.763635 -1.090369 1.142641 -0.884274 1.137896 1.309073 1.463002 0.813527
1 -0.689249 -0.453954 0.654175 0.995248 -1.653020 0.863810 -0.090651 2 3 6 ... -0.428791 -0.089908 -1.784204 -0.839474 0.459685 1.759412 -0.275422 -0.852168 0.562457 -2.680541
2 0.809079 0.324568 -1.170602 -0.624491 0.105448 0.783948 1.988301 5 11 5 ... -0.413534 -1.602377 1.190984 3.267116 -0.088322 -2.168635 -0.974989 1.335763 -1.110655 -3.630723
3 -0.500923 0.229049 0.264109 0.231520 0.415012 -1.221269 0.138850 6 2 13 ... 0.619283 1.287801 0.532837 1.036631 -2.041828 1.440490 -1.900191 -0.630771 -0.050641 0.238333
4 -0.671268 -1.039533 -0.270155 -1.830264 -0.290108 -1.852809 0.781898 8 7 5 ... -1.628830 -0.434948 0.322505 0.284326 -2.438365 1.473930 -1.044684 1.602686 -0.405263 -1.987263

5 rows × 29 columns

data.info()
data.describe()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 98000 entries, 0 to 97999
Data columns (total 29 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   f_00    98000 non-null  float64
 1   f_01    98000 non-null  float64
 2   f_02    98000 non-null  float64
 3   f_03    98000 non-null  float64
 4   f_04    98000 non-null  float64
 5   f_05    98000 non-null  float64
 6   f_06    98000 non-null  float64
 7   f_07    98000 non-null  int64  
 8   f_08    98000 non-null  int64  
 9   f_09    98000 non-null  int64  
 10  f_10    98000 non-null  int64  
 11  f_11    98000 non-null  int64  
 12  f_12    98000 non-null  int64  
 13  f_13    98000 non-null  int64  
 14  f_14    98000 non-null  float64
 15  f_15    98000 non-null  float64
 16  f_16    98000 non-null  float64
 17  f_17    98000 non-null  float64
 18  f_18    98000 non-null  float64
 19  f_19    98000 non-null  float64
 20  f_20    98000 non-null  float64
 21  f_21    98000 non-null  float64
 22  f_22    98000 non-null  float64
 23  f_23    98000 non-null  float64
 24  f_24    98000 non-null  float64
 25  f_25    98000 non-null  float64
 26  f_26    98000 non-null  float64
 27  f_27    98000 non-null  float64
 28  f_28    98000 non-null  float64
dtypes: float64(22), int64(7)
memory usage: 22.4 MB
f_00 f_01 f_02 f_03 f_04 f_05 f_06 f_07 f_08 f_09 ... f_19 f_20 f_21 f_22 f_23 f_24 f_25 f_26 f_27 f_28
count 98000.000000 98000.000000 98000.000000 98000.000000 98000.000000 98000.000000 98000.000000 98000.000000 98000.000000 98000.000000 ... 98000.000000 98000.000000 98000.000000 98000.000000 98000.000000 98000.000000 98000.000000 98000.000000 98000.000000 98000.000000
mean 0.001220 0.005580 -0.001042 -0.000700 -0.003522 -0.001612 -0.003042 5.545918 6.763061 8.193163 ... -0.004513 -0.000515 -0.001670 -0.038752 -0.220002 0.166434 -0.064309 -0.062540 0.098472 -0.230910
std 1.002801 1.000742 1.001373 1.000422 1.003061 1.000532 0.997434 3.691840 4.152348 5.904919 ... 1.004372 1.002962 0.999703 1.477858 1.494836 1.543014 1.576086 1.428055 1.305407 1.528476
min -4.732235 -4.202795 -4.377021 -4.010826 -4.535903 -4.300767 -4.894525 0.000000 0.000000 0.000000 ... -4.894525 -4.732235 -4.438130 -6.873999 -8.234305 -7.792363 -6.593842 -7.375719 -7.335556 -6.954151
25% -0.675226 -0.670985 -0.672779 -0.672540 -0.682510 -0.675066 -0.680421 3.000000 4.000000 4.000000 ... -0.678773 -0.679777 -0.675147 -1.022964 -1.203204 -0.903385 -1.128966 -0.975680 -0.746489 -1.262606
50% 0.002022 0.006650 -0.000324 -0.003185 -0.003307 0.001024 -0.002053 5.000000 6.000000 7.000000 ... -0.000587 -0.000806 0.000819 -0.056687 -0.219046 0.167074 -0.099221 -0.070852 0.082230 -0.271319
75% 0.677271 0.677746 0.677086 0.672097 0.677589 0.673344 0.668112 8.000000 9.000000 11.000000 ... 0.672149 0.675437 0.676881 0.930158 0.764690 1.217432 0.987684 0.843212 0.925306 0.770516
max 4.490521 4.324974 4.560247 4.399373 4.050549 4.710316 3.998595 32.000000 30.000000 44.000000 ... 4.560247 4.399373 4.135419 6.517721 6.054831 7.527271 7.544731 7.005608 7.205971 6.977150

8 rows × 29 columns

sns.heatmap(data.corr(), annot=True, vmin=-1, vmax=1, cmap='RdYlGn')
<matplotlib.axes._subplots.AxesSubplot at 0x7fef3917c890>

Most of the features show little to no correlation with each other.
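To make that claim concrete, the following minimal sketch lists the feature pairs with non-negligible absolute correlation; the 0.3 threshold is an arbitrary illustrative choice.

# Minimal sketch: feature pairs with |correlation| above an arbitrary 0.3 threshold.
corr = data.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))  # keep each pair once
pairs = upper.stack().sort_values(ascending=False)
print(pairs[pairs > 0.3])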

There are no missing values in the data.

if data.isna().any().any():
    print(data.isna().sum()*100/data.shape[0])
else:
    print("No Missing values")
No Missing values

Dimensionality Reduction

Principal Component Analysis

pca = PCA()
pca.fit(data)
PCA()
plt.plot(np.cumsum(pca.explained_variance_ratio_)*100)
plt.xlabel("Number of features")
plt.ylabel("Explained Variance")
plt.title("Variance vs. No. Features", fontsize=20)
Text(0.5, 1.0, 'Variance vs. No. Features')
pca = PCA(n_components=12)
pca.fit(data)

reduced_data = pca.transform(data)
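As a quick sanity check (a minimal sketch), we can print how much of the total variance the 12 retained components explain.

# Sanity check: variance explained by the 12 retained components.
print(f"Variance retained: {pca.explained_variance_ratio_.sum()*100:.1f}%")
print("Reduced data shape:", reduced_data.shape)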

Try

  • Clustering with different algorithms (K-Means, DBSCAN)
  • Cluster with a GMM (see the sketch after this list)
  • Repeat the above with feature-engineered data
  • After that, apply PCA or an autoencoder and repeat
  • Visualize the results
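The GMM item could look like the following minimal sketch; n_components=7 is an assumption borrowed from the elbow analysis further down, not a tuned value.

from sklearn.mixture import GaussianMixture

# Minimal GMM sketch: soft clustering on the PCA-reduced data.
# n_components=7 is an assumption taken from the elbow analysis below.
gmm = GaussianMixture(n_components=7, random_state=0)
gmm_values = gmm.fit_predict(reduced_data)
print(np.bincount(gmm_values))  # cluster sizes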

Clustering

K-Means Clustering

wss = list()
n_clusters = range(1, 10)

for n in n_clusters:
    kmeans = KMeans(n_clusters=n)
    kmeans.fit(reduced_data)
    wss.append(kmeans.inertia_)
plt.plot(n_clusters, wss)
[<matplotlib.lines.Line2D at 0x7fef3424c7d0>]

From the elbow plot above, we can see that the decrease in inertia becomes very small around 7, 8, and 9 clusters.

So the optimal number of clusters is likely either 7 or 8.

It is actually 7: I tried 6 and 8 as well, and both gave a noticeably lower Rand index.
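The Rand index itself is only visible after submitting, but candidate cluster counts can be compared locally with the silhouette score. The sketch below uses a 10,000-row subsample to keep it fast; the subsample size is arbitrary and the silhouette score is only a rough proxy for the competition metric.

from sklearn.metrics import silhouette_score

# Minimal sketch: compare candidate k values with the silhouette score
# on a random 10,000-row subsample (scoring all 98k rows is slow).
rng = np.random.RandomState(0)
idx = rng.choice(len(reduced_data), 10000, replace=False)
for k in (6, 7, 8):
    labels = KMeans(n_clusters=k, random_state=0).fit_predict(reduced_data)
    print(f"k={k}: silhouette={silhouette_score(reduced_data[idx], labels[idx]):.3f}")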

kmeans = KMeans(n_clusters=8)
kmeans.fit(data)
kmeans_values = kmeans.predict(data)
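To cover the "visualize the results" item, a minimal sketch projects the data onto the first two principal components and colours the points by their K-Means label; two components capture only part of the variance, so this is purely illustrative.

# Minimal sketch: K-Means labels plotted in the space of the first two principal components.
plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=kmeans_values, s=2, cmap='tab10')
plt.xlabel("PC 1")
plt.ylabel("PC 2")
plt.title("K-Means clusters in PCA space")
plt.show()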

Mini Batch K-Means Clustering

wss = list()
n_clusters = range(1, 20)

for n in n_clusters:
    minikmeans = MiniBatchKMeans(n_clusters=n)
    minikmeans.fit(reduced_data)
    wss.append(minikmeans.inertia_)
plt.plot(n_clusters, wss)
[<matplotlib.lines.Line2D at 0x7fef31325990>]
minikmeans = MiniBatchKMeans(n_clusters=7)
minikmeans.fit(reduced_data)
mini_values = minikmeans.predict(reduced_data)
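One way to check how close the mini-batch solution gets to the full K-Means one is the adjusted Rand index between the two labelings (a minimal sketch; the score ignores label permutations). Since the K-Means model above used 8 clusters and MiniBatchKMeans uses 7, perfect agreement is not expected.

from sklearn.metrics import adjusted_rand_score

# Minimal sketch: agreement between the K-Means and MiniBatchKMeans labelings.
print("ARI(KMeans, MiniBatchKMeans):", adjusted_rand_score(kmeans_values, mini_values))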

Mean Shift Clustering

ms = MeanShift(n_jobs=-1)
ms.fit(reduced_data)
ms_values = ms.predict(reduced_data)
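MeanShift with the default bandwidth can be very slow and sensitive on ~98,000 rows. One option, sketched below, is to estimate a bandwidth from a subsample first with sklearn's estimate_bandwidth; the quantile, sample size, and new variable names (ms_bw, ms_bw_values) are illustrative assumptions and leave the original ms_values untouched.

from sklearn.cluster import estimate_bandwidth

# Minimal sketch: pick a bandwidth from a 5,000-row subsample, then rerun MeanShift.
bandwidth = estimate_bandwidth(reduced_data, quantile=0.2, n_samples=5000, random_state=0)
ms_bw = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=-1)
ms_bw_values = ms_bw.fit_predict(reduced_data)
print("Clusters found:", len(np.unique(ms_bw_values)))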

DBSCAN Clustering

dbscan = DBSCAN(n_jobs=-1)
dbscan_values = dbscan.fit_predict(data)
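DBSCAN's default eps=0.5 is unlikely to suit this 29-dimensional data and may label most points as noise. A common heuristic, sketched below, is to plot the sorted distance to the k-th nearest neighbour and place eps near the knee of the curve; k=10 is an arbitrary illustrative choice.

from sklearn.neighbors import NearestNeighbors

# Minimal sketch: k-distance plot to help choose DBSCAN's eps (k=10 is arbitrary).
nn = NearestNeighbors(n_neighbors=10, n_jobs=-1).fit(reduced_data)
distances, _ = nn.kneighbors(reduced_data)
plt.plot(np.sort(distances[:, -1]))
plt.xlabel("Points sorted by 10th-NN distance")
plt.ylabel("Distance to 10th nearest neighbour")
plt.title("k-distance plot for choosing eps")
plt.show()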

Submission

values = ms_values
submission = pd.read_csv('/content/sample_submission.csv')

submission['Predicted'] = values
submission.to_csv('output.csv', index=False)
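Before submitting, a quick sanity check (a minimal sketch) is to look at how many points each predicted cluster received.

# Minimal sketch: size of each predicted cluster in the submission.
print(submission['Predicted'].value_counts())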
!kaggle competitions submit -c tabular-playground-series-jul-2022 -f output.csv -m "MeanShift + PCA12"

Rand Index is: 0.23885