import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Customer segmentation pipeline: load the mall-customer CSV, encode Gender,
# balance the Gender classes with SMOTE, standardize the numeric features,
# cluster them with K-Means and plot the clusters.

# The numeric columns that are oversampled, scaled and clustered. Kept in one
# place so the three stages cannot drift apart.
FEATURE_COLUMNS = ['Annual Income (k$)', 'Age', 'Spending Score (1-100)']

# --- Load and inspect -------------------------------------------------------
df = pd.read_csv("/content/sample_data/Mall_Customers.csv")
# A bare `df.head()` is silently discarded when run as a script, so print it.
print(df.head())
print(df.shape)
print(df['Gender'].value_counts())

# --- Convert Gender to numerical values --------------------------------------
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])

# --- Balance the Gender classes with SMOTE -----------------------------------
# Only the real features go into SMOTE. Dropping just 'Gender' would also feed
# any other column in the CSV (e.g. a row identifier, if present) into the
# nearest-neighbour search and interpolate it into meaningless values.
X = df[FEATURE_COLUMNS]
y = df['Gender']  # Target: Gender column
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled['Gender'] = y_resampled  # Add balanced Gender column back

# --- Standardize the features -------------------------------------------------
# The feature columns are overwritten in place with their standardized values.
scaler = StandardScaler()
df_resampled[FEATURE_COLUMNS] = scaler.fit_transform(df_resampled[FEATURE_COLUMNS])

# --- Apply K-Means -------------------------------------------------------------
# n_init is pinned explicitly: its default differs between scikit-learn
# releases, which would otherwise make the clustering version-dependent.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
df_resampled['Cluster'] = kmeans.fit_predict(df_resampled[FEATURE_COLUMNS])

# --- Plot the clusters ---------------------------------------------------------
plt.scatter(
    df_resampled['Annual Income (k$)'],
    df_resampled['Spending Score (1-100)'],
    c=df_resampled['Cluster'],
    cmap='viridis',
)
# The plotted values are the standardized ones, not raw k$ / 1-100 scores,
# so the axis labels say so.
plt.xlabel('Annual Income (standardized)')
plt.ylabel('Spending Score (standardized)')
plt.title('Customer Clusters (After SMOTE & K-Means)')
plt.colorbar(label='Cluster')
plt.show()
# Comments
# Post a Comment