credit_card_clustering_autoencoder
credit_card_clustering_autoencoder
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2Cbo3KpwX0v0"
},
"outputs": [],
"source": [
"# Step 1: Install required libraries\n",
"!pip install torch torchvision datasets scikit-learn matplotlib seaborn
tqdm\n",
"!pip install datasets --upgrade"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "qY5lP9oiX0v1"
},
"outputs": [],
"source": [
"# Step 2: Import libraries\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"from torch.utils.data import Dataset, DataLoader\n",
"from datasets import load_dataset\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.manifold import TSNE\n",
"from sklearn.metrics import silhouette_score, davies_bouldin_score,
calinski_harabasz_score\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from tqdm import tqdm\n",
"import warnings\n",
"warnings.filterwarnings('ignore')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "1olmebAKX0v1"
},
"outputs": [],
"source": [
"# Step 3: Load and preprocess the Hugging Face imodels/credit-card
dataset\n",
"!rm -rf ~/.cache/huggingface/datasets # Clear cache to avoid loading
issues\n",
"\n",
"# Check for GPU\n",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
"if device.type == 'cpu':\n",
" print('WARNING: No GPU detected. Training will be slower. To enable
GPU, go to Runtime > Change runtime type > Select GPU.')\n",
"else:\n",
" print(f'Using device: {device}')\n",
"\n",
"# Load dataset\n",
"try:\n",
" print('Loading imodels/credit-card dataset...')\n",
" dataset = load_dataset('imodels/credit-card',
download_mode='force_redownload')['train']\n",
" print('Dataset loaded successfully')\n",
"except Exception as e:\n",
" print(f'Error loading dataset: {e}')\n",
" raise\n",
"\n",
"# Convert to pandas DataFrame\n",
"df = pd.DataFrame(dataset)\n",
"\n",
"# Inspect dataset\n",
"print('Dataset sample:')\n",
"print(df.head())\n",
"print('\\nUnique values per feature:')\n",
"for col in df.columns:\n",
" print(f'{col}: {df[col].unique()}')\n",
"\n",
"# Define feature and target columns\n",
"features = [col for col in df.columns if col !=
'default.payment.next.month']\n",
"target = 'default.payment.next.month'\n",
"\n",
"# Mapping for categorical features (for interpretation)\n",
"mapping_info = {\n",
" 'SEX': {1: 'male', 2: 'female'},\n",
" 'EDUCATION': {0: 'other', 1: 'graduate school', 2: 'university', 3:
'high school', 4: 'other', 5: 'other', 6: 'other'},\n",
" 'MARRIAGE': {0: 'other', 1: 'married', 2: 'single', 3: 'other'},\n",
" 'default.payment.next.month': {0: 'no default', 1: 'default'}\n",
"}\n",
"print('\\nMapping of integer values to categories:')\n",
"for col, mapping in mapping_info.items():\n",
" print(f'{col}: {mapping}')\n",
"\n",
"# Scale numerical features\n",
"numerical_features = ['LIMIT_BAL', 'AGE', 'BILL_AMT1', 'BILL_AMT2',
'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',\n",
" 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4',
'PAY_AMT5', 'PAY_AMT6']\n",
"scaler = StandardScaler()\n",
"df[numerical_features] = scaler.fit_transform(df[numerical_features])\n",
"\n",
"# Categorical features are already integer-encoded\n",
"X = df[features].astype(np.float32).values\n",
"y = df[target].astype(np.float32).values\n",
"\n",
"# Create PyTorch dataset\n",
"class CreditCardDataset(Dataset):\n",
" def __init__(self, features):\n",
" self.features = torch.tensor(features, dtype=torch.float32)\n",
" def __len__(self):\n",
" return len(self.features)\n",
" def __getitem__(self, idx):\n",
" return self.features[idx]\n",
"\n",
"credit_dataset = CreditCardDataset(X)\n",
"dataloader = DataLoader(credit_dataset, batch_size=64, shuffle=True)\n",
"print('Step 3 complete')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "lT3Y7Uo_X0v2"
},
"outputs": [],
"source": [
"# Step 4: Define the Autoencoder with modified architecture\n",
"class Autoencoder(nn.Module):\n",
" def __init__(self, input_dim=24, latent_dim=4):\n",
" super(Autoencoder, self).__init__()\n",
" # Encoder: deeper with dropout\n",
" self.encoder = nn.Sequential(\n",
" nn.Linear(input_dim, 16),\n",
" nn.ReLU(),\n",
" nn.Dropout(0.2),\n",
" nn.Linear(16, 8),\n",
" nn.ReLU(),\n",
" nn.Dropout(0.2),\n",
" nn.Linear(8, latent_dim),\n",
" nn.ReLU()\n",
" )\n",
" # Decoder: symmetric with dropout\n",
" self.decoder = nn.Sequential(\n",
" nn.Linear(latent_dim, 8),\n",
" nn.ReLU(),\n",
" nn.Dropout(0.2),\n",
" nn.Linear(8, 16),\n",
" nn.ReLU(),\n",
" nn.Dropout(0.2),\n",
" nn.Linear(16, input_dim)\n",
" )\n",
"\n",
" def forward(self, x):\n",
" encoded = self.encoder(x)\n",
" decoded = self.decoder(encoded)\n",
" return decoded, encoded\n",
"\n",
"# Initialize model\n",
"model = Autoencoder(input_dim=24, latent_dim=4).to(device)\n",
"criterion = nn.MSELoss()\n",
"optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
"print('Step 4 complete')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "U8vKKBBOX0v2"
},
"outputs": [],
"source": [
"# Step 5: Train the Autoencoder (30 epochs)\n",
"num_epochs = 30\n",
"losses = []\n",
"\n",
"model.train()\n",
"for epoch in range(num_epochs):\n",
" epoch_loss = 0\n",
" for batch in tqdm(dataloader, desc=f'Epoch {epoch+1}/{num_epochs}'):\
n",
" batch = batch.to(device)\n",
" optimizer.zero_grad()\n",
" output, _ = model(batch)\n",
" loss = criterion(output, batch)\n",
" loss.backward()\n",
" optimizer.step()\n",
" epoch_loss += loss.item()\n",
" avg_loss = epoch_loss / len(dataloader)\n",
" losses.append(avg_loss)\n",
" print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.6f}')\n",
"\n",
"# Plot training loss\n",
"plt.figure(figsize=(10, 5))\n",
"plt.plot(losses, color='blue', label='Training Loss')\n",
"plt.title('Autoencoder Training Loss')\n",
"plt.xlabel('Epoch')\n",
"plt.ylabel('MSE Loss')\n",
"plt.legend()\n",
"plt.grid(True)\n",
"plt.show()\n",
"print('Step 5 complete')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "9gD2qJ-kX0v3"
},
"outputs": [],
"source": [
"# Step 6: Extract latent representations\n",
"model.eval()\n",
"latent_features = []\n",
"with torch.no_grad():\n",
" for batch in dataloader:\n",
" batch = batch.to(device)\n",
" _, encoded = model(batch)\n",
" latent_features.append(encoded.cpu().numpy())\n",
"latent_features = np.concatenate(latent_features, axis=0)\n",
"print('Step 6 complete')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "jJ2j3KxWX0v3"
},
"outputs": [],
"source": [
"# Step 7: Apply K-Means clustering\n",
"kmeans = KMeans(n_clusters=2, random_state=42)\n",
"cluster_labels = kmeans.fit_predict(latent_features)\n",
"print('Step 7 complete')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "a3lKKBBOX0v4"
},
"outputs": [],
"source": [
"# Step 8: Evaluate clustering\n",
"silhouette = silhouette_score(latent_features, cluster_labels)\n",
"davies_bouldin = davies_bouldin_score(latent_features, cluster_labels)\n",
"calinski_harabasz = calinski_harabasz_score(latent_features,
cluster_labels)\n",
"\n",
"print('Clustering Evaluation Metrics:')\n",
"print(f'Silhouette Score: {silhouette:.4f}')\n",
"print(f'Davies-Bouldin Index: {davies_bouldin:.4f}')\n",
"print(f'Calinski-Harabasz Index: {calinski_harabasz:.4f}')\n",
"print('Step 8 complete')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "t3X5pGCRX0v4"
},
"outputs": [],
"source": [
"# Step 9: Visualize with t-SNE\n",
"tsne = TSNE(n_components=2, random_state=42)\n",
"tsne_features = tsne.fit_transform(latent_features)\n",
"\n",
"# Plot predicted clusters\n",
"plt.figure(figsize=(12, 5))\n",
"plt.subplot(1, 2, 1)\n",
"sns.scatterplot(x=tsne_features[:, 0], y=tsne_features[:, 1],
hue=cluster_labels, palette=['blue', 'red'], legend='full')\n",
"plt.title('t-SNE: Predicted Clusters')\n",
"plt.xlabel('t-SNE Component 1')\n",
"plt.ylabel('t-SNE Component 2')\n",
"\n",
"# Plot true labels\n",
"plt.subplot(1, 2, 2)\n",
"sns.scatterplot(x=tsne_features[:, 0], y=tsne_features[:, 1], hue=y,
palette=['green', 'purple'], legend='full')\n",
"plt.title('t-SNE: True Labels')\n",
"plt.xlabel('t-SNE Component 1')\n",
"plt.ylabel('t-SNE Component 2')\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n",
"print('Step 9 complete')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.11"
},
"colab": {
"provenance": []
}
},
"nbformat": 4,
"nbformat_minor": 0
}