
credit_card_clustering_autoencoder

This document outlines a step-by-step process for building and training an autoencoder using PyTorch on a credit card dataset. It includes steps for library installation, data loading and preprocessing, model definition, training, and evaluation of clustering results. Finally, it visualizes the clustering results using t-SNE.


{

"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2Cbo3KpwX0v0"
},
"outputs": [],
"source": [
"# Step 1: Install required libraries\n",
"!pip install torch torchvision datasets scikit-learn matplotlib seaborn
tqdm\n",
"!pip install datasets --upgrade"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "qY5lP9oiX0v1"
},
"outputs": [],
"source": [
"# Step 2: Import libraries\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"from torch.utils.data import Dataset, DataLoader\n",
"from datasets import load_dataset\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.manifold import TSNE\n",
"from sklearn.metrics import silhouette_score, davies_bouldin_score,
calinski_harabasz_score\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from tqdm import tqdm\n",
"import warnings\n",
"warnings.filterwarnings('ignore')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "1olmebAKX0v1"
},
"outputs": [],
"source": [
"# Step 3: Load and preprocess the Hugging Face imodels/credit-card
dataset\n",
"!rm -rf ~/.cache/huggingface/datasets # Clear cache to avoid loading
issues\n",
"\n",
"# Check for GPU\n",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
"if device.type == 'cpu':\n",
" print('WARNING: No GPU detected. Training will be slower. To enable
GPU, go to Runtime > Change runtime type > Select GPU.')\n",
"else:\n",
" print(f'Using device: {device}')\n",
"\n",
"# Load dataset\n",
"try:\n",
" print('Loading imodels/credit-card dataset...')\n",
" dataset = load_dataset('imodels/credit-card',
download_mode='force_redownload')['train']\n",
" print('Dataset loaded successfully')\n",
"except Exception as e:\n",
" print(f'Error loading dataset: {e}')\n",
" raise\n",
"\n",
"# Convert to pandas DataFrame\n",
"df = pd.DataFrame(dataset)\n",
"\n",
"# Inspect dataset\n",
"print('Dataset sample:')\n",
"print(df.head())\n",
"print('\\nUnique values per feature:')\n",
"for col in df.columns:\n",
" print(f'{col}: {df[col].unique()}')\n",
"\n",
"# Define feature and target columns\n",
"features = [col for col in df.columns if col !=
'default.payment.next.month']\n",
"target = 'default.payment.next.month'\n",
"\n",
"# Mapping for categorical features (for interpretation)\n",
"mapping_info = {\n",
" 'SEX': {1: 'male', 2: 'female'},\n",
" 'EDUCATION': {0: 'other', 1: 'graduate school', 2: 'university', 3:
'high school', 4: 'other', 5: 'other', 6: 'other'},\n",
" 'MARRIAGE': {0: 'other', 1: 'married', 2: 'single', 3: 'other'},\n",
" 'default.payment.next.month': {0: 'no default', 1: 'default'}\n",
"}\n",
"print('\\nMapping of integer values to categories:')\n",
"for col, mapping in mapping_info.items():\n",
" print(f'{col}: {mapping}')\n",
"\n",
"# Scale numerical features\n",
"numerical_features = ['LIMIT_BAL', 'AGE', 'BILL_AMT1', 'BILL_AMT2',
'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',\n",
" 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4',
'PAY_AMT5', 'PAY_AMT6']\n",
"scaler = StandardScaler()\n",
"df[numerical_features] = scaler.fit_transform(df[numerical_features])\n",
"\n",
"# Categorical features are already integer-encoded\n",
"X = df[features].astype(np.float32).values\n",
"y = df[target].astype(np.float32).values\n",
"\n",
"# Create PyTorch dataset\n",
"class CreditCardDataset(Dataset):\n",
" def __init__(self, features):\n",
" self.features = torch.tensor(features, dtype=torch.float32)\n",
" def __len__(self):\n",
" return len(self.features)\n",
" def __getitem__(self, idx):\n",
" return self.features[idx]\n",
"\n",
"credit_dataset = CreditCardDataset(X)\n",
"dataloader = DataLoader(credit_dataset, batch_size=64, shuffle=True)\n",
"print('Step 3 complete')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "lT3Y7Uo_X0v2"
},
"outputs": [],
"source": [
"# Step 4: Define the Autoencoder with modified architecture\n",
"class Autoencoder(nn.Module):\n",
" def __init__(self, input_dim=24, latent_dim=4):\n",
" super(Autoencoder, self).__init__()\n",
" # Encoder: deeper with dropout\n",
" self.encoder = nn.Sequential(\n",
" nn.Linear(input_dim, 16),\n",
" nn.ReLU(),\n",
" nn.Dropout(0.2),\n",
" nn.Linear(16, 8),\n",
" nn.ReLU(),\n",
" nn.Dropout(0.2),\n",
" nn.Linear(8, latent_dim),\n",
" nn.ReLU()\n",
" )\n",
" # Decoder: symmetric with dropout\n",
" self.decoder = nn.Sequential(\n",
" nn.Linear(latent_dim, 8),\n",
" nn.ReLU(),\n",
" nn.Dropout(0.2),\n",
" nn.Linear(8, 16),\n",
" nn.ReLU(),\n",
" nn.Dropout(0.2),\n",
" nn.Linear(16, input_dim)\n",
" )\n",
"\n",
" def forward(self, x):\n",
" encoded = self.encoder(x)\n",
" decoded = self.decoder(encoded)\n",
" return decoded, encoded\n",
"\n",
"# Initialize model\n",
"model = Autoencoder(input_dim=24, latent_dim=4).to(device)\n",
"criterion = nn.MSELoss()\n",
"optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
"print('Step 4 complete')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "U8vKKBBOX0v2"
},
"outputs": [],
"source": [
"# Step 5: Train the Autoencoder (30 epochs)\n",
"num_epochs = 30\n",
"losses = []\n",
"\n",
"model.train()\n",
"for epoch in range(num_epochs):\n",
" epoch_loss = 0\n",
" for batch in tqdm(dataloader, desc=f'Epoch {epoch+1}/{num_epochs}'):\
n",
" batch = batch.to(device)\n",
" optimizer.zero_grad()\n",
" output, _ = model(batch)\n",
" loss = criterion(output, batch)\n",
" loss.backward()\n",
" optimizer.step()\n",
" epoch_loss += loss.item()\n",
" avg_loss = epoch_loss / len(dataloader)\n",
" losses.append(avg_loss)\n",
" print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.6f}')\n",
"\n",
"# Plot training loss\n",
"plt.figure(figsize=(10, 5))\n",
"plt.plot(losses, color='blue', label='Training Loss')\n",
"plt.title('Autoencoder Training Loss')\n",
"plt.xlabel('Epoch')\n",
"plt.ylabel('MSE Loss')\n",
"plt.legend()\n",
"plt.grid(True)\n",
"plt.show()\n",
"print('Step 5 complete')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "9gD2qJ-kX0v3"
},
"outputs": [],
"source": [
"# Step 6: Extract latent representations\n",
"model.eval()\n",
"latent_features = []\n",
"with torch.no_grad():\n",
" for batch in dataloader:\n",
" batch = batch.to(device)\n",
" _, encoded = model(batch)\n",
" latent_features.append(encoded.cpu().numpy())\n",
"latent_features = np.concatenate(latent_features, axis=0)\n",
"print('Step 6 complete')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "jJ2j3KxWX0v3"
},
"outputs": [],
"source": [
"# Step 7: Apply K-Means clustering\n",
"kmeans = KMeans(n_clusters=2, random_state=42)\n",
"cluster_labels = kmeans.fit_predict(latent_features)\n",
"print('Step 7 complete')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "a3lKKBBOX0v4"
},
"outputs": [],
"source": [
"# Step 8: Evaluate clustering\n",
"silhouette = silhouette_score(latent_features, cluster_labels)\n",
"davies_bouldin = davies_bouldin_score(latent_features, cluster_labels)\n",
"calinski_harabasz = calinski_harabasz_score(latent_features,
cluster_labels)\n",
"\n",
"print('Clustering Evaluation Metrics:')\n",
"print(f'Silhouette Score: {silhouette:.4f}')\n",
"print(f'Davies-Bouldin Index: {davies_bouldin:.4f}')\n",
"print(f'Calinski-Harabasz Index: {calinski_harabasz:.4f}')\n",
"print('Step 8 complete')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "t3X5pGCRX0v4"
},
"outputs": [],
"source": [
"# Step 9: Visualize with t-SNE\n",
"tsne = TSNE(n_components=2, random_state=42)\n",
"tsne_features = tsne.fit_transform(latent_features)\n",
"\n",
"# Plot predicted clusters\n",
"plt.figure(figsize=(12, 5))\n",
"plt.subplot(1, 2, 1)\n",
"sns.scatterplot(x=tsne_features[:, 0], y=tsne_features[:, 1],
hue=cluster_labels, palette=['blue', 'red'], legend='full')\n",
"plt.title('t-SNE: Predicted Clusters')\n",
"plt.xlabel('t-SNE Component 1')\n",
"plt.ylabel('t-SNE Component 2')\n",
"\n",
"# Plot true labels\n",
"plt.subplot(1, 2, 2)\n",
"sns.scatterplot(x=tsne_features[:, 0], y=tsne_features[:, 1], hue=y,
palette=['green', 'purple'], legend='full')\n",
"plt.title('t-SNE: True Labels')\n",
"plt.xlabel('t-SNE Component 1')\n",
"plt.ylabel('t-SNE Component 2')\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n",
"print('Step 9 complete')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.11"
},
"colab": {
"provenance": []
}
},
"nbformat": 4,
"nbformat_minor": 0
}
