ML Codes
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, Normalizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix, classification_report

# NOTE: `data` is assumed throughout to be a pre-loaded pandas DataFrame with
# placeholder column names ('feature1', 'target', ...), e.g.
# data = pd.read_csv('your_dataset.csv')
# 1. Linear Regression
print("\n1. Linear Regression")
# Define variables
X = data[['feature1', 'feature2']]
y = data['target']
# Preprocessing
# Handle missing values
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
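# Split and fit -- this step is missing from the original notes (the plot and
# metrics below reference y_test, lr_predictions, and lr_model); a minimal
# sketch assuming an 80/20 split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)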
# Visualizations
plt.figure(figsize=(10, 6))
plt.scatter(y_test, lr_predictions)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Linear Regression: Actual vs Predicted')
plt.savefig('linear_regression_results.png')
plt.close()
# Print results
mse = mean_squared_error(y_test, lr_predictions)
print(f"Linear Regression Mean Squared Error: {mse}")
print(f"Coefficients: {lr_model.coef_}")
print(f"Intercept: {lr_model.intercept_}")
# 2. Logistic Regression
print("\n2. Logistic Regression")
# Define variables
X = data[['feature1', 'feature2', 'categorical_feature']]
y = data['binary_target']
# Preprocessing
# Handle missing values
imputer = SimpleImputer(strategy='most_frequent')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
# Encode the categorical feature before scaling (StandardScaler cannot handle
# string categories; LabelEncoder is assumed suitable here)
X['categorical_feature'] = LabelEncoder().fit_transform(X['categorical_feature'])
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
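# Split and fit -- missing from the original notes (the confusion matrix below
# references y_test and log_reg_predictions); minimal sketch
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
log_reg_predictions = log_reg.predict(X_test)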
# Visualizations
cm = confusion_matrix(y_test, log_reg_predictions)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d')
plt.title('Logistic Regression: Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig('logistic_regression_confusion_matrix.png')
plt.close()
# Print results
accuracy = accuracy_score(y_test, log_reg_predictions)
print(f"Logistic Regression Accuracy: {accuracy}")
print("\nClassification Report:")
print(classification_report(y_test, log_reg_predictions))
# 3. Naive Bayes
print("\n3. Naive Bayes")
# Define variables
X = data[['feature1', 'feature2']]
y = data['categorical_target']
# Preprocessing
# Handle missing values
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
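# Split and fit -- missing from the original notes (the results below reference
# y_test and nb_predictions); minimal sketch using Gaussian NB on the numeric features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
nb_predictions = nb_model.predict(X_test)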
# Visualizations
cm = confusion_matrix(y_test, nb_predictions)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d')
plt.title('Naive Bayes: Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig('naive_bayes_confusion_matrix.png')
plt.close()
# Print results
accuracy = accuracy_score(y_test, nb_predictions)
print(f"Naive Bayes Accuracy: {accuracy}")
print("\nClassification Report:")
print(classification_report(y_test, nb_predictions))
# 4. Decision Tree
print("\n4. Decision Tree")
# Classification
print("\nDecision Tree - Classification")
# Define variables
X = data[['feature1', 'feature2', 'categorical_feature']]
y = data['categorical_target']
# Preprocessing
# Handle missing values
imputer = SimpleImputer(strategy='most_frequent')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
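# Encode, split, and fit -- missing from the original notes (the tree plot
# below references dt_classifier); minimal sketch
X['categorical_feature'] = LabelEncoder().fit_transform(X['categorical_feature'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)
dt_class_predictions = dt_classifier.predict(X_test)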
# Visualizations
plt.figure(figsize=(20, 10))
plot_tree(dt_classifier, feature_names=list(X.columns),
          class_names=[str(c) for c in dt_classifier.classes_],
          filled=True, rounded=True)
plt.title('Decision Tree Classifier')
plt.savefig('decision_tree_classifier.png')
plt.close()
# Print results
accuracy = accuracy_score(y_test, dt_class_predictions)
print(f"Decision Tree (Classification) Accuracy: {accuracy}")
print("\nClassification Report:")
print(classification_report(y_test, dt_class_predictions))
# Regression
print("\nDecision Tree - Regression")
# Define variables
X_reg = data[['feature1', 'feature2']]
y_reg = data['continuous_target']
# Preprocessing
# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_reg = pd.DataFrame(imputer.fit_transform(X_reg), columns=X_reg.columns)
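# Split and fit -- missing from the original notes (the plot below references
# y_test_reg and dt_reg_predictions); minimal sketch
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)
dt_regressor = DecisionTreeRegressor(random_state=42)
dt_regressor.fit(X_train_reg, y_train_reg)
dt_reg_predictions = dt_regressor.predict(X_test_reg)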
# Visualizations
plt.figure(figsize=(10, 6))
plt.scatter(y_test_reg, dt_reg_predictions)
plt.plot([y_test_reg.min(), y_test_reg.max()], [y_test_reg.min(), y_test_reg.max()], 'r--', lw=2)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Decision Tree Regression: Actual vs Predicted')
plt.savefig('decision_tree_regression_results.png')
plt.close()
# Print results
mse = mean_squared_error(y_test_reg, dt_reg_predictions)
print(f"Decision Tree (Regression) Mean Squared Error: {mse}")
# 5. K-means Clustering
print("\n5. K-means Clustering")
# Define variables
X = data[['feature1', 'feature2']]
# Preprocessing
# Handle missing values
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
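# Elbow method -- compute inertia for k = 1..10 (this loop is missing here;
# the plot below references `inertias`)
inertias = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    inertias.append(kmeans.inertia_)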
# Visualizations
plt.figure(figsize=(10, 6))
plt.plot(range(1, 11), inertias, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.savefig('kmeans_elbow_method.png')
plt.close()
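# Fit the final model -- missing from the original notes; optimal_k is a
# placeholder to be read off the elbow plot above
optimal_k = 3  # assumed value; adjust after inspecting the elbow plot
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
kmeans_labels = kmeans.fit_predict(X_scaled)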
# Visualize clusters
plt.figure(figsize=(10, 6))
scatter = plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=kmeans_labels, cmap='viridis')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            color='red', marker='x', s=200, linewidths=3)
plt.title('K-means Clustering')
plt.colorbar(scatter)
plt.savefig('kmeans_clusters.png')
plt.close()
# Print results
print(f"K-means clustering completed with {optimal_k} clusters")
print(f"Cluster centers:\n{kmeans.cluster_centers_}")
# 6. Hierarchical Clustering
print("\n6. Hierarchical Clustering")
# Define variables
X = data[['feature1', 'feature2']]
# Preprocessing
# Handle missing values
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Visualizations
# Dendrogram
linked = linkage(X_scaled, method='ward')
plt.figure(figsize=(10, 7))
dendrogram(linked)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Sample Index')
plt.ylabel('Distance')
plt.savefig('hierarchical_dendrogram.png')
plt.close()
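# Fit the clustering model -- missing from this section (a version on the
# unscaled data appears later in the notes); 3 clusters to match the printout below
hierarchical = AgglomerativeClustering(n_clusters=3)
hierarchical_labels = hierarchical.fit_predict(X_scaled)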
# Print results
print(f"Hierarchical clustering completed with 3 clusters")
print(f"Cluster labels: {np.unique(hierarchical_labels)}")
# ============================================================
# Condensed reference snippets (preprocessing, plotting, and model fitting)
# ============================================================
# 1. Pre-processing
# Missing value imputation (the mean strategy assumes all-numeric columns)
imputer = SimpleImputer(strategy='mean')
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)
# Standardization
scaler = StandardScaler()
data_standardized = pd.DataFrame(scaler.fit_transform(data_imputed), columns=data_imputed.columns)
# Normalization
normalizer = Normalizer()
data_normalized = pd.DataFrame(normalizer.fit_transform(data_imputed),
                               columns=data_imputed.columns)
# 2. Visualization
# Bar chart
plt.figure(figsize=(10, 6))
data['categorical_column'].value_counts().plot(kind='bar')
plt.title('Bar Chart')
plt.show()
# Scatter plot
plt.figure(figsize=(10, 6))
plt.scatter(data['x_column'], data['y_column'])
plt.title('Scatter Plot')
plt.xlabel('X')
plt.ylabel('Y')
plt.show()
# Heat map
plt.figure(figsize=(12, 10))
sns.heatmap(data.corr(numeric_only=True), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()
# Line graph
plt.figure(figsize=(10, 6))
plt.plot(data['x_column'], data['y_column'])
plt.title('Line Graph')
plt.xlabel('X')
plt.ylabel('Y')
plt.show()
# 3. Linear Regression
X = data[['feature1', 'feature2']]
y = data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)
# 4. Logistic Regression
# (assumes X_train/y_train come from a split on a binary target, not the
# continuous target used for linear regression above)
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
log_reg_predictions = log_reg.predict(X_test)
# 5. Naive Bayes
# (likewise assumes a categorical target)
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
nb_predictions = nb_model.predict(X_test)
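# 6. Decision Tree
# For classification -- this half is missing from the notes; a sketch mirroring
# the regression snippet below (assumes y_train holds class labels)
dt_classifier = DecisionTreeClassifier()  # CART
dt_classifier.fit(X_train, y_train)
dt_class_predictions = dt_classifier.predict(X_test)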
# For regression
dt_regressor = DecisionTreeRegressor() # CART
dt_regressor.fit(X_train, y_train)
dt_reg_predictions = dt_regressor.predict(X_test)
# 7. K-means Clustering
# Elbow method
inertias = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X)
    inertias.append(kmeans.inertia_)
plt.figure(figsize=(10, 6))
plt.plot(range(1, 11), inertias, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()
# 8. Hierarchical Clustering
hierarchical = AgglomerativeClustering(n_clusters=3)
hierarchical_labels = hierarchical.fit_predict(X)