Machine Learning Lab Assignment 1

The document outlines various machine learning assignments, including Linear Regression, Logistic Regression, Random Forest Classification, Decision Tree, Clustering, and Support Vector Machine (SVM). Each section provides code snippets for data preprocessing, model training, and evaluation using different datasets. Key metrics such as accuracy, mean absolute error, and silhouette score are used to assess model performance.


ASSIGNMENT

1. Linear Regression
Code:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load dataset
data = pd.read_csv('dataset.csv')

# Display basic info
print(data.head())
print(data.info())

# Handle missing values (example: drop rows with missing values)
data = data.dropna()
# Remove rows where 'Production' holds the '=' placeholder instead of a number
data = data[data['Production'] != '=']

# Verify the rows are removed
print(data[data['Production'] == '='])

# Encode categorical features
categorical_cols = ['State_Name', 'District_Name', 'Crop', 'Season']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Define features and target variable
X = data[['Area', 'Season', 'Crop', 'Crop_Year']]  # Example features
y = pd.to_numeric(data['Production'])  # Column was read as text because of the '=' placeholder rows
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
Dataset:

Input:
Output:
2. Logistic Regression
Code:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix
# Read the dataset using pandas (replace 'studyhours.csv' with your actual file path)
data = pd.read_csv('studyhours.csv')
print(data)
# The target column is 'status'; all other columns are features
X = data.drop(columns=['status'])  # Drop the target column to get features
y = data['status']  # Target variable
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=20)
# Initialize the Logistic Regression model
model = LogisticRegression()
# Train the model
model.fit(X_train, y_train)
# Make predictions on the test data
y_pred = model.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
# Print results
print("Accuracy:", accuracy)
print("Confusion Matrix:")
print(conf_matrix)
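For a binary target like this one, the 2x2 confusion matrix can be unpacked into its four counts. A minimal sketch, assuming 'status' has exactly two classes:
# Rows are actual classes, columns are predicted classes
tn, fp, fn, tp = conf_matrix.ravel()
print("True negatives:", tn, "False positives:", fp)
print("False negatives:", fn, "True positives:", tp)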
Dataset:
Input:

Output:
3. Random Forest Classification
Code:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
# Load the Titanic dataset
file_path = 'titanic.csv' # Replace with your Titanic dataset file path
data = pd.read_csv(file_path)
# Display the first few rows of the dataset
print("Dataset Preview:")
print(data.head())
# Drop columns not relevant for the model
data = data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, errors='ignore')
# Fill missing values (chained inplace fillna is deprecated in recent pandas)
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

# Encode categorical features
categorical_cols = ['Sex', 'Embarked']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Define features and target variable
X = data.drop(['Survived'], axis=1)
y = data['Survived']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Display results
print("\nModel Evaluation:")
print(f"Accuracy: {accuracy:.2f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)
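Random forests also expose how much each feature contributed to the splits, which helps explain the predictions. A small addition using the fitted model above:
# Rank features by impurity-based importance
importances = pd.Series(model.feature_importances_, index=X.columns)
print("\nFeature Importances:")
print(importances.sort_values(ascending=False))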
Dataset:
Input:
Output:
4. Decision Tree (ID3)
Code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the weather dataset
filename = "weather.csv"  # Update this path to your CSV file
df = pd.read_csv(filename)
print(df)

# Remove the 'Day' feature if present
df = df.drop(columns=['Day'], errors='ignore')

# Display the first few rows of the dataset
print(df.head())

# Encode categorical features using LabelEncoder
label_encoders = {}
for column in df.columns:
    if df[column].dtype == 'object':  # Apply encoding only to categorical columns
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le
print("----------------------------After fit and transform------------------------------------------")
print(df)
# Define features and target
X = df.iloc[:, :-1] # All columns except the last as features
y = df.iloc[:, -1] # Last column as target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the decision tree classifier using the entropy criterion
model = DecisionTreeClassifier(criterion='entropy', random_state=42)
model.fit(X_train, y_train)

# Visualize the decision tree
plt.figure(figsize=(10, 6))
plot_tree(model, feature_names=X.columns,
          class_names=label_encoders[df.columns[-1]].classes_,
          filled=True, rounded=True, fontsize=10)
plt.title("Simple ID3 Decision Tree for Weather Dataset")
plt.show()
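The tree above is only visualized, not scored. Its accuracy on the held-out split can be checked with one extra line; a minimal sketch using the fitted model:
# Accuracy of the tree on the unseen test split
test_accuracy = model.score(X_test, y_test)
print(f"Test accuracy: {test_accuracy:.2f}")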
Dataset:
Input:
Output:
5. Clustering
Code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score
# Load dataset from CSV file
df = pd.read_csv('student_marks.csv') # Ensure the file exists
# Selecting relevant features
marks = df[['Subject1', 'Subject2']].values
# Standardizing the data
scaler = StandardScaler()
marks_scaled = scaler.fit_transform(marks)
# Applying K-Means Clustering
k = 2 # Number of clusters
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
df['Cluster'] = kmeans.fit_predict(marks_scaled)
# Get centroids
centroids = kmeans.cluster_centers_
# Assign cluster names based on performance
# Note: K-Means numbers clusters arbitrarily; inspect the centroids before naming them
cluster_names = {0: 'High Performers', 1: 'Low Performers'}  # Modify as needed
df['Cluster Name'] = df['Cluster'].map(cluster_names)
# Save clustered data to CSV
df.to_csv('student_marks_clustered.csv', index=False)
# Performance Metrics
inertia = kmeans.inertia_ # SSE
silhouette_avg = silhouette_score(marks_scaled, df['Cluster'])
db_index = davies_bouldin_score(marks_scaled, df['Cluster'])
print(f"Inertia (SSE): {inertia:.2f}")
print(f"Silhouette Score: {silhouette_avg:.2f}")
print(f"Davies-Bouldin Index: {db_index:.2f}")
# Display cluster-wise information
print("\nCluster Information:")
print(df.groupby('Cluster Name')[['Subject1', 'Subject2']].mean())
# Plot the clusters
plt.figure(figsize=(8, 6))
plt.scatter(marks_scaled[:, 0], marks_scaled[:, 1], c=df['Cluster'], cmap='viridis',
            marker='o', edgecolors='k', label='Students')
plt.scatter(centroids[:, 0], centroids[:, 1], s=200, c='red', marker='X',
            label='Centroids')
plt.xlabel('Subject 1 (Scaled)')
plt.ylabel('Subject 2 (Scaled)')
plt.title('K-Means Clustering of Student Marks')
plt.legend()
plt.show()
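Here k is fixed at 2. A common way to sanity-check that choice is the elbow method, which plots inertia (SSE) for a range of cluster counts; a minimal sketch reusing marks_scaled from above:
# Elbow method: fit K-Means for several values of k and record the inertia
inertias = []
k_values = range(1, 7)
for k_try in k_values:
    km = KMeans(n_clusters=k_try, random_state=42, n_init=10)
    km.fit(marks_scaled)
    inertias.append(km.inertia_)
plt.plot(k_values, inertias, marker='o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia (SSE)')
plt.title('Elbow Method for Choosing k')
plt.show()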
Dataset:
Input:
Output:
6. Support Vector Machine (SVM)
Code:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Read the dataset from CSV
df = pd.read_csv('Crop_recommendation.csv')
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
print(X)
print(y)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train and evaluate Support Vector Machine
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)

# Train and evaluate Logistic Regression
logreg_model = LogisticRegression(random_state=42, max_iter=1000)  # Higher iteration cap helps the solver converge on multiclass data
logreg_model.fit(X_train, y_train)
y_pred_logreg = logreg_model.predict(X_test)
logreg_accuracy = accuracy_score(y_test, y_pred_logreg)

# Print the accuracy scores
print(f'SVM Accuracy: {svm_accuracy:.4f}')
print(f'Logistic Regression Accuracy: {logreg_accuracy:.4f}')
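The SVM above uses a linear kernel. A nonlinear kernel can capture curved decision boundaries; a minimal sketch training an RBF-kernel SVM on the same standardized split for comparison:
# RBF kernel allows nonlinear decision boundaries between crop classes
rbf_model = SVC(kernel='rbf', random_state=42)
rbf_model.fit(X_train, y_train)
y_pred_rbf = rbf_model.predict(X_test)
print(f'RBF SVM Accuracy: {accuracy_score(y_test, y_pred_rbf):.4f}')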
Dataset:
Input:

Output:
