
APEEJAY STYA UNIVERSITY

Big Data Analysis Practical File

Submitted To: Dr. Sudhakar Ranjan
Submitted By: Himanshi (ASU2021010200225)
Bachelor of Computer Science Engineering, 6th Semester

INDEX

S.No.  Algorithm

1. Naïve Bayes
2. K-Means
3. K-Nearest Neighbor
4. Apriori Algorithm
5. DBSCAN
6. Decision Tree
7. Random Forest
8. Linear Regression
9. Support Vector Machine

1. Naïve Bayes
CODE
from sklearn.datasets import load_iris

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score

# Load Iris dataset

iris = load_iris()

X = iris.data

y = iris.target

# Split data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize Naive Bayes classifier

model = GaussianNB()

# Train the model

model.fit(X_train, y_train)

# Predict on the test data

y_pred = model.predict(X_test)

# Calculate accuracy

accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)

OUTPUT:

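As an optional check beyond the accuracy score (not part of the original practical), the same predictions can be summarised per class. A minimal sketch, assuming model, y_test, y_pred and iris from the code above are still in scope:

from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision, recall and F1 for the Gaussian Naive Bayes predictions
print(classification_report(y_test, y_pred, target_names=iris.target_names))

# Confusion matrix: rows are true classes, columns are predicted classes
print(confusion_matrix(y_test, y_pred))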
2. K-Means
CODE
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Generating a small random dataset
X_small = np.array([[1, 2], [1.5, 1.8], [5, 8], [8, 8], [1, 0.6], [9, 11]])

# Instantiate and fit KMeans
kmeans_small = KMeans(n_clusters=2)
kmeans_small.fit(X_small)

# Print cluster centroids
centroids = kmeans_small.cluster_centers_
print("Cluster centroids:")
for i, centroid in enumerate(centroids):
    print(f"Cluster {i+1}: {centroid}")

# Print samples belonging to each cluster
print("\nSamples in each cluster:")
labels = kmeans_small.predict(X_small)
for i in range(kmeans_small.n_clusters):
    cluster_samples = X_small[labels == i]
    print(f"Cluster {i+1}: {cluster_samples}")

# Plotting the clusters
plt.scatter(X_small[:, 0], X_small[:, 1], c=labels, cmap='viridis')
plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=200, color='red')
plt.title('KMeans Clustering (Small Dataset)')
plt.xlabel('X')
plt.ylabel('Y')
plt.show()

OUTPUT:

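The code above fixes n_clusters=2. A common sanity check (not in the original file) is to compare the within-cluster sum of squares (inertia_) for a few values of k; a minimal sketch, assuming KMeans and X_small from the code above:

# Elbow check: inertia for k = 1..4 on the same small dataset
for k in range(1, 5):
    km = KMeans(n_clusters=k, n_init=10).fit(X_small)
    print(f"k={k}: inertia={km.inertia_:.2f}")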
3. K-Nearest Neighbor
CODE
import matplotlib.pyplot as plt

def knn_cal(x, y, x1, y1):
    # Euclidean distance between (x, y) and (x1, y1)
    dis_cal = ((x1 - x) ** 2 + (y1 - y) ** 2) ** 0.5
    return dis_cal

a = []
k = int(input('Enter the no. of coordinates: '))
print(a)

for i in range(k):
    j = []
    for _ in range(2):
        # Prompts once for the x value and once for the y value of each point
        x = int(input('Enter the coordinate: '))
        j.append(x)
    a.append(j)

print(a)

r = knn_cal(a[0][0], a[0][1], a[1][0], a[1][1])
print('Distance between the two points:', r)

# Plotting the points
x_coords = [coord[0] for coord in a]
y_coords = [coord[1] for coord in a]
plt.scatter(x_coords, y_coords, color='blue')
for i in range(k):
    plt.annotate(f'({a[i][0]}, {a[i][1]})', (a[i][0], a[i][1]))

plt.xlabel('x')
plt.ylabel('y')
plt.title('Coordinates')
plt.show()

OUTPUT:

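The program above only computes the Euclidean distance that underlies the K-Nearest Neighbor method. For an actual classification example (not part of the original file), scikit-learn's KNeighborsClassifier can be used on the same Iris data as the other programs; a minimal sketch:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Same Iris setup used elsewhere in this file
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.3, random_state=42)

# k = 3 nearest neighbours with the default Euclidean distance
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
print("Accuracy:", accuracy_score(y_test, knn.predict(X_test)))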
4. Apriori Algorithm
CODE
from itertools import combinations
from collections import defaultdict

def apriori(dataset, min_support):
    # Count occurrences of each item
    item_counts = defaultdict(int)
    for transaction in dataset:
        for item in transaction:
            item_counts[item] += 1

    # Filter single items based on min_support
    frequent_items = {item for item, count in item_counts.items() if count >= min_support}

    frequent_itemsets = []
    k = 2
    while frequent_items:
        # Generate candidate itemsets of size k from the surviving items
        candidate_itemsets = {frozenset(items) for items in combinations(frequent_items, k)}

        # Count support for each candidate itemset
        candidate_supports = defaultdict(int)
        for transaction in dataset:
            for candidate_itemset in candidate_itemsets:
                if candidate_itemset.issubset(transaction):
                    candidate_supports[candidate_itemset] += 1

        # Filter candidates based on min_support
        frequent_k_itemsets = {itemset for itemset, support in candidate_supports.items() if support >= min_support}

        # Add frequent itemsets of this size to the result
        frequent_itemsets.extend(frequent_k_itemsets)

        # Only items that still occur in some frequent itemset seed the next pass
        frequent_items = set().union(*frequent_k_itemsets) if frequent_k_itemsets else set()
        k += 1

    return frequent_itemsets

# Example dataset (each transaction is a set of items)
dataset = [
    {'bread', 'milk'},
    {'bread', 'diaper', 'beer', 'egg'},
    {'milk', 'diaper', 'beer', 'cola'},
    {'bread', 'milk', 'diaper', 'beer'},
    {'bread', 'milk', 'diaper', 'cola'}
]

# Minimum support threshold (absolute count)
min_support = 3

# Finding frequent itemsets using Apriori
frequent_itemsets = apriori(dataset, min_support)

print("Frequent itemsets:")
for i, itemset in enumerate(frequent_itemsets):
    print(f"Itemset {i+1}: {itemset}")

OUTPUT:

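A natural follow-up (not in the original file) is to turn the frequent itemsets into association rules by computing confidence = support(X ∪ Y) / support(X). A minimal sketch, assuming dataset and frequent_itemsets from the code above:

def support(itemset, dataset):
    # Number of transactions that contain the itemset (absolute support)
    return sum(1 for transaction in dataset if itemset.issubset(transaction))

min_confidence = 0.7
for itemset in frequent_itemsets:
    for item in itemset:
        antecedent = itemset - {item}
        if antecedent:
            conf = support(itemset, dataset) / support(antecedent, dataset)
            if conf >= min_confidence:
                print(f"{set(antecedent)} -> {item} (confidence {conf:.2f})")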
5. DBSCAN
CODE
from sklearn.datasets import make_moons

from sklearn.cluster import DBSCAN

import matplotlib.pyplot as plt

# Generate example data

X, _ = make_moons(n_samples=200, noise=0.1, random_state=42)

# Initialize DBSCAN

dbscan = DBSCAN(eps=0.2, min_samples=5)

# Fit the model

dbscan.fit(X)

# Get cluster labels

labels = dbscan.labels_

# Number of clusters in labels, ignoring noise if present.

n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)

print('Estimated number of noise points: %d' % n_noise_)

# Plotting the clusters

plt.figure(figsize=(8, 6))

plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis')

plt.title('DBSCAN Clustering')

plt.xlabel('Feature 1')

plt.ylabel('Feature 2')

plt.show()

OUTPUT:

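The number of clusters and noise points found by DBSCAN depends heavily on eps and min_samples. A quick sensitivity check (not part of the original file), assuming DBSCAN and X from the code above:

# Compare cluster/noise counts for a few eps values
for eps in (0.1, 0.2, 0.3):
    labels_eps = DBSCAN(eps=eps, min_samples=5).fit_predict(X)
    n_clusters = len(set(labels_eps)) - (1 if -1 in labels_eps else 0)
    print(f"eps={eps}: {n_clusters} clusters, {list(labels_eps).count(-1)} noise points")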
6. Decision Tree
CODE
from sklearn.tree import DecisionTreeClassifier, plot_tree

import matplotlib.pyplot as plt

import numpy as np

# Define the dataset
# Each data point represents [Outlook, Temperature, Humidity, Wind]
# Outlook: 0 - Sunny, 1 - Overcast, 2 - Rainy
# Temperature: 0 - Cool, 1 - Mild, 2 - Hot
# Humidity: 0 - Normal, 1 - High
# Wind: 0 - Weak, 1 - Strong

X = np.array([

[0, 0, 0, 0],

[0, 0, 0, 1],

[1, 0, 0, 0],

[2, 1, 0, 0],

[2, 2, 1, 0],

[2, 2, 1, 1],

[1, 2, 1, 1],

[0, 1, 0, 0],

[0, 2, 1, 0],

[2, 1, 1, 0],

[0, 1, 1, 1],

[1, 1, 0, 1],

[1, 0, 1, 0],

[2, 1, 0, 1]

])

# Labels: 0 - No, 1 - Yes (Play tennis)

y = np.array([0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0])

# Create a Decision Tree classifier

clf = DecisionTreeClassifier(random_state=42)

# Train the classifier
clf.fit(X, y)

# Plot the Decision Tree

plt.figure(figsize=(12, 8))

plot_tree(clf, filled=True, feature_names=['Outlook', 'Temperature', 'Humidity', 'Wind'], class_names=['No', 'Yes'])

plt.show()

# Generate random weather conditions for prediction

new_data = np.random.randint(3, size=(1, 4)) # Random integers between 0 and 2 for each feature

# Predict whether a player will play tennis or not

prediction = clf.predict(new_data)

# Output the prediction

if prediction[0] == 1:

print("Player will play tennis.")

else:

print("Player will not play tennis.")

OUTPUT:

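Besides the plotted tree, the learned rules can also be printed as plain text, which is easier to paste into a report. An optional sketch (not in the original file), assuming clf from the code above:

from sklearn.tree import export_text

# Text version of the learned decision rules
print(export_text(clf, feature_names=['Outlook', 'Temperature', 'Humidity', 'Wind']))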
7. Random Forest
CODE
from sklearn.datasets import load_iris

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report

# Load Iris dataset

iris = load_iris()

X = iris.data

y = iris.target

# Split data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize Random Forest classifier

clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model

clf.fit(X_train, y_train)

# Predict on the test data

y_pred = clf.predict(X_test)

# Calculate accuracy

accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)

# Print classification report

print("Classification Report:")

print(classification_report(y_test, y_pred))

OUTPUT:

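A common follow-up (not in the original file) is to check which features the forest relies on most, assuming clf and iris from the code above:

# Feature importances learned by the Random Forest
for name, importance in zip(iris.feature_names, clf.feature_importances_):
    print(f"{name}: {importance:.3f}")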
8. Linear Regression
CODE
import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Create a sample dataset

data = pd.DataFrame({

'bedrooms': np.random.randint(1, 6, 100), # Random number of bedrooms (1-5)

'bathrooms': np.random.randint(1, 4, 100), # Random number of bathrooms (1-3)

'size_sqft': np.random.randint(800, 2500, 100), # Random size of the house in square feet (800-2500)

'price': np.random.randint(200000, 1000000, 100) # Random price of the house ($200k-$1M)

})

# Prepare the data

X = data[['bedrooms', 'bathrooms', 'size_sqft']] # Features

y = data['price'] # Target variable

# Split data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Linear Regression model

model = LinearRegression()

# Train the model

model.fit(X_train, y_train)

# Make predictions on the test data

y_pred = model.predict(X_test)

# Calculate Mean Squared Error (MSE)

mse = mean_squared_error(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)

# Plotting the results

plt.figure(figsize=(10, 6))

plt.scatter(y_test, y_pred)

plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)

plt.title('House Price Prediction')

plt.xlabel('Actual Price')

plt.ylabel('Predicted Price')

plt.grid(True)

plt.show()

OUTPUT:

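Because the prices in this dataset are random, the MSE by itself is hard to interpret. Printing the fitted coefficients and the R² score makes this explicit (an optional check, not in the original file), assuming model, X, y_test and y_pred from the code above:

from sklearn.metrics import r2_score

# Fitted parameters of the linear model
print("Coefficients:", dict(zip(X.columns, model.coef_)))
print("Intercept:", model.intercept_)

# R^2 on the held-out data (likely close to zero for random prices)
print("R^2 score:", r2_score(y_test, y_pred))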
9. Support Vector Machine
CODE
from sklearn.datasets import load_iris

from sklearn.model_selection import train_test_split

from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, classification_report

import numpy as np

import matplotlib.pyplot as plt

# Load the Iris dataset

iris = load_iris()

X = iris.data[:, :2] # Use only the first two features for visualization

y = iris.target

# Split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the Support Vector Classifier (SVC)

clf = SVC(kernel='linear')

# Train the model

clf.fit(X_train, y_train)

# Make predictions

y_pred = clf.predict(X_test)

# Calculate accuracy

accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)

# Print classification report

print("Classification Report:")

print(classification_report(y_test, y_pred))

# Plot the decision boundary

plt.figure(figsize=(8, 6))

plt.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis', edgecolors='k', s=80)

plt.xlabel('Sepal Length (cm)')

plt.ylabel('Sepal Width (cm)')

plt.title('Iris Dataset - Sepal Features')

plt.grid(True)

# Create a mesh to plot in

x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1

y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.01))

Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot

Z = Z.reshape(xx.shape)

plt.contourf(xx, yy, Z, alpha=0.3, cmap='viridis')

plt.show()

OUTPUT:

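The linear kernel used above is only one choice. A quick comparison of kernels with cross-validation (not part of the original file), assuming SVC, X and y from the code above:

from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy for a few kernels on the two-feature data
for kernel in ('linear', 'rbf', 'poly'):
    scores = cross_val_score(SVC(kernel=kernel), X, y, cv=5)
    print(f"{kernel}: mean accuracy {scores.mean():.3f}")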
