DA Lab Manual r22
Week 1.a
1 Data Preprocessing
a. Handling missing values
import pandas as pd
import numpy as np

# Load the student dataset
dataset_path = "Poojithafile.csv"
df = pd.read_csv(dataset_path)
print(df.head())
Output:
SNO HTNO Student Name Age Address Attendence Marks
0 1 22C11A0427 Likitha 18.0 kodad 75.0 90.0
1 2 22C11A0428 Nandhini 19.0 khammam 80.0 84.0
2 3 22C11A0429 Latha 18.0 tirupathi 75.0 76.0
3 4 22C11A0430 Poojitha 21.0 suryapet 97.0 91.0
4 5 22C11A0431 Madhuri NaN kodad 94.0 75.0
print(df.info())
Output:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 SNO 12 non-null int64
1 HTNO 12 non-null object
2 Student Name 12 non-null object
3 Age 11 non-null float64
4 Address 10 non-null object
5 Attendence 10 non-null float64
6 Marks 11 non-null float64
dtypes: float64(3), int64(1), object(3)
memory usage: 804.0+ bytes
None
print(df.describe())
Output:
SNO Age Attendence Marks
count 12.000000 11.00000 10.000000 11.000000
mean 6.500000 20.00000 81.500000 77.090909
std 3.605551 2.04939 12.030055 21.658507
min 1.000000 18.00000 60.000000 21.000000
25% 3.750000 18.50000 75.000000 75.500000
50% 6.500000 19.00000 78.000000 87.000000
75% 9.250000 21.00000 91.750000 90.000000
max 12.000000 24.00000 98.000000 91.000000
print(df.shape)
Output:
(12, 7)
print(df.isnull().sum())
Output:
SNO 0
HTNO 0
Student Name 0
Age 1
Address 2
Attendence 2
Marks 1
dtype: int64
print(df.nunique())
Output:
SNO 12
HTNO 12
Student Name 12
Age 6
Address 7
Attendence 8
Marks 8
dtype: int64
print(df['Student Name'])
Output:
0 Likitha
1 Nandhini
2 Latha
3 Poojitha
4 Madhuri
5 Manjula
6 Sushmitha
7 Rishi
8 Dimple
9 Anmol
10 Namratha
11 Pavan kalyan
Name: Student Name, dtype: object
print(df.groupby('Age')['Attendence'].mean())
Output:
Age
18.0 78.333333
19.0 77.500000
20.0 98.000000
21.0 97.000000
23.0 76.000000
24.0 60.000000
Name: Attendence, dtype: float64
print(df.isnull().sum())
Output:
SNO 0
HTNO 0
Student Name 0
Age 1
Address 2
Attendence 2
Marks 1
dtype: int64
age_mean=df.Age.mean()
print("Mean of age column:",age_mean)
Output:
Mean of age column: 20.0
df['Age'] = df['Age'].fillna(age_mean)
print(df.head())
Output:
SNO HTNO Student Name Age Address Attendence Marks
0 1 22C11A0427 Likitha 18.0 kodad 75.0 90.0
1 2 22C11A0428 Nandhini 19.0 khammam 80.0 84.0
2 3 22C11A0429 Latha 18.0 tirupathi 75.0 76.0
3 4 22C11A0430 Poojitha 21.0 suryapet 97.0 91.0
4 5 22C11A0431 Madhuri 20.0 kodad 94.0 75.0
print(df.isnull().sum())
Output:
SNO 0
HTNO 0
Student Name 0
Age 0
Address 2
Attendence 2
Marks 1
dtype: int64
print(df.to_string())
Output:
SNO HTNO Student Name Age Address Attendence Marks
0 1 22C11A0427 Likitha 18.0 kodad 75.0 90.0
1 2 22C11A0428 Nandhini 19.0 khammam 80.0 84.0
2 3 22C11A0429 Latha 18.0 tirupathi 75.0 76.0
3 4 22C11A0430 Poojitha 21.0 suryapet 97.0 91.0
4 5 22C11A0431 Madhuri 20.0 kodad 94.0 75.0
5 6 22C11A0432 Manjula 24.0 mulugu 60.0 NaN
6 7 22C11A0433 Sushmitha 18.0 NaN 85.0 90.0
7 8 22C11A0434 Rishi 19.0 karimnagar NaN 90.0
8 9 22C11A0435 Dimple 19.0 karimnagar 75.0 87.0
9 10 22C11A0436 Anmol 23.0 NaN 76.0 54.0
10 11 22C11A0437 Namratha 21.0 karimnagar NaN 21.0
11 12 22C11A0438 Pavan kalyan 20.0 pitapuram 98.0 90.0
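Age is now complete, but Address, Attendence, and Marks still contain NaNs. One way to fill them is sketched below; the choice of statistic per column (mode, mean, median) is an assumption for illustration, not part of the original run.
# Fill the remaining missing values (statistics chosen for illustration)
df['Address'] = df['Address'].fillna(df['Address'].mode()[0])        # most frequent address
df['Attendence'] = df['Attendence'].fillna(df['Attendence'].mean())  # mean attendance
df['Marks'] = df['Marks'].fillna(df['Marks'].median())               # median marks
print(df.isnull().sum())  # every column should now show 0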
Week 1.b
b. Noise detection and removal
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Generate 30 marks centred around 70 (seed 0), then inject two noisy values
np.random.seed(0)
marks = 70 + 10 * np.random.randn(30)
marks[5] = 120   # noise: impossibly high mark
marks[15] = -10  # noise: impossibly low mark
# Create a DataFrame
df = pd.DataFrame(marks, columns=['Marks'])
print("Original Marks Dataset with Noise:\n", df)
# Outlier bounds via the IQR rule (assumed; the bound computation was not shown)
Q1, Q3 = df['Marks'].quantile(0.25), df['Marks'].quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
# Remove outliers
df_cleaned = df[(df['Marks'] >= lower_bound) & (df['Marks'] <= upper_bound)]
print("\nCleaned Marks Dataset:\n", df_cleaned)
Output:
Original Marks Dataset with Noise:
Marks
0 87.640523
1 74.001572
2 79.787380
3 92.408932
4 88.675580
5 120.000000
6 79.500884
7 68.486428
8 68.967811
9 74.105985
10 71.440436
11 84.542735
12 77.610377
13 71.216750
14 74.438632
15 -10.000000
16 84.940791
17 67.948417
18 73.130677
19 61.459043
20 44.470102
21 76.536186
22 78.644362
23 62.578350
24 92.697546
25 55.456343
26 70.457585
27 68.128161
28 85.327792
29 84.693588
Week 2
2 Implement data processing to identify and eliminate data redundancy
import pandas as pd

# Step 1: Create a simple student dataset with some redundancy (duplicates)
data = {
    'StudentID': [101, 102, 103, 104, 105, 102, 106, 107, 105],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Bob', 'Frank', 'Grace', 'Eva'],
    'Age': [20, 21, 22, 23, 24, 21, 25, 26, 24],
    'Grade': ['A', 'B', 'C', 'B', 'A', 'B', 'A', 'A', 'A']
}

# Create a DataFrame
df = pd.DataFrame(data)
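Before removing anything, the redundancy can be identified explicitly; a short sketch follows (the step numbering here is assumed to fill the gap before Step 4).
# Step 2: Identify fully duplicated rows
print("Duplicate rows:\n", df[df.duplicated()])

# Step 3: Count occurrences of each StudentID to spot redundant records
counts = df['StudentID'].value_counts()
print("\nRedundant StudentIDs:\n", counts[counts > 1])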
# Step 4: Remove rows that duplicate a particular column (StudentID)
df_no_duplicates = df.drop_duplicates(subset='StudentID')
print(df_no_duplicates)
Output:
   StudentID     Name  Age Grade
0        101    Alice   20     A
1        102      Bob   21     B
2        103  Charlie   22     C
3        104    David   23     B
4        105      Eva   24     A
6        106    Frank   25     A
7        107    Grace   26     A
Week 3
3 Implement an imputation model
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Example data with missing values (placeholder; the original dataset is not shown)
data = {'Age': [18, 19, np.nan, 21, 20], 'Marks': [90, np.nan, 76, 91, 75]}

# Create a DataFrame
df = pd.DataFrame(data)
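The imputation itself is sketched below with SimpleImputer's mean strategy; the strategy and column names are assumptions for illustration.
# Impute missing values with the column mean
imputer = SimpleImputer(strategy='mean')
df[['Age', 'Marks']] = imputer.fit_transform(df[['Age', 'Marks']])
print(df)  # fills Age -> 19.5 and Marks -> 83.0 for this placeholder data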
Week 4
4 Implement Linear Regression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
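A minimal sketch of the remaining steps, assuming a small Study_Hours-to-Marks dataset; the data values and step numbering are illustrative.
# Step 2: Create an example dataset (values are illustrative)
df = pd.DataFrame({'Study_Hours': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                   'Marks': [35, 42, 50, 55, 62, 68, 74, 80, 85, 92]})

# Step 3: Split into feature (X) and target (y)
X = df[['Study_Hours']]
y = df['Marks']

# Step 4: Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Fit the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Step 6: Predict and evaluate
y_pred = model.predict(X_test)
print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred)}")
print(f"R-squared: {r2_score(y_test, y_pred)}")

# Step 7: Plot the data and the fitted line
plt.scatter(X, y, label='Actual')
plt.plot(X, model.predict(X), color='red', label='Fitted line')
plt.xlabel('Study Hours')
plt.ylabel('Marks')
plt.legend()
plt.show()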
Week 5
5 Implement logistic regression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Steps 1-2: Create an example dataset (values are illustrative placeholders)
df = pd.DataFrame({'Study_Hours': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                   'Passed': [0, 0, 0, 0, 1, 0, 1, 1, 1, 1]})

# Step 3: Split the data into features (X) and target (y)
X = df[['Study_Hours']]  # Feature (independent variable)
y = df['Passed']  # Target (dependent variable)
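A minimal sketch of the training and evaluation steps, continuing from the illustrative dataset above.
# Step 4: Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Step 5: Train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 6: Predict and evaluate
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")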
Week 6
6 Implement decision tree induction for classification
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Steps 1-2: Create an example dataset (values are illustrative placeholders)
df = pd.DataFrame({'Study_Hours': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                   'Marks': [35, 42, 50, 55, 62, 68, 74, 80, 85, 92],
                   'Passed': [0, 0, 0, 0, 1, 0, 1, 1, 1, 1]})

# Step 3: Split the data into features (X) and target (y)
X = df[['Study_Hours', 'Marks']]  # Features (independent variables)
y = df['Passed']  # Target (dependent variable)
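The intermediate steps are sketched here so the evaluation prints below run; parameters and the tree visualization are illustrative.
# Step 4: Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Step 5: Train the decision tree classifier
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 6: Predict and compute the evaluation metrics
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Visualize the induced tree
plt.figure(figsize=(8, 6))
plot_tree(model, feature_names=['Study_Hours', 'Marks'], class_names=['Fail', 'Pass'], filled=True)
plt.show()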
# Output the evaluation results
print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')
Week 7
7 Implement random forest classifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Steps 1-2: Create an example dataset (values are illustrative placeholders)
df = pd.DataFrame({'Study_Hours': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                   'Marks': [35, 42, 50, 55, 62, 68, 74, 80, 85, 92],
                   'Passed': [0, 0, 0, 0, 1, 0, 1, 1, 1, 1]})

# Step 3: Split the data into features (X) and target (y)
X = df[['Study_Hours', 'Marks']]  # Features (independent variables)
y = df['Passed']  # Target (dependent variable)
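A minimal sketch of the training and evaluation steps, continuing from the illustrative dataset above.
# Step 4: Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Step 5: Train the random forest (an ensemble of 100 trees)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 6: Predict and evaluate
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")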
Week 8
8 Object segmentation using hierarchical clustering methods.
import numpy as np
import matplotlib.pyplot as plt
from skimage import io, color
from skimage.transform import resize
from scipy.cluster.hierarchy import linkage, fcluster
from skimage.segmentation import mark_boundaries
# Step 1: Load an image
image = io.imread('FDP on Cybersecurity.jpg')  # example image file
image_rgb = image / 255.0  # normalize pixel values to [0, 1]

# Downsample the image to reduce its size (e.g., resize to 1/4 of the original)
downsampled_image = resize(image_rgb, (image_rgb.shape[0] // 4, image_rgb.shape[1] // 4), mode='reflect')

# Show the original and downsampled image
plt.figure(figsize=(8, 6))
plt.subplot(1, 2, 1)
plt.imshow(image_rgb)
plt.title('Original Image')
plt.axis('off')
plt.subplot(1, 2, 2)
plt.imshow(downsampled_image)
plt.title('Downsampled Image')
plt.axis('off')
plt.show()

# Step 2: Pre-process the image (reshape for clustering)
# Flatten the downsampled image
pixels = downsampled_image.reshape(-1, 3)  # shape: (number_of_pixels, 3)

# Step 3: Use only a subset of pixels for clustering
# Select a random subset of pixels (e.g., 10,000)
subset_size = 10000
np.random.seed(42)  # for reproducibility
subset_indices = np.random.choice(pixels.shape[0], subset_size, replace=False)
subset_pixels = pixels[subset_indices]

# Step 4: Perform hierarchical clustering on the subset of pixels
Z = linkage(subset_pixels, method='ward')  # 'ward' minimizes variance within clusters

# Step 5: Assign clusters (a maxclust threshold segments the image)
num_clusters = 5  # you can change the number of clusters
clusters = fcluster(Z, num_clusters, criterion='maxclust')

# Step 6: Map the clustering result back to the full downsampled image.
# Since only a subset of pixels was clustered, create a label array covering
# every pixel and fill in the labels at the subset's indices.
cluster_labels_full = np.zeros(pixels.shape[0], dtype=int)

# Assign the clusters to the labels corresponding to the subset indices
cluster_labels_full[subset_indices] = clusters

# Reshape the cluster labels to the shape of the downsampled image
segmented_image = cluster_labels_full.reshape(downsampled_image.shape[0], downsampled_image.shape[1])

# Step 7: Visualize the segmented image
plt.figure(figsize=(8, 6))
plt.imshow(segmented_image, cmap='jet')  # 'jet' color map for better visualization
plt.title('Hierarchical Segmentation (Clustering)')
plt.axis('off')
plt.show()

# Step 8: Mark boundaries on the downsampled image (mode='thick' draws thicker boundaries)
boundaries = mark_boundaries(downsampled_image, segmented_image, color=(1, 0, 0), mode='thick')

# Show the image with boundaries marked
plt.figure(figsize=(8, 6))
plt.imshow(boundaries)
plt.title('Boundaries of Segments')
plt.axis('off')
plt.show()
OUTPUT: (figures: the original and downsampled images, the segmented image, and the segment boundaries)
Week 9
9 Perform visualization techniques (bar, column, line, scatter, 3D cubes)
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from mpl_toolkits.mplot3d import Axes3D

# Example data (illustrative placeholders)
categories = ['A', 'B', 'C', 'D']
values = [10, 24, 17, 30]
x = np.linspace(0, 10, 100)
y = np.sin(x)
z = np.cos(x)

# Bar chart
plt.figure(figsize=(8, 6))
plt.bar(categories, values, color='skyblue')
plt.xlabel('Categories')
plt.ylabel('Values')
plt.title('Bar Chart')
plt.show()
plt.figure(figsize=(8, 6))
plt.barh(categories, values, color='lightcoral')
plt.xlabel('Values')
plt.ylabel('Categories')
plt.title('Column Chart (Horizontal Bar)')
plt.show()
plt.figure(figsize=(8, 6))
plt.plot(x, y, label='sin(x)', color='blue')
plt.plot(x, z, label='cos(x)', color='green')
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.title('Line Chart')
plt.legend()
plt.show()
plt.figure(figsize=(8, 6))
plt.scatter(x, y, color='red', label='sin(x)')
plt.scatter(x, z, color='purple', label='cos(x)')
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.title('Scatter Plot')
plt.legend()
plt.show()
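For the "3D cubes" part of the experiment, a minimal sketch using the imported Axes3D toolkit; the data and styling are illustrative.
# 3D bar chart ('3D cubes')
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')
xpos, ypos = np.meshgrid(np.arange(4), np.arange(4))
xpos, ypos = xpos.ravel(), ypos.ravel()
zpos = np.zeros_like(xpos)
heights = np.random.randint(1, 10, size=16)  # cube heights (random for illustration)
ax.bar3d(xpos, ypos, zpos, dx=0.8, dy=0.8, dz=heights, color='teal')
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Height')
ax.set_title('3D Cubes (bar3d)')
plt.show()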
Week 10
10 Perform descriptive analytics on healthcare data.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Example healthcare data (placeholder; the original dataset is not shown)
data = {'Age': [25, 34, 45, 52, 61, 29, 48, 55],
        'BloodPressure': [120, 128, 135, 140, 150, 118, 138, 145],
        'Cholesterol': [180, 195, 210, 225, 240, 175, 215, 230]}

# Create a DataFrame
df = pd.DataFrame(data)
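A minimal sketch of the descriptive step on this placeholder data: summary statistics and a quick distribution plot (the column choices are illustrative).
# Summary statistics for each measure
print(df.describe())
print("\nCorrelations:\n", df.corr())

# Distribution of patient ages
sns.histplot(df['Age'], bins=5, kde=True)
plt.title('Age Distribution')
plt.xlabel('Age')
plt.show()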
Week 11
11 Perform predictive analytics on product sales data.
# Step 1: Import Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
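The dataset-creation step is sketched below; the feature names follow the preprocessing step and the coefficients in the output (Price, Advertising, Season), while the Sales target and all values are illustrative assumptions.
# Step 2: Create an example product-sales dataset (values are illustrative)
data = {
    'Price': [200, 220, 250, 210, 230, 240, 260, 205],
    'Advertising': [500, 600, 550, 520, 580, 620, 640, 510],
    'Season': ['Spring', 'Summer', 'Fall', 'Winter', 'Spring', 'Summer', 'Fall', 'Winter'],
    'Sales': [120, 150, 130, 100, 125, 155, 135, 105]
}
df = pd.DataFrame(data)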
# Step 3: Preprocess the Data
# Convert 'Season' to numeric values (Spring = 0, Summer = 1, Fall = 2, Winter = 3)
df['Season'] = df['Season'].map({'Spring': 0, 'Summer': 1, 'Fall': 2, 'Winter': 3})
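A minimal sketch of the remaining steps, mirroring Week 12's flow; note that the coefficient values printed under OUTPUT below come from the manual's original dataset, not from the illustrative data above.
# Steps 4-7: Split, train, evaluate, and inspect the model
X = df[['Price', 'Advertising', 'Season']]
y = df['Sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred)}")
print(f"R-squared: {r2_score(y_test, y_pred)}")
print("\nModel Coefficients:")
print(pd.DataFrame(model.coef_, X.columns, columns=['Coefficient']))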
OUTPUT:
Model Coefficients:
Coefficient
Price 0.000014
Advertising 0.002721
Season -8.382353
Week 12
12 Apply predictive analytics for weather forecasting.
# Step 1: Import Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Step 2: Create a Mock Weather Dataset
data = {
    'Temperature': [30, 32, 33, 31, 29, 28, 25, 27, 30, 31, 33, 35, 36, 37, 34],
    'Humidity': [80, 75, 77, 70, 85, 88, 90, 85, 80, 78, 76, 74, 73, 72, 71],
    'Wind Speed': [10, 12, 15, 11, 13, 14, 9, 10, 12, 11, 10, 9, 8, 7, 6],
    'Pressure': [1010, 1012, 1011, 1010, 1011, 1013, 1012, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017],
    'Month': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3]
}

# Convert the dictionary into a pandas DataFrame
df = pd.DataFrame(data)
# Step 3: Explore the Dataset
print(df.head())
print("\nSummary Statistics:")
print(df.describe())
# Step 4: Handle Missing Values (not needed in this mock dataset)
# df.fillna(df.median(), inplace=True)
# Step 5: Select Features and Target Variable
X = df[['Humidity', 'Wind Speed', 'Pressure', 'Month']]  # Features
y = df['Temperature']  # Target variable (Temperature)
# Step 6: Split the Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Step 7: Train the Linear Regression Model
model = LinearRegression()
model.fit(X_train, y_train)
# Step 8: Make Predictions
y_pred = model.predict(X_test)
# Step 9: Evaluate the Model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"\nMean Squared Error: {mse}")
print(f"R-squared: {r2}")
# Step 10: Visualize the Results
# Plot Actual vs Predicted Temperature
plt.scatter(y_test, y_pred)
plt.xlabel('Actual Temperature')
plt.ylabel('Predicted Temperature')
plt.title('Actual vs Predicted Temperature')
plt.show()
# Step 11: Model Interpretation
# Display model coefficients
coefficients = pd.DataFrame(model.coef_, X.columns, columns=['Coefficient'])
print("\nModel Coefficients:")
print(coefficients)
OUTPUT:
Temperature Humidity Wind Speed Pressure Month
0 30 80 10 1010 1
1 32 75 12 1012 2
2 33 77 15 1011 3
3 31 70 11 1010 4
4 29 85 13 1011 5
Summary Statistics:
Temperature Humidity Wind Speed Pressure Month
count 15.000000 15.000000 15.000000 15.000000 15.000000
mean 31.400000 78.266667 10.466667 1012.466667 5.600000
std 3.376389 6.284524 2.503331 2.199567 3.718679
min 25.000000 70.000000 6.000000 1010.000000 1.000000
25% 29.500000 73.500000 9.000000 1011.000000 2.500000
50% 31.000000 77.000000 10.000000 1012.000000 5.000000
75% 33.500000 82.500000 12.000000 1013.500000 8.500000
max 37.000000 90.000000 15.000000 1017.000000 12.000000
Model Coefficients:
Coefficient
Humidity -0.388341
Wind Speed 0.377432
Pressure 0.837989
Month -0.013751