MACHINE LEARNING Manual
MACHINE LEARNING Manual
SKILL ADVANCED
COURSE
B.TECH
III YEAR – II SEM
def
load_csv(fi
lepath):
data = []
col = []
checkc
ol =
False
with
open(filepath
) as f: for val
in
f.readlines():
val = val.replace("\n","")
val = val.split(',')
if checkcol
is False:
col = val
checkco
l = True
else:
data.append(val)
df = pd.DataFrame(data=data,
columns=col) return df
2. Numpy.loadtxt function
df = np.loadtxt('convertcsv.csv',
delimeter = ',') print(df[:5,:])
3. Numpy.genfromtxt()
data = np.genfromtxt('100 Sales Records.csv', delimiter=',')
>>> pd.DataFrame(data)
4. Pandas.read_csv()
>>> pdDf = pd.read_csv('100 Sales Record.csv')
>>> pdDf.head()
5. Pickle
with
open('test.pkl','wb
') as f:
pickle.dump(pdD
f, f)
WEEK-2: Data preprocessing
1. Handling missing values
isnull()
notnull()
dropna()
fillna()
replace()
interpolate()
# importing pandas as pd
import pandas as pd
# importing numpy as np
import numpy as np
# dictionary of lists
dict = {'First Score':[100, 90, np.nan, 95],
'Second Score': [30, 45, 56, np.nan],
'Third Score':[np.nan, 40, 80, 98]}
# filtering data
# displaying data only with Gender = NaN
data[bool_series]
# importing pandas as pd
import pandas as pd
# importing numpy as np
import numpy as np
# dictionary of lists
dict = {'First Score':[100, 90, np.nan, 95],
'Second Score': [30, 45, 56, np.nan],
'Third Score':[np.nan, 40, 80, 98]}
# importing pandas as pd
import pandas as pd
# importing numpy as np
import numpy as np
# dictionary of lists
dict = {'First Score':[100, 90, np.nan, 95],
'Second Score': [30, 45, 56, np.nan],
'Third Score':[np.nan, 40, 80, 98]}
import pandas as pd
# importing numpy as np
import numpy as np
# dictionary of lists
dict = {'First Score':[100, 90, np.nan, 95],
'Second Score': [30, 45, 56, np.nan],
'Third Score':[np.nan, 40, 80, 98]}
# creating a dataframe from dictionary
df = pd.DataFrame(dict)
# importing numpy as np
import numpy as np
# dictionary of lists
dict = {'First Score':[100, 90, np.nan, 95],
'Second Score': [30, 45, 56, np.nan],
'Third Score':[np.nan, 40, 80, 98]}
# importing pandas as pd
import pandas as pd
# x axis values
x = [1,2,3]
# corresponding y axis values
y = [2,4,1]
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
x = pd.read_csv("C:\...\cancer.csv")
a = np.array(x)
y = a[:,30] # classes having 0 and 1
x = np.column_stack((x.malignant,x.benign))
x.shape
print (x),(y)
WEEK-5: Supervised Learning
1. Implementation of Linear Regression
import numpy as np
import matplotlib.pyplot as plt
def estimate_coef(x, y):
n = np.size(x)
m_x = np.mean(x)
m_y = np.mean(y)
SS_xy = np.sum(y*x) - n*m_y*m_x
SS_xx = np.sum(x*x) - n*m_x*m_x
b_1 = SS_xy / SS_xx
b_0 = m_y - b_1*m_x
return (b_0, b_1)
def plot_regression_line(x, y, b):
# plotting the actual points as scatter plot
plt.scatter(x, y, color = "m",marker = "o", s = 30)
y_pred = b[0] + b[1]*x
plt.plot(x, y_pred, color = "g")
plt.xlabel('x')
plt.ylabel('y')
plt.show()
def main():
x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])
b = estimate_coef(x, y)
print("Estimated coefficients:\nb_0 = {}\\nb_1 = {}".format(b[0], b[1]))
plot_regression_line(x, y, b)
if name == " main ":
WEEK-6 : Implementation of Logistic regression
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings( "ignore" )
class LogitRegression() :
def init ( self, learning_rate, iterations ) :
self.learning_rate = learning_rate
self.iterations = iterations
def fit( self, X, Y ) :
self.m, self.n = X.shape
self.W = np.zeros( self.n )
self.b = 0
self.X = X
self.Y = Y
for i in range( self.iterations ) :
self.update_weights()
return self
def update_weights( self ) :
A = 1 / ( 1 + np.exp( - ( self.X.dot( self.W ) + self.b ) ) )
tmp = ( A - self.Y.T )
tmp = np.reshape( tmp, self.m )
dW = np.dot( self.X.T, tmp ) / self.m
db = np.sum( tmp ) / self.m
self.W = self.W - self.learning_rate * dW
self.b = self.b - self.learning_rate * db
return self
def predict( self, X ) :
Z = 1 / ( 1 + np.exp( - ( X.dot( self.W ) + self.b ) ) )
Y = np.where( Z > 0.5, 1, 0 )
return Y
def main() :
df = pd.read_csv( "diabetes.csv" )
X = df.iloc[:,:-1].values
Y = df.iloc[:,-1:].values
X_train, X_test, Y_train, Y_test = train_test_split(
X, Y, test_size = 1/3, random_state = 0 )
model = LogitRegression( learning_rate = 0.01, iterations = 1000 )
model.fit( X_train, Y_train )
model1 = LogisticRegression()
model1.fit( X_train, Y_train)
Y_pred = model.predict( X_test )
Y_pred1 = model1.predict( X_test )
correctly_classified = 0
correctly_classified1 = 0
count = 0
for count in range( np.size( Y_pred ) ) :
if Y_test[count] == Y_pred[count] :
correctly_classified = correctly_classified + 1
if Y_test[count] == Y_pred1[count] :
correctly_classified1 = correctly_classified1 + 1
count = count + 1
print( "Accuracy on test set by our model : ", (
correctly_classified / count ) * 100 )
print( "Accuracy on test set by sklearn model : ", (
correctly_classified1 / count ) * 100 )
if name == " main " :
main()
# importing pandas package
import pandas as pd
# making data frame from csv file
data = pd.read_csv("employees.csv")
# Printing the first 10 to 24 rows of
# the data frame for visualization
data[10:25]
WEEK-7: Supervised Learning
1. Implementation of Decision tree classification
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
def importdata():
balance_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-
'+'databases/balance-scale/balance-scale.data',sep= ',', header = None)
print ("Dataset Length: ", len(balance_data))
print ("Dataset Shape: ", balance_data.shape)
print ("Dataset: ",balance_data.head())
return balance_data
def splitdataset(balance_data):
X = balance_data.values[:, 1:5]
Y = balance_data.values[:, 0]
X_train, X_test, y_train, y_test = train_test_split(
X, Y, test_size = 0.3, random_state = 100)
return X, Y, X_train, X_test, y_train, y_test
def train_using_gini(X_train, X_test, y_train):
clf_gini = DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=3,
min_samples_leaf=5)
clf_gini.fit(X_train, y_train)
return clf_gini
def tarin_using_entropy(X_train, X_test, y_train):
clf_entropy = DecisionTreeClassifier(criterion = "entropy", random_state = 100,max_depth = 3,
min_samples_leaf = 5)
clf_entropy.fit(X_train, y_train)
return clf_entropy
def prediction(X_test, clf_object):
y_pred = clf_object.predict(X_test)
print("Predicted values:")
print(y_pred)
return y_pred
def cal_accuracy(y_test, y_pred):
print("Confusion Matrix: ",confusion_matrix(y_test, y_pred))print ("Accuracy :
",accuracy_score(y_test,y_pred)*100)
print("Report : ",
classification_report(y_test, y_pred))
def main():
data = importdata()
X, Y, X_train, X_test, y_train, y_test = splitdataset(data)
clf_gini = train_using_gini(X_train, X_test, y_train)
clf_entropy = tarin_using_entropy(X_train, X_test, y_train)
print("Results Using Gini Index:")
y_pred_gini = prediction(X_test, clf_gini)
cal_accuracy(y_test, y_pred_gini)
print("Results Using Entropy:")
y_pred_entropy = prediction(X_test, clf_entropy)
cal_accuracy(y_test, y_pred_entropy)
if name ==" main ":
main()
1. Implementation of K-nearest Neighbor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
import numpy as np
import matplotlib.pyplot as
plt y = irisData.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)neighbors
= np.arange(1, 9)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))
for i, k in enumerate(neighbors):
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)
train_accuracy[i] = knn.score(X_train, y_train)
test_accuracy[i] = knn.score(X_test, y_test)
plt.plot(neighbors, test_accuracy, label = 'Testing dataset Accuracy')
plt.plot(neighbors, train_accuracy, label = 'Training dataset Accuracy')
plt.legend()
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy')
plt.show()
WEEK-8
def std_dev(numbers):
avg = mean(numbers)
variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)
return math.sqrt(variance)
def MeanAndStdDev(mydata):
info = [(mean(attribute), std_dev(attribute)) for attribute in zip(*mydata)]
del info[-1]
return info
def MeanAndStdDevForClass(mydata):
info = {}
dict = groupUnderClass(mydata)
for classValue, instances in dict.items():
info[classValue] = MeanAndStdDev(instances)
return info
y = irisData.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)
neighbors = np.arange(1, 9)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))
for i, k in enumerate(neighbors):
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)
train_accuracy[i] = knn.score(X_train, y_train)
test_accuracy[i] = knn.score(X_test, y_test)
plt.plot(neighbors, test_accuracy, label = 'Testing dataset Accuracy')
plt.plot(neighbors, train_accuracy, label = 'Training dataset Accuracy')
plt.legend()
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy')
plt.show()
WEEK-10: Build Artificial Neural Network model with back propagation
Let’s first understand the term neural networks. In a neural network, where neurons are
fed inputs which then neurons consider the weighted sum over them and pass it by an
activation function and passes out the output to next neuron.
model.compile(loss='binary_crossentropy', optimizer='adam',
metrics=['accuracy'])
model.summary()
WEEK-11
a). Implementing Random
Forest # Importing the
libraries import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
data = pd.read_csv('Salaries.csv')
print(data)
# Fitting Random Forest Regression to the dataset
# import the regressor
from sklearn.ensemble import RandomForestRegressor
# create regressor object
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
# fit the regressor with x and y data
regressor.fit(x, y)
Y_pred = regressor.predict(np.array([6.5]).reshape(1, 1)) # test the output by changing values
# Visualising the Random Forest Regression results
# arrange for creating a range of values
# from min value of x to max
# value of x with a difference of 0.01
# between two consecutive values
X_grid = np.arrange(min(x), max(x), 0.01)
# reshape for reshaping the data into a len(X_grid)*1 array,
# i.e. to make a column out of the X_grid value
X_grid = X_grid.reshape((len(X_grid), 1))
# Scatter plot for original data
plt.scatter(x, y, color = 'blue')
# plot predicted data
plt.plot(X_grid, regressor.predict(X_grid),color = 'green')
plt.title('Random Forest Regression')
plt.xlabel('Position level') plt.ylabel('Salary')
def FindColMinMax(items):n
= len(items[0]);
minima = [sys.maxint for i in range(n)];
maxima = [-sys.maxint -1 for i in range(n)];
for item in items:
for f in range(len(item)):
if (item[f] < minima[f]):
minima[f] = item[f];
if (item[f] > maxima[f]):
maxima[f] = item[f];
return minima,maxima;
def InitializeMeans(items, k, cMin, cMax):
# Initialize means to random numbers between
# the min and max of each column/feature
f = len(items[0]); # number of features
means = [[0 for i in range(f)] for j in range(k)];
for mean in means:
for i in range(len(mean)):
# Set value to a random float
# (adding +-1 to avoid a wide placement of a mean)
mean[i] = uniform(cMin[i]+1, cMax[i]-1);
return means;
def UpdateMean(n,mean,item):
for i in range(len(mean)):
m = mean[i];
m = (m*(n-1)+item[i])/float(n);
mean[i] = round(m, 3);
return mean;
def Classify(means,item):
# Classify item to the mean with minimum distance
minimum = sys.maxint;
index = -1;
for i in range(len(means)):
# Find distance from item to mean
dis = EuclideanDistance(item, means[i]);
if (dis < minimum):
minimum = dis;
index = i;
return index;
def CalculateMeans(k,items,maxIterations=100000):
# Find the minima and maxima for columns
cMin, cMax = FindColMinMax(items);
# Initialize means at random points
means = InitializeMeans(items,k,cMin,cMax);
# Initialize clusters, the array to hold
# the number of items in a class
clusterSizes= [0 for i in range(len(means))];
# An array to hold the cluster an item is in
belongsTo = [0 for i in range(len(items))];
# Calculate means
for e in range(maxIterations):
# If no change of cluster occurs, halt
noChange = True;
for i in range(len(items)):
item = items[i];
# Classify item into a cluster and update the
# corresponding means.
index = Classify(means,item);
clusterSizes[index] += 1;
cSize = clusterSizes[index];
means[index] = UpdateMean(cSize,means[index],item);
# Item changed cluster
if(index != belongsTo[i]):
noChange = False;
belongsTo[i] = index;
# Nothing changed,
return if (noChange):
Break
return means;
def FindClusters(means,items):
clusters = [[] for i in range(len(means))]; # Init clusters
for item in items:
# Classify item into a cluster
index =
Classify(means,item); # Add
item to cluster
clusters[index].append(item);
return clusters;
K-means
from sklearn.cluster import KMeans
#from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
data=pd.read_csv("kmeansdata.csv")
df1=pd.DataFrame(data)
print(df1)
f1 = df1['Distance_Feature'].values
f2 = df1['Speeding_Feature'].values
X=np.matrix(list(zip(f1,f2)))
plt.plot()
plt.xlim([0, 100])
plt.ylim([0, 50])
plt.title('Dataset')
plt.ylabel('speeding_feature')
plt.xlabel('Distance_Feature')
plt.scatter(f1,f2)
plt.show()
# create new plot and data
plt.plot()
colors = ['b', 'g', 'r']
Page 33
markers = ['o', 'v', 's']
# KMeans algorithm
#K = 3
kmeans_model = KMeans(n_clusters=3).fit(X)
plt.plot()
for i, l in enumerate(kmeans_model.labels_):
plt.plot(f1[i], f2[i], color=colors[l], marker=markers[l],ls='None')
plt.xlim([0, 100])
plt.ylim([0, 50])
plt.show()
Driver_ID,Distance_Feature,Speeding_Feature
3423311935,71.24,28
3423313212,52.53,25
3423313724,64.54,27
3423311373,55.69,22
3423310999,54.58,25
3423313857,41.91,10
3423312432,58.64,20
3423311434,52.02,8
3423311328,31.25,34
3423312488,44.31,19
3423311254,49.35,40
3423312943,58.07,45
3423312536,44.22,22
3423311542,55.73,19
3423312176,46.63,43
3423314176,52.97,32
3423314202,46.25,35
3423311346,51.55,27
3423310666,57.05,26
3423313527,58.45,30
3423312182,43.42,23
3423313590,55.68,37
3423312268,55.15,18
Page 34