Codes

The document collects R scripts for several machine learning techniques applied to different datasets: 1. CART on the Cereals dataset, building a regression tree (and a classification tree on the Play dataset) and evaluating performance, with a random forest comparison. 2. Logistic regression on the Universal Bank dataset to predict PersonalLoan from Income, evaluated with confusion matrices, ROC curves and lift charts. 3. k-nearest neighbours (kNN) applied to the wine and iris datasets for estimation and classification, validated with metrics such as MAPE. 4. Naive Bayes classification demonstrated on the Iris dataset using the caret and klaR packages. The scripts also cover PCA, linear regression with caret, Cook's distance diagnostics, data preprocessing and feature selection.

# This script uses CART to do estimation

# Uses the Cereals dataset


# Load cereals.csv
dc<-read.csv("cereals.csv")

# Remove the fields "Cereals" and "shelf"


fld<-!(names(dc) %in% c("Cereals","shelf"))
dcm<-dc[,fld]

# Partition into training and validation datasets


set.seed(34)
r<-sample(seq_len(nrow(dc)),0.7*nrow(dc),replace=F)
dt<-dcm[r,]
dv<-dcm[-r,]

# Use rpart to create the CART


library(rpart)
m.cart<-rpart(rating~.,data=dt,parms=list(split="information"),
  control=rpart.control(minsplit=2,minbucket=3))

# See the model output


summary(m.cart)

# Plot the tree


plot(m.cart,uniform=T,margin=.2)
text(m.cart,cex=.8, pretty=0)

# Get the ps plot for better visibility


post(m.cart,file="cereals.ps")

# Check where to prune

# Use the Cp and X validation plot


plotcp(m.cart)

# use r square
rsq.rpart(m.cart)
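
# Optional sketch (not in the original script): prune at the cp value with the
# minimum cross-validated error (xerror), the same idea as the commented
# prune() call in the classifier script further down; bestcp and m.pruned are
# illustrative names.
bestcp<-m.cart$cptable[which.min(m.cart$cptable[,"xerror"]),"CP"]
m.pruned<-prune(m.cart,cp=bestcp)
plot(m.pruned,uniform=T,margin=.2)
text(m.pruned,cex=.8)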

# Use the model to predict the rating for validation data


m.val<-predict(m.cart, newdata=dv)

# Get the difference between the predicted and observed values of rating
diff<-dv$rating-m.val

# Plot the result


plot(dv$rating,diff,pch=20, col="red")
abline(h=0, col="blue")

# Prepare to get the scored data into a csv file


output<-data.frame(list(m.val,dv$rating))
colnames(output)<-c("Predicted","Observed")
write.csv(output,"score.csv",row.names=F)

# Use RandomForest to compare results


library(randomForest)
m.rf<-randomForest(rating~.,dt)
varImpPlot(m.rf)
# Predict using the RandomForest Model
p.rf<-predict(m.rf,newdata=dv)

# Write the score of both the models along with observed data for comparison
output<-data.frame(list(m.val,p.rf,dv$rating))
colnames(output)<-c("PredictedCART","PredictedRF","Observed")
write.csv(output,"score.csv",row.names=F)
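
# Optional sketch (not in the original script): a quick numeric comparison of
# the two models on the validation set using RMSE; rmse.cart and rmse.rf are
# illustrative names.
rmse.cart<-sqrt(mean((dv$rating-m.val)^2))
rmse.rf<-sqrt(mean((dv$rating-p.rf)^2))
print(paste("RMSE CART:",round(rmse.cart,2),"RMSE RF:",round(rmse.rf,2)))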

# This script is for a classifier using CART


library(rpart)

# Read the play.csv dataset


d<-read.csv("Play.csv ")

# Use the rpart function to generate the CART


m<-rpart(Play~.,d,method="class",parms=list(split="information"),
  control=rpart.control(minsplit=2,minbucket=1))

# The following rpart includes a prior proportion of the two classes


#rpart(Play~.,d,method="class",parms=list(prior=c(.8,.2),split="information"),control=rpart.control(minsplit=2,minbucket=1))

# See the rules generated by the CART


print(summary(m))

# See the complexity at each partition


printcp(m)

# Pruning the tree based on the minimum of cross validation error 'xerror'
# This is commented here as there will be only the root node if the pruning is done in this example
#pm<-prune(m,cp=m$cptable[which.min(m$cptable[,"xerror"]),"CP"])

# Plot the tree to see the same visually


plot(m,uniform=T,branch=1,margin=.2)
text(m,cex=.6, pretty=0)

# Following will generate a more decorative tree in ps format


post(m,file="tree.ps")

# Predicting on the training dataset, as no validation dataset was defined.


# Predict will compute the classes of the given records
p<-predict(m,newdata=d,type="class")

# Get the confusion matrix


print(table(d$Play,p))

# This script performs Logistic Regression on the Universal Bank Dataset


# The dependent variable PersonalLoan is predicted based on single predictor Income

# Use Library Caret


library(caret)

# Read the dataset into a data frame


d<-read.csv("UniversalBank.csv")

# Convert PersonalLoan Column to factor


d$PersonalLoan=as.factor(as.character(d$PersonalLoan))

# Divide into training and validation partition


set.seed(53)
r<-createDataPartition(y=d$PersonalLoan,p=0.7,list=F)
dt<-d[r,]
dv<-d[-r,]

# Run Logistic Regression on the training dataset using Income as predictor
# and PersonalLoan as outcome variable
m.lr<-train(PersonalLoan~Income, data=dt, method="glm",family="binomial")
summary(m.lr)$coef
summary(m.lr)

# Predict the result on validation data


fit.m=predict(m.lr,dv)

confusionMatrix(fit.m,dv$PersonalLoan, positive = "1")

# Draw ROC Curve


library("ROCR")
fit.m=predict(m.lr,dv, type="prob")
pred<-prediction(fit.m[,2],dv$PersonalLoan)
perf<-performance(pred,"tpr","fpr")
plot(perf,main="ROC Curve",colorize=T)
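
# Optional sketch (not in the original script): the area under the ROC curve
# can also be extracted with ROCR's performance().
auc<-performance(pred,"auc")
print(paste("AUC:",round(auc@y.values[[1]],3)))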

# As the ROC Curve shows that high TPR can be achieved at low
# cutoff probabilities, draw confusion matrix accordingly
predcton<-ifelse(fit.m[,2]>0.2,1,0)
print("At cutoff=0.2")
print(table(dv$PersonalLoan,predcton))

# Try with even lower cutoff at the cost of FPR


predcton<-ifelse(fit.m[,2]>0.1,1,0)
print("At cutoff=0.1")
print(table(dv$PersonalLoan,predcton))

# Draw lift charts


library(lift)
plotLift(fit.m[,2],dv$PersonalLoan)
plotLift(fit.m[,2],dv$PersonalLoan,cumulative = F)

plotLift(predcton,dv$PersonalLoan)
plotLift(predcton,dv$PersonalLoan,cumulative = F)

# Copy the data to csv file for further analysis


dout=data.frame(Outcome=predcton)
dv$Outcome=dout$Outcome
write.csv(dv,"PredUniversalBank.csv",row.names = F)

# This script performs Logistic Regression on the Universal Bank Dataset


# The dependent variable PersonalLoan is predicted based on single predictor Income
# Read the dataset into a data frame
d<-read.csv("UniversalBank.csv")

# Divide into training and validation partition


set.seed(53)
r<-sample(seq_len(nrow(d)),0.7*nrow(d),replace=F)
dt<-d[r,]
dv<-d[-r,]

# Run Logistic Regression on the training dataset using Income as predictor
# and PersonalLoan as outcome variable
nrow(dt)
m.lr<-glm(PersonalLoan~Income,data=dt,family="binomial")

# As we are using only one predictor, we can plot and see the relation
# between the observed and predicted points

plot(PersonalLoan~Income,data=dt)

lines(dt$Income,m.lr$fitted.values,type="p",col="blue")

# See the details of the model generated


print(summary(m.lr))

# See the anova table with Chi Squared test


anova(m.lr,test="Chisq")

# Use the model on validation data to see the performance


# Type 'response' ensures that the output is in the same scale as the
# response variable, and not on logit scale
m.vl<-predict(m.lr,newdata=dv, type="response")

# Draw ROC Curve


library("ROCR")
pred<-prediction(m.vl,dv$PersonalLoan)
perf<-performance(pred,"tpr","fpr")
plot(perf,main="ROC Curve",colorize=T)

# As the ROC Curve shows that high TPR can be achieved at low
# cutoff probabilities, draw confusion matrix accordingly
predcton<-ifelse(m.vl>0.15,1,0)
print("At cutoff=0.15")
print(table(dv$PersonalLoan,predcton))

# Try with even lower cutoff at the cost of FPR


predcton<-ifelse(m.vl>0.1,1,0)
print("At cutoff=0.1")
print(table(dv$PersonalLoan,predcton))

# Draw lift charts


library(lift)
plotLift(m.vl,dv$PersonalLoan)
plotLift(m.vl,dv$PersonalLoan,cumulative = F)

plotLift(predcton,dv$PersonalLoan)
plotLift(predcton,dv$PersonalLoan,cumulative = F)

# Copy the data to csv file for further analysis


dout=data.frame(Outcome=predcton)
dv$Outcome=dout$Outcome
write.csv(dv,"PredUniversalBank.csv",row.names = F)
# Using caret package for KNN
library(caret)

# Read dataset
wine=read.csv(choose.files())
names(wine)

# Use for estimation

# Training and Validation Partition


set.seed(234)
rec=createDataPartition(y=wine$Origin,p=0.7,list=F)
winet=wine[rec,]
winev=wine[-rec,]

# Preprocess. Scaling is necessary if the ranges of the different variables are different
summary(winet)
pps=preProcess(winet[-1],method = c("scale","center"))
winetpp=predict(pps,winet)

# Check the output


names(winetpp)
head(winetpp)

# Develop the model and check


set.seed(234)
ctrl=trainControl(method="repeatedcv",number = 10,repeats = 3)
mod=train(Origin~.,data=winetpp,trControl=ctrl,method="knn",tuneLength=10)
mod
plot(mod)

# validate the model


winevpp=predict(pps,winev)
pred=predict(mod,winevpp)
pred
mape=sum(abs((pred-winevpp$Origin)/winevpp$Origin))/nrow(winevpp)*100
mape

# Use for Classification

# Convert the output as factor


wine$Origin=factor(wine$Origin)

# Training and Validation Partition


set.seed(234)
rec=createDataPartition(y=wine$Origin,p=0.7,list=F)
winet=wine[rec,]
winev=wine[-rec,]

# Preprocess. Scaling is necessary if the ranges of the different variables are different
summary(winet)
pps=preProcess(winet[-1],method = c("scale","center"))
winetpp=predict(pps,winet)

# Check the output


names(winetpp)
head(winetpp)

# Develop the model and check


set.seed(234)
ctrl=trainControl(method="repeatedcv",number = 10,repeats = 3)
mod=train(Origin~.,data=winetpp,trControl=ctrl,method="knn",tuneLength=20)
mod
plot(mod)

# validate the model


winevpp=predict(pps,winev)
pred=predict(mod,winevpp)
pred
table(winevpp$Origin,pred)
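
# Optional sketch (not in the original script): caret's confusionMatrix() adds
# accuracy, kappa and per-class statistics to the raw table above.
confusionMatrix(pred,winevpp$Origin)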

# Using k-Nearest Neighbor in the Iris dataset to classify in terms of Species
# Load dataset & library

library(class)
di<-read.table("iris.csv",header=T,sep=",")

# Training and Validation Partitions


set.seed(1020)
part<-sample(1:nrow(di),ceiling(2/3*nrow(di)),replace=F)
trn<-di[part,]
val<-di[-part,]

# Run kNN algorithm


pred<-knn(trn[,-5],val[,-5],trn[,5],k=3)

# Generate Confusion Matrix


print("k=3")
print(table(pred,val[,5]))

# Finetune model with different values of k


pred<-knn(trn[,-5],val[,-5],trn[,5],k=1)
print("k=1")
print(table(pred,val[,5]))

pred<-knn(trn[,-5],val[,-5],trn[,5],k=5)
print("k=5")
print(table(pred,val[,5]))
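
# Optional sketch (not in the original script): instead of checking a few k
# values by hand, loop over a range of k and track validation accuracy;
# ks and acc are illustrative names.
ks<-1:15
acc<-sapply(ks,function(k){
  p<-knn(trn[,-5],val[,-5],trn[,5],k=k)
  mean(p==val[,5])
})
plot(ks,acc,type="b",xlab="k",ylab="Validation accuracy")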


# Use Iris dataset with Naive Bayes - Caret Package

# Open Dataset
di<-read.csv("iris.csv",header=T)
head(di)

# Call Library
library(caret)

# Partition dataset
rec=createDataPartition(y=di$Species,p=0.7,list=F)
dit=di[rec,]
div=di[-rec,]

# Train the model and explore


mnb=train(Species~.,data=dit,method="nb")
summary(mnb)
names(mnb)
mnb$results

# Use the model to predict on validation dataset and evaluate


outp=predict(mnb,div)
table(div$Species,outp)

# Output the probability values instead of class


outp=predict(mnb,div,type="prob")
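
# Optional sketch (not in the original script): confusionMatrix() summarises the
# same comparison as table() above, adding accuracy and per-class statistics.
confusionMatrix(predict(mnb,div),factor(div$Species))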

# Using Naive Bayes in the Iris dataset to classify in terms of Species


# Load dataset & library
library(klaR)

di<-read.table("iris.csv",header=T,sep=",")

# Training and Validation Partitions


set.seed(1020)
part<-sample(1:nrow(di),ceiling(2/3*nrow(di)),replace=F)
trn<-di[part,]
val<-di[-part,]

# Develop model over training set


m.tr<-NaiveBayes(Species~.,data=trn)

# Plot the model


plot(m.tr)

# Check the performance of the model, first on the training set


m.vl<-predict(m.tr,newdata=trn)

# Print the confusion matrix for training set


print(table(trn$Species,m.vl$class))

# Print the confusion matrix for validation set


m.vl<-predict(m.tr,val)
print(table(val$Species,m.vl$class))

# Load a new dataset which does not have the classification


pd<-read.table("irispred.csv",header=T,sep=",")
# PCA using the provided functions
# Load the wine.csv data frame
wine<-read.csv("wine.csv",header=T)

# Plot to see correlations


# Not including 'Price' in scatterplot matrix, as we are checking
# correlations amongst the predictors
# 'Price' is the dependent variable
library(car)
scatterplotMatrix(wine[-4])

# Find the principal Components


wine.pc=prcomp(wine[-4],center=T,scale=T)

# See what are the results of prcomp


print(paste("What does the prcomp result in?"))
print(names(wine.pc))

# See the summary of the PCA


print(paste("Summary of PCA"))
print(summary(wine.pc))

# Get the eigenvalues as square of st dev - i.e., the variance


print(paste("Eigenvalues :"))
print(wine.pc$sdev^2)

# Draw a screeplot to decide how many components to take


screeplot(wine.pc,main="Scree Plot",xlab="Components")
screeplot(wine.pc,type="line", main="Scree Plot")

# Dotplot PC1
library(lattice)

load = wine.pc$rotation
print(paste("The loadings are as follows: "))
print(load)

# order the weights of PC1 per variable


ordered.load=load[order(load[,1]),1]
dotplot(ordered.load,main="Loadings Plot of PC1",xlab="Variable Loadings",col="red",cex=1.5)

# Dotplot PC2
ordered.load2=load[order(load[,2]),2]
dotplot(ordered.load2,main="Loadings Plot of PC2",xlab="Variable Loadings",col="red",cex=1.5)

# Draw a biplot
biplot(wine.pc,cex=c(1,0.7))

# Print the final scored data


print(wine.pc$x)
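
# Optional sketch (not in the original script): the component scores can be used
# in a regression of Price on the first two components (principal components
# regression), assuming the dependent variable column is named Price as the
# comments above indicate; pcr.dat and m.pcr are illustrative names.
pcr.dat<-data.frame(Price=wine$Price,wine.pc$x[,1:2])
m.pcr<-lm(Price~PC1+PC2,data=pcr.dat)
summary(m.pcr)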

# Cereal dataset - to predict rating

# Use library caret


library(caret)

# load dataframe
cer=read.csv(choose.files(),header=T)

# remove the first column - Cereal - as that is just the name and may not be important
cer=cer[,-1]

# partition into training and validation partitions


rec=createDataPartition(cer$rating,p=0.7,list=F)
certrain=cer[rec,]
cerval=cer[-rec,]

# create the model and see the outcomes


mlm=train(rating~.,data=certrain,method="lm")
summary(mlm)

# list the importance of variables - in this case by the absolute value of t-statistics
varImp(mlm)
plot(varImp(mlm))

# predict on the validation set


predrating=predict(mlm,cerval)
mape=sum(abs((cerval$rating-predrating)/cerval$rating))/nrow(cerval)*100
mape

# This is a script to draw the plot of Cook's distance for a regression model
# and identify those points which have a higher value of Cook's distance

# Read in the csv file

d<-read.csv("regtest.csv")

# Partition into training and validation data set


sam<-sample(seq_len(nrow(d)),0.7*nrow(d),replace=F)

dt<-d[sam,]

dv<-d[-sam,]
# Develop the regression model based on the training set
m.lm<-lm(endurance~age, data=dt)

# Compute cooks distance


cd<-cooks.distance(m.lm)

# Identify the data points which have a Cook's distance greater than 0.04
tp<-seq_along(cd)
ip<-tp[cd>0.04]

iv<-cd[ip]

# Make the final plot with necessary identification


plot(cd)

text(ip,iv-(max(cd)*0.05),names(iv),col="blue",cex=0.7)
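
# Optional sketch (not in the original script): refit the model without the
# high-influence points identified above and compare the coefficients;
# m.lm2 is an illustrative name (assumes at least one point exceeded 0.04).
m.lm2<-lm(endurance~age,data=dt[-ip,])
print(rbind(original=coef(m.lm),without_influential=coef(m.lm2)))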

# Draw scatterplot matrix using ggplot


library(ggplot2)

# GGally is a package that is based on ggplot2 and enhances certain features
library(GGally)

# Open iris dataset


di=read.csv(choose.files())

# Draw scatterplot
ggscatmat(di,columns=1:4)

# Draw scatterplot matrix for the numerical values


ggpairs(di, columns=1:4)

# Colour the points based on Species


ggpairs(di, columns=1:4, aes(color=Species))


# Simple preprocessing of data using library caret

# load library
library(caret)

# Open the dataset and investigate


di<-read.csv(choose.files(),header=T)
head(di)
str(di)

# Partition the data


set.seed(34)
rec=createDataPartition(y=di$Species,p=0.7,list=F)
dit=di[rec,]
div=di[-rec,]
# Scale the data
# preprocessing options in caret:
# 'center': subtract the mean from values.
# 'scale': divide values by the standard deviation.
# 'range': normalize values to [0, 1].
# In case of missing values, one can use knn to impute ("knnImpute")
pps=preProcess(dit,method=c("scale","center"))
train=predict(pps,dit)
val=predict(pps,div)
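
# Optional sketch of the "knnImpute" option mentioned above (not in the original
# script): impute missing values from the k nearest neighbours; k = 5 is an
# assumption, dit[,1:4] assumes the four numeric iris columns, and the RANN
# package must be installed. knnImpute also centres and scales as a side effect.
ppimp=preProcess(dit[,1:4],method="knnImpute",k=5)
trainimp=predict(ppimp,dit[,1:4])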

# One can also create dummy variables using caret


# Try to create dummies for "Species"
dmy=dummyVars("~.",data=di,fullRank = T)

# fullRank = TRUE will create n-1 dummies, while FALSE will create n dummies
dnnew=data.frame(predict(dmy,newdata=di))

# The outcome is a matrix, hence it needs to be coerced to a data frame

# Example for Feature Selection

# Read file - Cement.txt


d<-read.table(choose.files(),header=T)
names(d)

# Using library caret


library(caret)

# Use Recursive Feature Elimination (Backward Selection)


set.seed(10)
ctrl=rfeControl(functions=lmFuncs,method="cv",verbose=F)
lmfinal=rfe(d[,-11],d[,11],sizes=c(5:10),rfeControl = ctrl)
lmfinal

# Plot the result to understand the selection


plot(lmfinal)
plot(lmfinal, metric = "MAE")

# Using library leaps


library(leaps)

# method can be "Cp","adjr2", or "r2"


v=leaps(d[,-1],d[,1],method = "Cp")

# Use regsubsets (method may be exhaustive, forward, backward,seqrep)


v1=regsubsets(y~.,d,nbest=2,nvmax=12,method="exhaustive")
n=summary(v1)
n

# Check the result


n$which
n$adjr2
n$cp
n$r2
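
# Optional sketch (not in the original script): pick the row of the regsubsets
# summary with the best criterion value, e.g. maximum adjusted R-squared or
# minimum Cp; best.adjr2 and best.cp are illustrative names.
best.adjr2=which.max(n$adjr2)
best.cp=which.min(n$cp)
n$which[best.adjr2,]
n$which[best.cp,]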

# Step Wise Selection using caret


cpm=train(y~.,d,method="glmStepAIC")

# Code to remove rows based on row names

# Load Cereals Dataset


d=read.csv(choose.files())

# Create data partitions


library(caret)
set.seed(55)
rec=createDataPartition(y=d$rating,p=0.7,list=F)
dt=d[rec,]
dv=d[-rec,]

# We want to remove records with rownames of 5 and 7


head(dt)
rowtoremove=c(5,7)
dtn=dt[!row.names(dt) %in% rowtoremove,]

# Check if those two records have been removed


head(dtn)

