0% found this document useful (0 votes)
15 views11 pages

Sentimental

Uploaded by

trishhh3174
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
15 views11 pages

Sentimental

Uploaded by

trishhh3174
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
You are on page 1/ 11

#Installation

!pip install wordcloud


!pip install matplotlib
!pip install panda
!pip install xgboost

#importing Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))

from sklearn.model_selection import train_test_split


from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from wordcloud import WordCloud
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import pickle
import re

#Load the data

# Tab-separated review file. quoting=3 is csv.QUOTE_NONE, so quote characters
# inside the text are read as ordinary characters.
data = pd.read_csv(
    r"C:\Users\ASUS\Downloads\dataSetNew.tsv",
    delimiter='\t',
    quoting=3,
)

print(f"Dataset shape {data.shape}")

#column names

print(f"Features name: {data.columns.values}")

# Per-column count of missing values (displayed only when run as a notebook cell)
data.isnull().sum()

#finding null record

data[data['verified_reviews'].isna()]
# Drop every row that has a missing value in any column
data.dropna(inplace=True)

print(f" Dataset shape after dropping null values : {data.shape}")

# Character length of each review, stored as a new column
data['length'] = data['verified_reviews'].apply(len)

data.head()

# Sanity check on one row: the stored 'length' should equal len() of the text
print(f"'verified_reviews' column value: {data.iloc[10]['verified_reviews']}")  #original value
print(f" Length of the review : {len(data.iloc[10]['verified_reviews'])}")  #length of the variable using len()
print(f"'length' column value: {data.iloc[10]['length']}")  #value of the column length

data.dtypes

len(data)

print(f"Rating value count: \n{data['rating'].value_counts()}")

data.dtypes

#Rating distribution count

# Bar chart: number of reviews per rating value
data['rating'].value_counts().plot.bar(color='red')
plt.title('Rating distribution count')
plt.xlabel('Ratngs')
plt.ylabel('Count')
plt.show()

# Share of each rating as a percentage of all rows, rounded to 2 decimals
print(
    f" PercentageDistribution: \n{round(data['rating'].value_counts()/data.shape[0]*100,2)}"
)

# Pie chart of the same distribution
fig = plt.figure(figsize=(7,7))

colors = ('red', 'green', 'blue','orange','yellow')

wp = {'linewidth':1, "edgecolor":'black'}

tags = data['rating'].value_counts()/data.shape[0]

# One offset per wedge; assumes five distinct rating values -- TODO confirm
explode=(0.1,0.1,0.1,0.1,0.1)

tags.plot(kind='pie', autopct="%1.1f%%", shadow=True, colors=colors, startangle=90,
          wedgeprops=wp, explode=explode, label='Percentage wise distrubution of rating')

from io import BytesIO

# Render the figure to an in-memory PNG buffer
graph = BytesIO()

fig.savefig(graph, format="png")

# Show the pie only after it has been saved, so that later charts are not
# drawn on top of this figure when the file runs as one script.
plt.show()

print(f"Feedback value count: \n{data['feedback'].value_counts()}")

# Example review text for each feedback value (second row of each subset)
review_0 = data[data['feedback'] == 0].iloc[1]['verified_reviews']
print(review_0)

review_1 = data[data['feedback'] == 1].iloc[1]['verified_reviews']
print(review_1)

# Bar chart: number of reviews per feedback value
data['feedback'].value_counts().plot.bar(color = 'blue')
plt.title('Feedback distribution count')
plt.xlabel('Feedback')
plt.ylabel('Count')
plt.show()

# The "\n" escape must stay inside a single-line literal; the wrapped form
# split it into a line continuation followed by a literal "n".
print(
    f"Feedback value count - percentage distribution: \n{round(data['feedback'].value_counts()/data.shape[0]*100,2)}"
)

# Pie chart of the feedback distribution
fig = plt.figure(figsize=(7,7))
colors = ('red', 'green')
wp = {'linewidth':1, "edgecolor":'black'}
tags = data['feedback'].value_counts()/data.shape[0]
# One offset per wedge; assumes two distinct feedback values -- TODO confirm
explode=(0.1,0.1)
tags.plot(kind='pie', autopct="%1.1f%%", shadow=True, colors=colors, startangle=90,
          wedgeprops=wp, explode=explode, label='Percentage wise distrubution of feedback')
# Finish this figure so the next bar chart is not drawn onto the pie
plt.show()

# Rating values that occur for each feedback value
data[data['feedback'] == 0]['rating'].value_counts()

data[data['feedback'] == 1]['rating'].value_counts()

print(f"Variation value count: \n{data['variation'].value_counts()}")

# Bar chart: number of reviews per product variation
data['variation'].value_counts().plot.bar(color = 'orange')
plt.title('Variation distribution count')
plt.xlabel('Variation')
plt.ylabel('Count')
plt.show()

print(
    f"Variation value count - percentage distribution: \n{round(data['variation'].value_counts()/data.shape[0]*100,2)}"
)

data.groupby('variation')['rating'].mean()

# Mean rating per variation, sorted ascending
data.groupby('variation')['rating'].mean().sort_values().plot.bar(color = 'brown', figsize=(11, 6))
plt.title("Mean rating according to variation")
plt.xlabel('Variation')
plt.ylabel('Mean rating')
plt.show()
# Summary statistics of the review length column
data['length'].describe()

# Each histogram is shown on its own; without the plt.show() calls the three
# plots would share one axes and only the last title would remain.
sns.histplot(data['length'],color='blue').set(title='Distribution of length of review ')
plt.show()

sns.histplot(data[data['feedback']==0]['length'],color='red').set(title='Distribution of length of review if feedback = 0')
plt.show()

sns.histplot(data[data['feedback']==1]['length'],color='green').set(title='Distribution of length of review if feedback = 1')
plt.show()

# Histogram of the mean rating computed for each distinct review length
data.groupby('length')['rating'].mean().plot.hist(color = 'blue', figsize=(7, 6), bins = 20)
plt.title(" Review length wise mean ratings")
plt.xlabel('ratings')
plt.ylabel('length')
plt.show()

#Preprocessing and Modelling

# Build the cleaned corpus: keep letters only, lowercase, drop stopwords,
# stem each remaining word, and re-join into one string per review.
corpus = []
stemmer = PorterStemmer()
# Iterate the column directly instead of a positional iloc lookup per row
for raw_review in data['verified_reviews']:
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    review = review.lower().split()
    review = [stemmer.stem(word) for word in review if word not in STOPWORDS]
    review = ' '.join(review)
    corpus.append(review)

# Bag-of-words vectorizer limited to 2500 features
cv = CountVectorizer(max_features = 2500)

#Storing independent and dependent variables in X and y

X = cv.fit_transform(corpus).toarray()
y = data['feedback'].values

import os
import pickle

os.makedirs('Models', exist_ok=True)

# Persist the fitted vectorizer; the context manager closes the file handle
with open('Models/countVectorizer.pkl', 'wb') as pkl_file:
    pickle.dump(cv, pkl_file)

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

# Hold out 30% of the rows for testing; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.3, random_state = 15
)

print(f"X train: {X_train.shape}")
print(f"y train: {y_train.shape}")
print(f"X test: {X_test.shape}")
print(f"y test: {y_test.shape}")

#keeping values between 0 and 1 by scaling

scaler = MinMaxScaler()

# Fit the scaler on the training split only, then apply it to the test split
X_train_scl = scaler.fit_transform(X_train)
X_test_scl = scaler.transform(X_test)

# Persist the fitted scaler; the context manager closes the file handle
with open('Models/scaler.pkl', 'wb') as pkl_file:
    pickle.dump(scaler, pkl_file)

#Random Forest

model_rf = RandomForestClassifier()
model_rf.fit(X_train_scl, y_train)

#Accuracy of the model on training and testing data

print("Training Accuracy :", model_rf.score(X_train_scl, y_train))
print("Testing Accuracy :", model_rf.score(X_test_scl, y_test))

#Predicting on the test set

y_preds = model_rf.predict(X_test_scl)

#Confusion Matrix
cm = confusion_matrix(y_test, y_preds)

cm_display = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=model_rf.classes_)
cm_display.plot()
plt.show()

# Hyper-parameter grid explored by the grid search below
params = {
    'bootstrap': [True],
    'max_depth': [80, 100],
    'min_samples_split': [8, 12],
    'n_estimators': [100, 300]
}

cv_object = StratifiedKFold(n_splits = 2)

grid_search = GridSearchCV(estimator = model_rf, param_grid = params, cv = cv_object,
                           verbose = 0, return_train_score = True)
grid_search.fit(X_train_scl, y_train.ravel())

# 10-fold cross-validation of the random forest on the training split
accuracies = cross_val_score(estimator = model_rf, X = X_train_scl, y = y_train, cv = 10)

print("Accuracy :", accuracies.mean())
print("Standard Variance :", accuracies.std())

#Getting the best parameters from the grid search

print("Best Parameter Combination : {}".format(grid_search.best_params_))

# Train/test scores averaged over every parameter combination, as percentages
print("Cross validation mean accuracy on train set : {}".format(grid_search.cv_results_['mean_train_score'].mean()*100))
print("Cross validation mean accuracy on test set : {}".format(grid_search.cv_results_['mean_test_score'].mean()*100))
# y_preds holds the predictions made by model_rf above, before the grid search
print("Accuracy score for test set :", accuracy_score(y_test, y_preds))

#XgBoost

model_xgb = XGBClassifier()
model_xgb.fit(X_train_scl, y_train)

#Accuracy of the model on training and testing data

print("Training Accuracy :", model_xgb.score(X_train_scl, y_train))
print("Testing Accuracy :", model_xgb.score(X_test_scl, y_test))

# Predict on the scaled test features: the model was fitted on X_train_scl,
# so unscaled X_test would not match what it was trained on.
y_preds = model_xgb.predict(X_test_scl)

#Confusion Matrix
cm = confusion_matrix(y_test, y_preds)
print(cm)

cm_display = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=model_xgb.classes_)
cm_display.plot()
plt.show()

#Saving the XGBoost classifier
# The context manager closes the file handle once the dump completes
with open('Models/model_xgb.pkl', 'wb') as pkl_file:
    pickle.dump(model_xgb, pkl_file)

#SVM Model (Self made)

from sklearn.svm import SVC

# Fitting scaled X_train and y_train on SVM classifier

model_svm = SVC(kernel='linear', C=1, random_state=42)
model_svm.fit(X_train_scl, y_train)

# Accuracy of the model on training and testing data

print("Training Accuracy :", model_svm.score(X_train_scl, y_train))
print("Testing Accuracy :", model_svm.score(X_test_scl, y_test))

# Predict with this model first; otherwise the confusion matrix below would be
# built from whatever y_preds an earlier model left behind.
y_preds = model_svm.predict(X_test_scl)

#Confusion Matrix
cm = confusion_matrix(y_test, y_preds)
print(cm)

cm_display = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=model_svm.classes_)
cm_display.plot()
plt.show()

#Decision Tree

model_dt = DecisionTreeClassifier()
model_dt.fit(X_train_scl, y_train)

#Accuracy of the model on training and testing data

print("Training Accuracy :", model_dt.score(X_train_scl, y_train))
print("Testing Accuracy :", model_dt.score(X_test_scl, y_test))

# Predict on the scaled test features, matching the scaled data used in fit()
y_preds = model_dt.predict(X_test_scl)

#Confusion Matrix
cm = confusion_matrix(y_test, y_preds)
print(cm)

cm_display = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=model_dt.classes_)
cm_display.plot()
plt.show()

#SVM

from sklearn.svm import SVC

# Support vector classifier with library defaults, trained on the scaled
# training split (fit() returns the estimator itself)
model_svm = SVC().fit(X_train_scl, y_train)

# Score both splits, then report them
svm_train_accuracy = model_svm.score(X_train_scl, y_train)
svm_test_accuracy = model_svm.score(X_test_scl, y_test)
print("Training Accuracy :", svm_train_accuracy)
print("Testing Accuracy :", svm_test_accuracy)

# Test-set predictions for the confusion matrix
y_preds = model_svm.predict(X_test_scl)

# Confusion Matrix
cm = confusion_matrix(y_test, y_preds)
cm_display = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=model_svm.classes_,
)
cm_display.plot()
plt.show()

#Kneighbors

from sklearn.neighbors import KNeighborsClassifier

# Nearest-neighbours classifier with library defaults, trained on the scaled
# training split (fit() returns the estimator itself)
model_knn = KNeighborsClassifier().fit(X_train_scl, y_train)

# Score both splits, then report them
knn_train_accuracy = model_knn.score(X_train_scl, y_train)
knn_test_accuracy = model_knn.score(X_test_scl, y_test)
print("Training Accuracy :", knn_train_accuracy)
print("Testing Accuracy :", knn_test_accuracy)

#Naive Bayes Model

from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes classifier trained on the scaled training split
# (fit() returns the estimator itself)
model_nb = GaussianNB().fit(X_train_scl, y_train)

# Score both splits, then report them
nb_train_accuracy = model_nb.score(X_train_scl, y_train)
nb_test_accuracy = model_nb.score(X_test_scl, y_test)
print("Training Accuracy :", nb_train_accuracy)
print("Testing Accuracy :", nb_test_accuracy)

#Logistic Regression

from sklearn.linear_model import LogisticRegression

# Logistic regression with library defaults, trained on the scaled training
# split (fit() returns the estimator itself)
model_lr = LogisticRegression().fit(X_train_scl, y_train)

# Score both splits, then report them
lr_train_accuracy = model_lr.score(X_train_scl, y_train)
lr_test_accuracy = model_lr.score(X_test_scl, y_test)
print("Training Accuracy :", lr_train_accuracy)
print("Testing Accuracy :", lr_test_accuracy)

#AdaBoost

from sklearn.ensemble import AdaBoostClassifier

# AdaBoost classifier with library defaults, trained on the scaled training
# split (fit() returns the estimator itself)
model_adaboost = AdaBoostClassifier().fit(X_train_scl, y_train)

# Score both splits, then report them
adaboost_train_accuracy = model_adaboost.score(X_train_scl, y_train)
adaboost_test_accuracy = model_adaboost.score(X_test_scl, y_test)
print("Training Accuracy :", adaboost_train_accuracy)
print("Testing Accuracy :", adaboost_test_accuracy)

# Hybrid (Combining RandomForest and Xgboost)------DONOT TRY (Construction is going on)

import os
import pickle

from sklearn.ensemble import RandomForestClassifier, StackingClassifier


def _load_pickle_or(path, fallback):
    """Return the object pickled at ``path``, or ``fallback`` if the file is missing.

    Args:
        path: Location of the pickle file to read.
        fallback: Object to return when no file exists at ``path``.

    Returns:
        The unpickled object, or ``fallback`` unchanged.
    """
    if not os.path.exists(path):
        return fallback
    with open(path, 'rb') as pkl_file:
        return pickle.load(pkl_file)


# Loading the preprocessed data and models
# No visible step of this script writes the split arrays or the random forest
# to disk, so each load falls back to the in-memory object when its file is
# absent instead of raising FileNotFoundError.
X_train_scl = _load_pickle_or('Models/X_train_scl.pkl', X_train_scl)
X_test_scl = _load_pickle_or('Models/X_test_scl.pkl', X_test_scl)
y_train = _load_pickle_or('Models/y_train.pkl', y_train)
y_test = _load_pickle_or('Models/y_test.pkl', y_test)

# Loading the trained models
model_rf = _load_pickle_or('Models/model_rf.pkl', model_rf)
model_xgb = _load_pickle_or('Models/model_xgb.pkl', model_xgb)

# Creating a stacking classifier
# Base learners are the random forest and XGBoost models; a decision tree
# combines their outputs.
estimators = [
    ('rf', model_rf),
    ('xgb', model_xgb)
]

stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=DecisionTreeClassifier()
)

# Training the stacking classifier

stacking_clf.fit(X_train_scl, y_train)

# Evaluating the stacking classifier

print("Training Accuracy :", stacking_clf.score(X_train_scl, y_train))
print("Testing Accuracy :", stacking_clf.score(X_test_scl, y_test))

# Predicting on the test set

y_preds = stacking_clf.predict(X_test_scl)

# Confusion Matrix
cm = confusion_matrix(y_test, y_preds)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm,
                                    display_labels=stacking_clf.classes_)
cm_display.plot()
plt.show()

# Saving the stacking classifier
# The context manager closes the file handle once the dump completes
with open('Models/stacking_clf.pkl', 'wb') as pkl_file:
    pickle.dump(stacking_clf, pkl_file)

You might also like

pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy