Sentiment Analysis
# Importing libraries
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
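# The data-loading cell is missing from this listing; a minimal sketch,
# assuming the Amazon Alexa reviews dataset in TSV form (the filename
# 'amazon_alexa.tsv' is an assumption):
data = pd.read_csv('amazon_alexa.tsv', sep='\t')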
# Column names and null counts
data.columns
data.isnull().sum()
# Add a column with the character length of each review
data['length'] = data['verified_reviews'].apply(len)
data.head()
data.dtypes
len(data)
print(f" PercentageDistribution: \n
{round(data['rating'].value_counts()/data.shape[0]*100,2)}")
# Rating distribution pie chart
fig = plt.figure(figsize=(7,7))
wp = {'linewidth': 1, "edgecolor": 'black'}
tags = data['rating'].value_counts()/data.shape[0]
explode = (0.1, 0.1, 0.1, 0.1, 0.1)
tags.plot(kind='pie', autopct='%1.1f%%', shadow=True, startangle=90,
          wedgeprops=wp, explode=explode, label='Percentage wise distribution of rating')
from io import BytesIO
graph = BytesIO()
fig.savefig(graph, format="png")
data['feedback'].value_counts().plot.bar(color = 'blue')
plt.title('Feedback distribution count')
plt.xlabel('Feedback')
plt.ylabel('Count')
plt.show()
# Feedback distribution pie chart
fig = plt.figure(figsize=(7,7))
colors = ('red', 'green')
wp = {'linewidth': 1, "edgecolor": 'black'}
tags = data['feedback'].value_counts()/data.shape[0]
explode = (0.1, 0.1)
tags.plot(kind='pie', autopct="%1.1f%%", shadow=True, colors=colors, startangle=90,
          wedgeprops=wp, explode=explode, label='Percentage wise distribution of feedback')
plt.show()
# Rating counts for negative (0) and positive (1) feedback
data[data['feedback'] == 0]['rating'].value_counts()
data[data['feedback'] == 1]['rating'].value_counts()
data['variation'].value_counts().plot.bar(color = 'orange')
plt.title('Variation distribution count')
plt.xlabel('Variation')
plt.ylabel('Count')
plt.show()
data.groupby('variation')['rating'].mean()
data.groupby('variation')['rating'].mean().sort_values().plot.bar(color = 'brown',
figsize=(11, 6))
plt.title("Mean rating according to variation")
plt.xlabel('Variation')
plt.ylabel('Mean rating')
plt.show()
data['length'].describe()
sns.histplot(data['length'], color='blue').set(title='Distribution of length of review')
sns.histplot(data[data['feedback']==0]['length'], color='red').set(title='Distribution of length of review if feedback = 0')
sns.histplot(data[data['feedback']==1]['length'], color='green').set(title='Distribution of length of review if feedback = 1')
# Build a cleaned, stemmed corpus from the raw reviews
corpus = []
stemmer = PorterStemmer()
for i in range(0, data.shape[0]):
    review = re.sub('[^a-zA-Z]', ' ', data.iloc[i]['verified_reviews'])
    review = review.lower().split()
    review = [stemmer.stem(word) for word in review if word not in STOPWORDS]
    review = ' '.join(review)
    corpus.append(review)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)
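# The cell that builds the feature matrix and train/test split is missing;
# a minimal sketch, assuming 'feedback' is the target and a 30% test split
# (test_size and random_state are assumptions):
from sklearn.model_selection import train_test_split
X = cv.fit_transform(corpus).toarray()
y = data['feedback'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=15, stratify=y)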
# Directory for saving trained models
import os
import pickle
os.makedirs('Models', exist_ok=True)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scl = scaler.fit_transform(X_train)
X_test_scl = scaler.transform(X_test)
# Random Forest
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier()
model_rf.fit(X_train_scl, y_train)
y_preds = model_rf.predict(X_test_scl)
# Confusion Matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(y_test, y_preds)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model_rf.classes_)
cm_display.plot()
plt.show()
# Hyperparameter grid for tuning the random forest
params = {
    'bootstrap': [True],
    'max_depth': [80, 100],
    'min_samples_split': [8, 12],
    'n_estimators': [100, 300]
}
from sklearn.model_selection import StratifiedKFold
cv_object = StratifiedKFold(n_splits=2)
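# The grid-search cell itself is missing; a minimal sketch of how the
# parameter grid and CV splitter above would typically be wired to the
# random forest (the scoring metric is an assumption):
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator=model_rf, param_grid=params,
                           cv=cv_object, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scl, y_train)
print(grid_search.best_params_)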
# XGBoost
from xgboost import XGBClassifier
model_xgb = XGBClassifier()
model_xgb.fit(X_train_scl, y_train)
y_preds = model_xgb.predict(X_test_scl)
# Confusion Matrix
cm = confusion_matrix(y_test, y_preds)
print(cm)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model_xgb.classes_)
cm_display.plot()
plt.show()
# Saving the XGBoost classifier
with open('Models/model_xgb.pkl', 'wb') as f:
    pickle.dump(model_xgb, f)
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
model_dt = DecisionTreeClassifier()
model_dt.fit(X_train_scl, y_train)
y_preds = model_dt.predict(X_test_scl)
# Confusion Matrix
cm = confusion_matrix(y_test, y_preds)
print(cm)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model_dt.classes_)
cm_display.plot()
plt.show()
#SVM
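# The cell that trains the SVM is missing here; a minimal sketch, assuming
# scikit-learn's SVC with default settings (kernel choice is an assumption):
from sklearn.svm import SVC
model_svm = SVC()
model_svm.fit(X_train_scl, y_train)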
y_preds = model_svm.predict(X_test_scl)
# Confusion Matrix
cm = confusion_matrix(y_test, y_preds)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm,
display_labels=model_svm.classes_)
cm_display.plot()
plt.show()
#Kneighbors
#Logistic Regression
#AdaBoost
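# The cells for the KNeighbors, Logistic Regression, and AdaBoost models are
# missing; a minimal sketch following the same fit/predict pattern as the
# classifiers above (default hyperparameters are an assumption):
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

model_knn = KNeighborsClassifier().fit(X_train_scl, y_train)
model_lr = LogisticRegression().fit(X_train_scl, y_train)
model_ada = AdaBoostClassifier().fit(X_train_scl, y_train)

# The 'estimators' list consumed by the stacking classifier below is also
# undefined in this listing; a plausible reconstruction from the models
# trained above (the exact base-learner set is an assumption):
estimators = [('rf', model_rf), ('xgb', model_xgb), ('knn', model_knn)]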
# Stacking classifier over the base models
from sklearn.ensemble import StackingClassifier
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=DecisionTreeClassifier()
)
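# The stacking model is never fit or used for prediction in this listing;
# assuming the same train/test flow as the other models:
stacking_clf.fit(X_train_scl, y_train)
y_preds = stacking_clf.predict(X_test_scl)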
# Confusion Matrix
cm = confusion_matrix(y_test, y_preds)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm,
display_labels=stacking_clf.classes_)
cm_display.plot()
plt.show()