Sample

This document defines classes to load, preprocess, tokenize, and build a bidirectional LSTM model for text classification. It loads drug-review data and preprocesses the text by removing punctuation, stopwords, and frequent/rare words and by stemming; it then encodes labels and pads sequences. Finally, it builds a bidirectional LSTM model, compiles and fits it on the training data, evaluates it on the test data, and saves the model and tokenizer.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import string
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# nltk.download('stopwords')  # uncomment on first run to fetch the stopword list

import warnings
warnings.filterwarnings("ignore")

path1 ="/content/drugsComTest_raw.tsv"
class DataFrame_Loader():

def __init__(self,error_bad_lines,sep):
self.error_bad_lines = error_bad_lines
self.sep = sep

print("Loadind DataFrame")

def load_data_files(self,path1):
dftrain = pd.read_csv(path1,error_bad_lines=True,sep='\t')
return dftrain

load = DataFrame_Loader(True,'\t')
df = load.load_data_files(path1)
df.head()
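
For orientation: the drugsCom TSV from the UCI Drug Review dataset typically carries columns such as drugName, condition, review, rating, date, and usefulCount; only review and rating are used below. Assuming those columns are present, a quick look at just those two:

df[['review', 'rating']].head()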

class DataFrame_Preprocessor():

    def __init__(self, n_rare_words):
        self.n_rare_words = n_rare_words
        print("Preprocessor object created")

    def __remove_punctuation(self, text):
        """Custom function to remove the punctuation."""
        PUNCT_TO_REMOVE = string.punctuation
        return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

    def __remove_stopwords(self, text):
        """Custom function to remove the stopwords."""
        STOPWORDS = set(stopwords.words('english'))
        return " ".join([word for word in str(text).split()
                         if word not in STOPWORDS])

    def Get_Most_Common(self, data):
        """Count every word in the review column and return the ten
        most frequent (word, count) pairs."""
        cnt = Counter()
        for text in data["review"].values:
            for word in text.split():
                cnt[word] += 1
        return cnt.most_common(10)

    def __remove_freqwords(self, text):
        """Custom function to remove the frequent words; relies on the
        module-level `count` returned by Get_Most_Common below."""
        FREQWORDS = set([w for (w, wc) in count])
        return " ".join([word for word in str(text).split()
                         if word not in FREQWORDS])

    def __remove_rarewords(self, text):
        """Custom function to remove the rare words, taken from the tail
        of the same `count` list."""
        RAREWORDS = set([w for (w, wc) in count[:-self.n_rare_words - 1:-1]])
        return " ".join([word for word in str(text).split()
                         if word not in RAREWORDS])

    def __stem_words(self, text):
        """Reduce each word to its Porter stem."""
        stemmer = PorterStemmer()
        return " ".join([stemmer.stem(word) for word in text.split()])

    def Text_Preprocessing(self, data):
        try:
            data = data[['review', 'rating']]
            data["review"] = data["review"].apply(self.__remove_punctuation)
            data["review"] = data["review"].apply(self.__remove_stopwords)
            data["review"] = data["review"].apply(self.__remove_freqwords)
            data["review"] = data["review"].apply(self.__remove_rarewords)
            data["review"] = data["review"].apply(self.__stem_words)
            # Drop non-ASCII characters, then strip digits from the reviews.
            data = data.astype(str).apply(
                lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))
            data['review'] = data['review'].str.replace(r'\d+', '', regex=True)
            return data
        except ValueError as ve:
            raise ValueError("Error in Text Preprocessing {}".format(ve))

preprocess = DataFrame_Preprocessor(10)
count = preprocess.Get_Most_Common(df)
count
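
Here `count` is a list of (word, frequency) pairs, most frequent first. A toy illustration of the same Counter pattern (made-up strings, not the actual reviews):

toy = Counter()
for sentence in ["good drug", "bad drug", "good effect"]:
    for word in sentence.split():
        toy[word] += 1
print(toy.most_common(2))  # [('good', 2), ('drug', 2)]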
from sklearn.model_selection import train_test_split

class DataFrame_Splitter():
    """Convert ratings to binary sentiment labels and split train/test."""

    def __init__(self):
        print("Splitter object created")

    def preprocess(self, data):
        data['rating'] = pd.to_numeric(data['rating'], errors='coerce')
        # Ratings above 6 are labelled positive (1), the rest negative (0).
        data['Sentiment'] = np.where(data['rating'] > 6, 1, 0)
        data = data[['review', 'Sentiment']]
        x = data['review']
        y = data['Sentiment']
        return train_test_split(x, y, test_size=0.1, random_state=0)

PR = DataFrame_Splitter()
X_train, X_test, y_train, y_test = PR.preprocess(df)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
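
Since Sentiment comes from thresholding rating > 6, the two classes may be imbalanced; a quick check worth running (not in the original script):

print(y_train.value_counts(normalize=True))  # fraction of each sentiment label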
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing import text
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

class Keras_Tokenizer():

    def __init__(self, max_features):
        self.max_features = max_features
        print("Tokenizer object created")

    def __label_encoding(self, y_train):
        """
        Encode the given list of class labels.
        :y_train_enc: returns list of encoded classes
        :labels: actual class labels
        """
        lbl_enc = LabelEncoder()
        y_train_enc = lbl_enc.fit_transform(y_train)
        labels = lbl_enc.classes_
        return y_train_enc, labels

    def __word_embedding(self, train, test, max_features, max_len=200):
        try:
            # Keras Tokenizer: keep only the max_features most frequent words.
            tokenizer = text.Tokenizer(num_words=max_features)
            tokenizer.fit_on_texts(train)
            train_data = tokenizer.texts_to_sequences(train)
            test_data = tokenizer.texts_to_sequences(test)

            # Vocabulary size (+1 for the reserved padding index 0).
            vocab_size = len(tokenizer.word_index) + 1

            # Pad every sequence to max_len.
            x_train = sequence.pad_sequences(train_data, maxlen=max_len,
                                             padding='post')
            x_test = sequence.pad_sequences(test_data, maxlen=max_len,
                                            padding='post')

            # Return the tokenizer, padded train/test data and vocab size.
            return tokenizer, x_train, x_test, vocab_size
        except ValueError as ve:
            raise ValueError("Error in word embedding {}".format(ve))

    def __feature_extraction(self, train, test):
        # Alternative feature extraction using TF-IDF vectorization
        # (defined but not used in this script).
        tfidf_vectorizer = TfidfVectorizer(max_features=self.max_features)
        x_train_tfidf = tfidf_vectorizer.fit_transform(train)
        x_test_tfidf = tfidf_vectorizer.transform(test)
        return x_train_tfidf, x_test_tfidf

    def preprocess(self, X_train, X_test):
        return self.__word_embedding(X_train, X_test, self.max_features)

KT = Keras_Tokenizer(6000)

tokenizer, x_pad_train, x_pad_valid, vocab_size = KT.preprocess(X_train, X_test)

x_pad_train.shape, x_pad_valid.shape, vocab_size
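
As a quick sanity check (not part of the original script), a padded row can be mapped back to its surviving words with the tokenizer's index_word dictionary; index 0 is the padding token:

# Decode the first padded training example back into (preprocessed) words.
decoded = " ".join(tokenizer.index_word[i] for i in x_pad_train[0] if i != 0)
print(decoded)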

from tensorflow import keras


class RNN_Bidirectional_lstm_Build_Pack():

    def __init__(self,
                 input_length,
                 output_length,
                 vocab_size,
                 optimizer,
                 loss,
                 metrics,
                 batch_size,
                 epochs,
                 verbose):

        self.input_length = input_length
        self.output_length = output_length
        self.vocab_size = vocab_size
        self.optimizer = optimizer
        self.loss = loss
        self.metrics = metrics
        self.batch_size = batch_size
        self.epochs = epochs
        self.verbose = verbose

        print("Model builder object created")

    def build_rnn(self, vocab_size, output_dim, input_dim):

        model = Sequential([
            keras.layers.Embedding(vocab_size,
                                   output_dim=self.output_length,
                                   input_length=self.input_length),
            keras.layers.BatchNormalization(),
            keras.layers.Bidirectional(keras.layers.LSTM(256, return_sequences=True)),
            keras.layers.GlobalMaxPool1D(),
            keras.layers.Dense(225, activation='relu'),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(150, activation='relu'),
            keras.layers.Dropout(0.2),
            keras.layers.Dense(95, activation='relu'),
            keras.layers.Dropout(0.2),
            keras.layers.Dense(64, activation='relu'),
            keras.layers.Dropout(0.1),
            keras.layers.Dense(34, activation='relu'),
            keras.layers.Dropout(0.1),
            keras.layers.Dense(32, activation='relu'),
            # One sigmoid unit for the binary sentiment output.
            keras.layers.Dense(output_dim, activation='sigmoid')
        ])

        return model

    def Compile_and_Fit(self, rnn_model):

        try:
            rnn_model.compile(optimizer=self.optimizer, loss=self.loss,
                              metrics=self.metrics)

            rnn_model.fit(x_pad_train,
                          y_train,
                          batch_size=self.batch_size,
                          epochs=self.epochs,
                          verbose=self.verbose)  # verbose = amount of information displayed

            score = rnn_model.evaluate(x_pad_valid, y_test, verbose=1)
            print("Loss: %.3f Accuracy: %.3f" % (score[0], score[1]))

            return rnn_model

        except ValueError as Model_Error:
            raise ValueError("Model Compiling Error {}".format(Model_Error))


Rnn_Model = RNN_Bidirectional_lstm_Build_Pack(200, 200, 33068, 'adam',
                                              'binary_crossentropy',
                                              ['acc'], 256, 10, 1)

rnn_model = Rnn_Model.build_rnn(vocab_size, 1, 200)
rnn_model.summary()
rnn_model = Rnn_Model.Compile_and_Fit(rnn_model)
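
A possible variant (a sketch, not part of the original script): hold out a validation slice during fitting and stop early when validation loss stops improving.

# Hypothetical alternative to the plain fit above, with early stopping.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=2,
                                           restore_best_weights=True)
rnn_model.fit(x_pad_train, y_train,
              validation_split=0.1,
              batch_size=256, epochs=10,
              callbacks=[early_stop])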
y_preds = rnn_model.predict(x_pad_valid)

print("y_preds Shape ::", y_preds.shape)

# Threshold the sigmoid outputs at 0.5 to get hard 0/1 labels.
y_preds = (y_preds > 0.5).astype('int32')

pred_df = pd.DataFrame(y_preds, columns=['pred'])

print(pred_df.shape)
pred_df.head()

pred_df.value_counts()

from sklearn import metrics

print(metrics.accuracy_score(y_test, pred_df))

print(metrics.confusion_matrix(y_test, pred_df))

print(metrics.classification_report(y_test, pred_df))

rnn_model.save("rnn_model.h5")

import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
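
To reuse the saved artifacts later, a minimal reload-and-predict sketch (assuming the same max_len of 200 used above; the sample review text is made up):

from tensorflow import keras
from tensorflow.keras.preprocessing import sequence
import pickle

reloaded_model = keras.models.load_model("rnn_model.h5")
with open('tokenizer.pickle', 'rb') as handle:
    reloaded_tokenizer = pickle.load(handle)

sample = ["this medicine worked well with no side effects"]  # hypothetical input
seq = reloaded_tokenizer.texts_to_sequences(sample)
padded = sequence.pad_sequences(seq, maxlen=200, padding='post')
print(reloaded_model.predict(padded))  # sigmoid score; > 0.5 means positive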
