Sample

This document defines classes to load, preprocess, tokenize, and build a bidirectional LSTM model for text classification. It loads drug-review data and preprocesses the text by removing punctuation, stopwords, and frequent/rare words and by stemming; it then encodes labels and pads sequences. Finally, it builds a bidirectional LSTM model, compiles and fits it on the training data, evaluates it on the test data, and saves the model and tokenizer.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import string
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# nltk.download('stopwords')  # uncomment on first run to fetch the stopword list

import warnings
warnings.filterwarnings("ignore")

path1 ="/content/drugsComTest_raw.tsv"
class DataFrame_Loader():

def __init__(self,error_bad_lines,sep):
self.error_bad_lines = error_bad_lines
self.sep = sep

print("Loadind DataFrame")

def load_data_files(self,path1):
dftrain = pd.read_csv(path1,error_bad_lines=True,sep='\t')
return dftrain

load = DataFrame_Loader(True,'\t')
df = load.load_data_files(path1)
df.head()
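
For orientation: the drugsCom TSV from the UCI Drug Review dataset typically carries columns such as drugName, condition, review, rating, date, and usefulCount; only review and rating are used below. Assuming those columns are present, a quick look at just those two:

df[['review', 'rating']].head()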

class DataFrame_Preprocessor():

    def __init__(self, n_rare_words):
        self.n_rare_words = n_rare_words
        print("Preprocessor object created")

    def __remove_punctuation(self, text):
        """Custom function to remove the punctuation."""
        PUNCT_TO_REMOVE = string.punctuation
        return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

    def __remove_stopwords(self, text):
        """Custom function to remove the stopwords."""
        STOPWORDS = set(stopwords.words('english'))
        return " ".join([word for word in str(text).split()
                         if word not in STOPWORDS])

    def Get_Most_Common(self, data):
        """Count every word in the review column and return the ten
        most frequent (word, count) pairs."""
        cnt = Counter()
        for text in data["review"].values:
            for word in text.split():
                cnt[word] += 1
        return cnt.most_common(10)

    def __remove_freqwords(self, text):
        """Custom function to remove the frequent words; relies on the
        module-level `count` returned by Get_Most_Common below."""
        FREQWORDS = set([w for (w, wc) in count])
        return " ".join([word for word in str(text).split()
                         if word not in FREQWORDS])

    def __remove_rarewords(self, text):
        """Custom function to remove the rare words, taken from the tail
        of the same `count` list."""
        RAREWORDS = set([w for (w, wc) in count[:-self.n_rare_words - 1:-1]])
        return " ".join([word for word in str(text).split()
                         if word not in RAREWORDS])

    def __stem_words(self, text):
        """Reduce each word to its Porter stem."""
        stemmer = PorterStemmer()
        return " ".join([stemmer.stem(word) for word in text.split()])

    def Text_Preprocessing(self, data):
        try:
            data = data[['review', 'rating']]
            data["review"] = data["review"].apply(self.__remove_punctuation)
            data["review"] = data["review"].apply(self.__remove_stopwords)
            data["review"] = data["review"].apply(self.__remove_freqwords)
            data["review"] = data["review"].apply(self.__remove_rarewords)
            data["review"] = data["review"].apply(self.__stem_words)
            # Drop non-ASCII characters, then strip digits from the reviews.
            data = data.astype(str).apply(
                lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))
            data['review'] = data['review'].str.replace(r'\d+', '', regex=True)
            return data
        except ValueError as ve:
            raise ValueError("Error in Text Preprocessing {}".format(ve))

preprocess = DataFrame_Preprocessor(10)
count = preprocess.Get_Most_Common(df)
count
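
Here `count` is a list of (word, frequency) pairs, most frequent first. A toy illustration of the same Counter pattern (made-up strings, not the actual reviews):

toy = Counter()
for sentence in ["good drug", "bad drug", "good effect"]:
    for word in sentence.split():
        toy[word] += 1
print(toy.most_common(2))  # [('good', 2), ('drug', 2)]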
from sklearn.model_selection import train_test_split

class DataFrame_Splitter():
    """Convert ratings to binary sentiment labels and split train/test."""

    def __init__(self):
        print("Splitter object created")

    def preprocess(self, data):
        data['rating'] = pd.to_numeric(data['rating'], errors='coerce')
        # Ratings above 6 are labelled positive (1), the rest negative (0).
        data['Sentiment'] = np.where(data['rating'] > 6, 1, 0)
        data = data[['review', 'Sentiment']]
        x = data['review']
        y = data['Sentiment']
        return train_test_split(x, y, test_size=0.1, random_state=0)

PR = DataFrame_Splitter()
X_train, X_test, y_train, y_test = PR.preprocess(df)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
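
Since Sentiment comes from thresholding rating > 6, the two classes may be imbalanced; a quick check worth running (not in the original script):

print(y_train.value_counts(normalize=True))  # fraction of each sentiment label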
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing import text
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

class Keras_Tokenizer():

    def __init__(self, max_features):
        self.max_features = max_features
        print("Tokenizer object created")

    def __label_encoding(self, y_train):
        """
        Encode the given list of class labels.
        :y_train_enc: returns list of encoded classes
        :labels: actual class labels
        """
        lbl_enc = LabelEncoder()
        y_train_enc = lbl_enc.fit_transform(y_train)
        labels = lbl_enc.classes_
        return y_train_enc, labels

    def __word_embedding(self, train, test, max_features, max_len=200):
        try:
            # Keras Tokenizer: keep only the max_features most frequent words.
            tokenizer = text.Tokenizer(num_words=max_features)
            tokenizer.fit_on_texts(train)
            train_data = tokenizer.texts_to_sequences(train)
            test_data = tokenizer.texts_to_sequences(test)

            # Vocabulary size (+1 for the reserved padding index 0).
            vocab_size = len(tokenizer.word_index) + 1

            # Pad every sequence to max_len.
            x_train = sequence.pad_sequences(train_data, maxlen=max_len,
                                             padding='post')
            x_test = sequence.pad_sequences(test_data, maxlen=max_len,
                                            padding='post')

            # Return the tokenizer, padded train/test data and vocab size.
            return tokenizer, x_train, x_test, vocab_size
        except ValueError as ve:
            raise ValueError("Error in word embedding {}".format(ve))

    def __feature_extraction(self, train, test):
        # Alternative feature extraction using TF-IDF vectorization
        # (defined but not used in this script).
        tfidf_vectorizer = TfidfVectorizer(max_features=self.max_features)
        x_train_tfidf = tfidf_vectorizer.fit_transform(train)
        x_test_tfidf = tfidf_vectorizer.transform(test)
        return x_train_tfidf, x_test_tfidf

    def preprocess(self, X_train, X_test):
        return self.__word_embedding(X_train, X_test, self.max_features)

KT = Keras_Tokenizer(6000)

tokenizer, x_pad_train, x_pad_valid, vocab_size = KT.preprocess(X_train, X_test)

x_pad_train.shape, x_pad_valid.shape, vocab_size
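
As a quick sanity check (not part of the original script), a padded row can be mapped back to its surviving words with the tokenizer's index_word dictionary; index 0 is the padding token:

# Decode the first padded training example back into (preprocessed) words.
decoded = " ".join(tokenizer.index_word[i] for i in x_pad_train[0] if i != 0)
print(decoded)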

from tensorflow import keras


class RNN_Bidirectional_lstm_Build_Pack():

    def __init__(self,
                 input_length,
                 output_length,
                 vocab_size,
                 optimizer,
                 loss,
                 metrics,
                 batch_size,
                 epochs,
                 verbose):

        self.input_length = input_length
        self.output_length = output_length
        self.vocab_size = vocab_size
        self.optimizer = optimizer
        self.loss = loss
        self.metrics = metrics
        self.batch_size = batch_size
        self.epochs = epochs
        self.verbose = verbose

        print("Model builder object created")

    def build_rnn(self, vocab_size, output_dim, input_dim):

        model = Sequential([
            keras.layers.Embedding(vocab_size,
                                   output_dim=self.output_length,
                                   input_length=self.input_length),
            keras.layers.BatchNormalization(),
            keras.layers.Bidirectional(keras.layers.LSTM(256, return_sequences=True)),
            keras.layers.GlobalMaxPool1D(),
            keras.layers.Dense(225, activation='relu'),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(150, activation='relu'),
            keras.layers.Dropout(0.2),
            keras.layers.Dense(95, activation='relu'),
            keras.layers.Dropout(0.2),
            keras.layers.Dense(64, activation='relu'),
            keras.layers.Dropout(0.1),
            keras.layers.Dense(34, activation='relu'),
            keras.layers.Dropout(0.1),
            keras.layers.Dense(32, activation='relu'),
            # One sigmoid unit for the binary sentiment output.
            keras.layers.Dense(output_dim, activation='sigmoid')
        ])

        return model

    def Compile_and_Fit(self, rnn_model):

        try:
            rnn_model.compile(optimizer=self.optimizer, loss=self.loss,
                              metrics=self.metrics)

            rnn_model.fit(x_pad_train,
                          y_train,
                          batch_size=self.batch_size,
                          epochs=self.epochs,
                          verbose=self.verbose)  # verbose = amount of information displayed

            score = rnn_model.evaluate(x_pad_valid, y_test, verbose=1)
            print("Loss: %.3f Accuracy: %.3f" % (score[0], score[1]))

            return rnn_model

        except ValueError as Model_Error:
            raise ValueError("Model Compiling Error {}".format(Model_Error))


Rnn_Model = RNN_Bidirectional_lstm_Build_Pack(200, 200, 33068, 'adam',
                                              'binary_crossentropy',
                                              ['acc'], 256, 10, 1)

rnn_model = Rnn_Model.build_rnn(vocab_size, 1, 200)
rnn_model.summary()
rnn_model = Rnn_Model.Compile_and_Fit(rnn_model)
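
A possible variant (a sketch, not part of the original script): hold out a validation slice during fitting and stop early when validation loss stops improving.

# Hypothetical alternative to the plain fit above, with early stopping.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=2,
                                           restore_best_weights=True)
rnn_model.fit(x_pad_train, y_train,
              validation_split=0.1,
              batch_size=256, epochs=10,
              callbacks=[early_stop])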
y_preds = rnn_model.predict(x_pad_valid)

print("y_preds Shape ::", y_preds.shape)

# Threshold the sigmoid outputs at 0.5 to get hard 0/1 labels.
y_preds = (y_preds > 0.5).astype('int32')

pred_df = pd.DataFrame(y_preds, columns=['pred'])

print(pred_df.shape)
pred_df.head()

pred_df.value_counts()

from sklearn import metrics

print(metrics.accuracy_score(y_test, pred_df))

print(metrics.confusion_matrix(y_test, pred_df))

print(metrics.classification_report(y_test, pred_df))

rnn_model.save("rnn_model.h5")

import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
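
To reuse the saved artifacts later, a minimal reload-and-predict sketch (assuming the same max_len of 200 used above; the sample review text is made up):

from tensorflow import keras
from tensorflow.keras.preprocessing import sequence
import pickle

reloaded_model = keras.models.load_model("rnn_model.h5")
with open('tokenizer.pickle', 'rb') as handle:
    reloaded_tokenizer = pickle.load(handle)

sample = ["this medicine worked well with no side effects"]  # hypothetical input
seq = reloaded_tokenizer.texts_to_sequences(sample)
padded = sequence.pad_sequences(seq, maxlen=200, padding='post')
print(reloaded_model.predict(padded))  # sigmoid score; > 0.5 means positive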
