NLP Sentiment Analysis
import numpy as np
import pandas as pd
df = pd.read_csv("/content/data.csv",
                 encoding="utf-8",
                 encoding_errors="replace")
df.columns = ["text", "sentiment"]
df.head(10)
def color_sentiment(val):
    """Color the sentiment column based on the sentiment value."""
    if val == 'positive':
        color = 'lightgreen'
    elif val == 'negative':
        color = 'lightcoral'
    else:  # neutral
        color = 'lightblue'
    return f'background-color: {color}'
def clean_sentiment_data(df):
    """Basic cleaning step. (The original body is cut off in the extract; this is a
    minimal reconstruction: drop missing rows and duplicate texts.)"""
    cleaned_df = df.dropna(subset=["text", "sentiment"]).drop_duplicates(subset=["text"])
    return cleaned_df

# Example usage:
cleaned_data = clean_sentiment_data(df)
display(cleaned_data.shape)
cleaned_data.head()
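The color_sentiment helper defined above is never applied in the extract; a plausible usage with the pandas Styler (an assumption, not taken from the original) is:

# Hypothetical usage: highlight the sentiment column (Styler.map in newer pandas versions)
cleaned_data.head(10).style.applymap(color_sentiment, subset=['sentiment'])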
sentiment_counts = cleaned_data['sentiment'].value_counts(normalize=True) * 100
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(8, 6))
ax = sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette="viridis")

# Add percentage labels to the bars
for p in ax.patches:
    ax.annotate(f'{p.get_height():.1f}%',
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=12, color='black',
                xytext=(0, 5), textcoords='offset points')
plt.xlabel("Sentiment", fontsize=12)
plt.ylabel("Percentage", fontsize=12)
plt.title("Distribution of Sentiments", fontsize=14)
plt.show()
def convert_to_lowercase(df):
    """Step 1: Convert text to lowercase."""
    df['text_lower'] = df['text'].str.lower()
    return df

df = convert_to_lowercase(cleaned_data)
df.head()
import re

def remove_special_characters(df):
    """Step 2: Remove special characters and numbers."""
    df['text_clean'] = df['text_lower'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))
    return df

df = remove_special_characters(df)
df.head()
Removing Special Characters and Numbers is Important in NLP Sentiment Analysis
Removing special characters and numbers is a crucial pre-processing step: punctuation, digits and stray symbols rarely carry sentiment on their own, so stripping them reduces noise and shrinks the vocabulary the model has to learn.
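As a quick illustration of what the Step 2 regex removes (the sample string is made up):

import re

sample = "Great phone!!! Battery lasts 2 days :)"
print(re.sub(r'[^a-zA-Z\s]', '', sample))
# -> 'Great phone Battery lasts  days '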
def remove_urls(df):
    """Step 3: Remove URLs."""
    # Note: Step 2 has already stripped characters such as ':', '/' and '.', so at this
    # point the pattern mainly catches leftover 'http'/'www' fragments. Running URL
    # removal before the special-character step would be more robust.
    df['text_no_urls'] = df['text_clean'].apply(lambda x: re.sub(r'http\S+|www\.\S+', '', x))
    return df

df = remove_urls(df)
df.head()
Removing URLs is Important in NLP Sentiment Analysis
Removing URLs from text data is a key step in NLP sentiment analysis: links contribute long, essentially random tokens that carry no sentiment, inflate the vocabulary and can distort word statistics, so stripping them keeps the model focused on the words that actually express an opinion.
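Applied to raw text (before any other cleaning), the URL pattern behaves like this (sample text is made up):

import re

sample = "Loved it, full review at https://example.com/review and www.example.com"
print(re.sub(r'http\S+|www\.\S+', '', sample))
# -> 'Loved it, full review at  and '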
def remove_extra_whitespace(df):
    """Step 4: Remove extra whitespace."""
    df['text_stripped'] = df['text_no_urls'].apply(lambda x: ' '.join(x.split()))
    return df

df = remove_extra_whitespace(df)
df.head()
Importance of Removing Extra Whitespace in NLP Sentiment Analysis
Removing extra whitespace matters because the earlier cleaning steps leave behind double spaces and stray leading/trailing whitespace; collapsing them gives the tokenizer clean word boundaries and prevents otherwise identical texts from being treated as different strings.
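A one-line illustration of the whitespace normalisation (sample string is made up):

sample = "  great   value    for money  "
print(' '.join(sample.split()))
# -> 'great value for money'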
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt', quiet=True)  # tokenizer models used by word_tokenize (newer NLTK may also need 'punkt_tab')

def tokenize_text(df):
    """Step 5: Tokenization."""
    df['tokens'] = df['text_stripped'].apply(word_tokenize)
    return df

df = tokenize_text(df)
df.head()
Importance of Tokenization in NLP Sentiment Analysis
The code above defines tokenize_text(df), which creates a new tokens column by applying the word_tokenize function from the nltk library to the text_stripped column. Tokenization splits each text into individual words, the units on which the later steps operate, including part-of-speech tagging (identifying the grammatical role of each token), stopword removal and lemmatization.
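A quick look at what word_tokenize produces for a single sentence (the sentence is made up):

from nltk.tokenize import word_tokenize

print(word_tokenize("the battery life is surprisingly good"))
# -> ['the', 'battery', 'life', 'is', 'surprisingly', 'good']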
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords', quiet=True)  # stopword list used below

def remove_stopwords(df):
    """Step 6: Remove stopwords."""
    stop_words = set(stopwords.words('english'))
    df['tokens_no_stop'] = df['tokens'].apply(
        lambda x: [word for word in x if word.lower() not in stop_words])
    return df

df = remove_stopwords(df)
df.head()

Importance and Use of Removing Stopwords in NLP Sentiment Analysis:
Reduced Noise: Stopwords are common words that don't carry significant meaning in the context of sentiment analysis. Removing them helps to:
  o Improve accuracy: by focusing on the more informative words, sentiment analysis models can better identify the true sentiment expressed in the text.
  o Reduce dimensionality: removing stopwords reduces the number of features (words) that the model needs to process, which can improve model performance and efficiency.
Enhanced Interpretability:
  o The resulting analysis is more concise and easier to understand, as it focuses on the core words that convey the sentiment.
Improved Model Efficiency:
  o By removing irrelevant words, the model can be trained and run faster.
In Summary: stopword removal keeps the features that actually carry sentiment, which makes the model both easier to interpret and faster to train.

import os
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

def lemmatize_text(df):
    """Step 7: Lemmatization (after ensuring the WordNet corpus is downloaded)."""
    wordnet_path = os.path.join('corpora', 'wordnet')  # relative path to 'wordnet'
    if not any(os.path.exists(os.path.join(path, wordnet_path)) for path in nltk.data.path):
        print("WordNet corpus not found. Downloading...")
        nltk.download('wordnet')
    lemmatizer = WordNetLemmatizer()
    df['lemmatized'] = df['tokens_no_stop'].apply(
        lambda x: [lemmatizer.lemmatize(word) for word in x])
    return df

df = lemmatize_text(df)
df.head()

What is Lemmatization?
Lemmatization reduces each token to its dictionary base form (its lemma), so that inflected variants such as "movies" and "movie" are counted as the same feature. Note that NLTK's WordNetLemmatizer treats every word as a noun unless a part-of-speech tag is supplied, so verb forms such as "running" are left unchanged here.

def join_tokens(df):
    """Step 8: Join the processed tokens back into a single string.
    (Reconstructed: the original definition is not shown in the extract, and the
    output column name 'text_final' is an assumption.)"""
    df['text_final'] = df['lemmatized'].apply(' '.join)
    return df

df = join_tokens(df)
df.head()
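A compact illustration of steps 6–8 on a single, made-up token list (reusing the NLTK objects imported above):

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

tokens = ['the', 'cameras', 'on', 'these', 'phones', 'are', 'amazing']
no_stop = [w for w in tokens if w.lower() not in stop_words]   # ['cameras', 'phones', 'amazing']
lemmas = [lemmatizer.lemmatize(w) for w in no_stop]            # ['camera', 'phone', 'amazing']
print(' '.join(lemmas))                                        # 'camera phone amazing'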
from sklearn.preprocessing import LabelEncoder

def encode_labels(df):
    """Step 9: Encode sentiment labels as integers."""
    le = LabelEncoder()
    df['sentiment_encoded'] = le.fit_transform(df['sentiment'])
    return df, le

# Apply the encoder (the call itself is not shown in the extract)
df, label_encoder = encode_labels(df)
df.head()
Consistent Representation:
Label encoding gives every sentiment class a fixed integer, so the target is represented the same way throughout training and evaluation. With this dataset's three classes, LabelEncoder sorts them alphabetically: negative → 0, neutral → 1, positive → 2.
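A minimal sketch of what LabelEncoder does, with illustrative labels:

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
print(le.fit_transform(['neutral', 'positive', 'negative', 'positive']))
# -> [1 2 0 2]  (classes are sorted alphabetically)
print(list(le.classes_))
# -> ['negative', 'neutral', 'positive']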
RoBERTa is a state-of-the-art language model that has significantly advanced the field
of sentiment analysis. Its ability to capture complex linguistic patterns and nuances,
combined with its robustness and efficiency, makes it a valuable tool for a wide range
of NLP applications.
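As a point of comparison, a RoBERTa model already fine-tuned for sentiment can be used off the shelf through the Hugging Face pipeline API; the checkpoint named below is a public example, not the model trained in this notebook:

from transformers import pipeline

# Example public checkpoint (assumption); any RoBERTa sentiment checkpoint is used the same way
clf = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest")
print(clf("The battery life is surprisingly good"))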
%%time
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW  # the AdamW previously exported by transformers is deprecated
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Method header and these two lookups reconstructed (missing from the extract)
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }
# Initialize tokenizer
print("Initializing RoBERTa tokenizer...")
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# Train/test split and data loaders (split ratio and batch size are assumed values;
# 'text_final' is the column name assumed in Step 8)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text_final'].tolist(), df['sentiment_encoded'].tolist(),
    test_size=0.2, random_state=42)

# Create datasets
train_dataset = SentimentDataset(train_texts, train_labels, tokenizer)
test_dataset = SentimentDataset(test_texts, test_labels, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Initialize model
print("Initializing RoBERTa model...")
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Hyperparameters (values assumed; the originals are not shown in the extract)
learning_rate = 2e-5
num_epochs = 3

# Set up optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)
# Training loop
print(f"Training on {device}")
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    # Training
    for batch_idx, batch in enumerate(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass (reconstructed; passing labels makes the model return the loss)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        if (batch_idx + 1) % 10 == 0:
            print(f"Batch {batch_idx + 1}/{len(train_loader)}, Loss: {loss.item():.4f}")
    # Evaluation
    model.eval()
    test_preds = []
    test_true = []
    print("\nEvaluating...")
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels']

            # Forward pass and per-example predictions (reconstructed)
            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()

            test_preds.extend(preds)
            test_true.extend(labels.numpy())

    # Calculate metrics for this epoch
    accuracy = accuracy_score(test_true, test_preds)
    print(f'Average training loss: {total_loss/len(train_loader):.4f}')
    print(f'Test Accuracy: {accuracy:.4f}')
def predict_sentiment(text, model, tokenizer, device, max_len=128):
    """Predict sentiment for a single text (wrapper reconstructed; the signature is
    inferred from the call in the main block below)."""
    model.eval()
    encoding = tokenizer.encode_plus(text, max_length=max_len, padding='max_length',
                                     truncation=True, return_tensors='pt')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    pred = torch.argmax(outputs.logits, dim=1).cpu().numpy()[0]
    # Get confidence scores
    probs = torch.nn.functional.softmax(outputs.logits, dim=1).cpu().numpy()[0]
    return pred, probs[pred]
# Main execution
if __name__ == "__main__":
    # Assuming df is your input dataframe
    print("Starting training process...")
    # train_sentiment_model is assumed to wrap the RoBERTa training code shown above
    model, tokenizer = train_sentiment_model(df)

    # Example prediction (sentiment is the encoded class index; map it back to a
    # label string with label_encoder.inverse_transform if needed)
    example_text = "The geosolutions technology will leverage benefon gps solutions"
    sentiment, confidence = predict_sentiment(
        example_text, model, tokenizer,
        torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
    print(f"\nExample prediction for: '{example_text}'")
    print(f"Predicted sentiment: {sentiment} (confidence: {confidence:.2f})")
Download Code + Data: https://t.me/AIMLDeepThaught/557