# Recovered from a two-commit patch series against nlp.py:
#   0f5e5bd "Implementing Class Tfidf" (Ashish Gupta, 2019-03-30) added this
#   class together with `import math`; 49397e0 reverted that commit.
import math
import re


# implementing Tfidf
class tfidfVectorizer:
    """Term statistics (TF, DF, IDF) and an aggregate BM25 score for a corpus.

    A corpus is a sequence of strings, one string per document.  Terms are
    compared case-insensitively (tokens are lower-cased before counting).
    Every method takes the corpus explicitly; the corpus passed to the
    constructor is only stored on ``self.text``.
    """

    def __init__(self, text):
        self.text = text

    # vectorize the text
    def vectorizer(self, text):
        """Return one token list per document in `text`.

        A token is a maximal run of word characters; the original case of
        each token is preserved.
        """
        rx = r"[\w]+"
        return [re.findall(rx, document) for document in text]

    # Get unique words as a feature
    def getUniqueWords(self, allWords):
        """Return the lower-cased vocabulary of `allWords` in first-seen order.

        `allWords` is a list of token lists, as produced by `vectorizer`.
        """
        seen = set()
        unique_list = []
        for tokens in allWords:
            for token in tokens:
                word = token.lower()
                # The set gives O(1) membership tests; the list keeps order.
                if word not in seen:
                    seen.add(word)
                    unique_list.append(word)
        return unique_list

    # Calculate term frequency and document frequency
    def Tf_df(self, text):
        """Return ``(tf, df)`` for the corpus `text`.

        ``tf[j][i]`` is the number of times vocabulary term ``i`` occurs in
        document ``j``; ``df[i]`` is the number of documents that contain
        term ``i``.  Term indices follow the order of `getUniqueWords`.
        """
        vect = self.vectorizer(text)
        unique = self.getUniqueWords(vect)
        # Map each term to its column once instead of scanning the whole
        # vocabulary for every token.
        column = {word: index for index, word in enumerate(unique)}
        tf = [[0] * len(unique) for _ in vect]
        for row, tokens in zip(tf, vect):
            for token in tokens:
                row[column[token.lower()]] += 1
        df = [sum(1 for row in tf if row[index] != 0)
              for index in range(len(unique))]
        return (tf, df)

    def _idf_from_df(self, df, n_docs):
        """Turn document frequencies into BM25-style IDF weights."""
        return [math.log10((n_docs - count + 0.5) / (count + 0.5))
                for count in df]

    # Calculate IDF
    def Idf(self, text):
        """Return the inverse document frequency of every vocabulary term.

        Uses ``log10((N - df + 0.5) / (df + 0.5))`` with ``N`` the number of
        documents, so a term that occurs in more than half of the documents
        gets a negative weight.
        """
        tf, df = self.Tf_df(text)
        return self._idf_from_df(df, len(tf))

    # Calculate BM25
    def BM25(self, text, k=2, b=0.75):
        """Return the BM25 weight summed over every term and every document.

        Each (term, document) pair contributes
        ``idf * tf * (k + 1) / (tf + k * (1 - b + b * length / avg_length))``
        where ``length`` is the document's token count and ``avg_length`` is
        the mean token count over the corpus.

        `k` controls term-frequency saturation and `b` controls how strongly
        document length is normalised.  Returns 0 for a corpus with no
        documents or no tokens.
        """
        tf, df = self.Tf_df(text)
        n_docs = len(tf)
        if n_docs == 0:
            return 0
        idf = self._idf_from_df(df, n_docs)

        # Every token is counted in exactly one tf column, so a row's sum is
        # that document's token count.
        lengths = [sum(row) for row in tf]
        avg_length = sum(lengths) / n_docs
        if avg_length == 0:
            return 0

        bm25 = 0
        for row, length in zip(tf, lengths):
            length_norm = k * (1 - b + b * length / avg_length)
            for weight, count in zip(idf, row):
                if count:  # absent terms contribute nothing
                    bm25 += weight * count * (k + 1) / (count + length_norm)
        return bm25

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy