Skip to content

Commit e6d5fcf

Browse files
Chipe1 authored and norvig committed
Intersection query for relevant_pages (#509)
* Modified relevant_pages()
* Additional tests for relevant_pages()
1 parent cd08bec commit e6d5fcf

File tree

2 files changed

+18
-12
lines changed

2 files changed

+18
-12
lines changed

nlp.py

Lines changed: 11 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -301,15 +301,17 @@ def expand_pages(pages):
301301

302302

303303
def relevant_pages(query):
304-
"""Relevant pages are pages that contain the query in its entireity.
305-
If a page's content contains the query it is returned by the function."""
306-
relevant = {}
307-
print("pagesContent in function: ", pagesContent)
308-
for addr, page in pagesIndex.items():
309-
if query.lower() in pagesContent[addr].lower():
310-
relevant[addr] = page
311-
return relevant
312-
304+
"""Relevant pages are pages that contain all of the query words. They are obtained by
305+
intersecting the hit lists of the query words."""
306+
hit_intersection = {addr for addr in pagesIndex}
307+
query_words = query.split()
308+
for query_word in query_words:
309+
hit_list = set()
310+
for addr in pagesIndex:
311+
if query_word.lower() in pagesContent[addr].lower():
312+
hit_list.add(addr)
313+
hit_intersection = hit_intersection.intersection(hit_list)
314+
return {addr: pagesIndex[addr] for addr in hit_intersection}
313315

314316
def normalize(pages):
315317
"""From the pseudocode: Normalize divides each page's score by the sum of

tests/test_nlp.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ def test_lexicon():
3030
href="https://google.com.au"
3131
< href="/wiki/TestThing" > href="/wiki/TestBoy"
3232
href="/wiki/TestLiving" href="/wiki/TestMan" >"""
33-
testHTML2 = "Nothing"
33+
testHTML2 = "a mom and a dad"
3434
testHTML3 = """
3535
<!DOCTYPE html>
3636
<html>
@@ -106,9 +106,13 @@ def test_expand_pages():
106106

107107

108108
def test_relevant_pages():
109-
pages = relevant_pages("male")
110-
assert all((x in pages.keys()) for x in ['A', 'C', 'E'])
109+
pages = relevant_pages("his dad")
110+
assert all((x in pages) for x in ['A', 'C', 'E'])
111111
assert all((x not in pages) for x in ['B', 'D', 'F'])
112+
pages = relevant_pages("mom and dad")
113+
assert all((x in pages) for x in ['A', 'B', 'C', 'D', 'E', 'F'])
114+
pages = relevant_pages("philosophy")
115+
assert all((x not in pages) for x in ['A', 'B', 'C', 'D', 'E', 'F'])
112116

113117

114118
def test_normalize():

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy