
Commit 04c7d51

Jonathon Belotti authored and norvig committed
Implementing HITS algorithm (aimacode#244)
* Implementing HITS algorithm
* Moving HITS work to nlp.py and test_nlp.py
1 parent 61ef267 commit 04c7d51

File tree

2 files changed: +296 −2 lines changed

nlp.py

Lines changed: 177 additions & 0 deletions
@@ -4,6 +4,8 @@
 # from the third edition until this gets reviewed.)
 
 from collections import defaultdict
+import urllib.request
+import re
 
 # ______________________________________________________________________________
 # Grammars and Lexicons
@@ -206,3 +208,178 @@ def CYK_parse(words, grammar):
                     P[X, start, length] = max(P[X, start, length],
                                               P[Y, start, len1] * P[Z, start+len1, len2] * p)
     return P
+
+
+# ______________________________________________________________________________
+# Page Ranking
+
+# First entry in the list is the base URL; the following entries are relative URL pages
+examplePagesSet = ["https://en.wikipedia.org/wiki/", "Aesthetics", "Analytic_philosophy",
+                   "Ancient_Greek", "Aristotle", "Astrology", "Atheism", "Baruch_Spinoza",
+                   "Belief", "Bertrand_Russell", "Confucius", "Consciousness",
+                   "Continental_philosophy", "Dialectic", "Eastern_Philosophy",
+                   "Epistemology", "Ethics", "Existentialism", "Friedrich_Nietzsche",
+                   "Idealism", "Immanuel_Kant", "List_of_political_philosophers", "Logic",
+                   "Metaphysics", "Philosophers", "Philosophy", "Philosophy_of_mind", "Physics",
+                   "Plato", "Political_philosophy", "Pythagoras", "Rationalism", "Social_philosophy",
+                   "Socrates", "Subjectivity", "Theology", "Truth", "Western_philosophy"]
+
+
+def loadPageHTML(addressList):
+    """Download HTML page content for every URL address passed as argument"""
+    contentDict = {}
+    for addr in addressList:
+        with urllib.request.urlopen(addr) as response:
+            raw_html = response.read().decode('utf-8')
+            # Strip the raw html of unnecessary content: basically everything that isn't a link or text
+            html = stripRawHTML(raw_html)
+        contentDict[addr] = html
+    return contentDict
+
+def initPages(addressList):
+    """Create a dictionary of pages from a list of URL addresses"""
+    pages = {}
+    for addr in addressList:
+        pages[addr] = Page(addr)
+    return pages
+
+def stripRawHTML(raw_html):
+    """Remove the <head> section of the HTML, which contains links to stylesheets etc.,
+    and remove all other unnecessary HTML"""
+    # TODO: Strip more out of the raw html
+    return re.sub(r"<head>.*?</head>", "", raw_html, flags=re.DOTALL)  # remove <head> section
+
+def determineInlinks(page):
+    """Given a set of pages that have their outlinks determined, we can fill
+    out a page's inlinks by looking through all other pages' outlinks"""
+    inlinks = []
+    for addr, indexPage in pagesIndex.items():
+        if page.address == indexPage.address:
+            continue
+        elif page.address in indexPage.outlinks:
+            inlinks.append(addr)
+    return inlinks
+
+def findOutlinks(page, handleURLs=None):
+    """Search a page's HTML content for URL links to other pages"""
+    urls = re.findall(r'href=[\'"]?([^\'" >]+)', pagesContent[page.address])
+    if handleURLs:
+        urls = handleURLs(urls)
+    return urls
+
+def onlyWikipediaURLS(urls):
+    """Some example HTML page data is from Wikipedia. This function converts
+    relative Wikipedia links to full Wikipedia URLs"""
+    wikiURLs = [url for url in urls if url.startswith('/wiki/')]
+    return ["https://en.wikipedia.org" + url for url in wikiURLs]
+
+
+# ______________________________________________________________________________
+# HITS Helper Functions
+
+def expand_pages(pages):
+    """From the textbook: adds in every page that links to or is linked from one of
+    the relevant pages."""
+    expanded = {}
+    for addr, page in pages.items():
+        if addr not in expanded:
+            expanded[addr] = page
+        for inlink in page.inlinks:
+            if inlink not in expanded:
+                expanded[inlink] = pagesIndex[inlink]
+        for outlink in page.outlinks:
+            if outlink not in expanded:
+                expanded[outlink] = pagesIndex[outlink]
+    return expanded
+
+def relevant_pages(query):
+    """Relevant pages are pages that contain the query in its entirety.
+    If a page's content contains the query, it is returned by the function."""
+    relevant = {}
+    for addr, page in pagesIndex.items():
+        if query.lower() in pagesContent[addr].lower():
+            relevant[addr] = page
+    return relevant
+
+def normalize(pages):
+    """From the pseudocode: Normalize divides each page's score by the sum of
+    the squares of all pages' scores (separately for both the authority and hub scores).
+    """
+    summed_hub = sum(page.hub**2 for _, page in pages.items())
+    summed_auth = sum(page.authority**2 for _, page in pages.items())
+    for _, page in pages.items():
+        page.hub /= summed_hub
+        page.authority /= summed_auth
+
+class ConvergenceDetector(object):
+    """If the hub and authority values of the pages are no longer changing, we have
+    reached convergence and further iterations will have no effect. This detects convergence
+    so that we can stop the HITS algorithm as early as possible."""
+    def __init__(self):
+        self.hub_history = None
+        self.auth_history = None
+
+    def __call__(self):
+        return self.detect()
+
+    def detect(self):
+        curr_hubs = [page.hub for addr, page in pagesIndex.items()]
+        curr_auths = [page.authority for addr, page in pagesIndex.items()]
+        if self.hub_history is None:
+            self.hub_history, self.auth_history = [], []
+        else:
+            diffsHub = [abs(x - y) for x, y in zip(curr_hubs, self.hub_history[-1])]
+            diffsAuth = [abs(x - y) for x, y in zip(curr_auths, self.auth_history[-1])]
+            aveDeltaHub = sum(diffsHub) / float(len(pagesIndex))
+            aveDeltaAuth = sum(diffsAuth) / float(len(pagesIndex))
+            if aveDeltaHub < 0.01 and aveDeltaAuth < 0.01:  # may need tweaking
+                return True
+        if len(self.hub_history) > 2:  # prevent the history lists from getting long
+            del self.hub_history[0]
+            del self.auth_history[0]
+        self.hub_history.append([x for x in curr_hubs])
+        self.auth_history.append([x for x in curr_auths])
+        return False
+
+
+def getInlinks(page):
+    if not page.inlinks:
+        page.inlinks = determineInlinks(page)
+    return [p for addr, p in pagesIndex.items() if addr in page.inlinks]
+
+def getOutlinks(page):
+    if not page.outlinks:
+        page.outlinks = findOutlinks(page)
+    return [p for addr, p in pagesIndex.items() if addr in page.outlinks]
+
+
+# ______________________________________________________________________________
+# HITS Algorithm
+
+class Page(object):
+    def __init__(self, address, hub=0, authority=0, inlinks=None, outlinks=None):
+        self.address = address
+        self.hub = hub
+        self.authority = authority
+        self.inlinks = inlinks
+        self.outlinks = outlinks
+
+pagesContent = {}  # maps a Page's relative or absolute URL/location to the page's HTML content
+pagesIndex = {}
+convergence = ConvergenceDetector()  # assign an instance to a variable to mimic the pseudocode's syntax
+
+def HITS(query):
+    """The HITS algorithm for computing hubs and authorities with respect to a query."""
+    pages = expand_pages(relevant_pages(query))  # in order to 'map' faithfully to the pseudocode we
+    for p in pages.values():                     # won't pass the list of pages as an argument
+        p.authority = 1
+        p.hub = 1
+    while True:  # repeat until... convergence
+        for p in pages.values():
+            p.authority = sum(x.hub for x in getInlinks(p))   # p.authority ← ∑i Inlinki(p).Hub
+            p.hub = sum(x.authority for x in getOutlinks(p))  # p.hub ← ∑i Outlinki(p).Authority
+        normalize(pages)
+        if convergence():
+            break
+    return pages
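
To illustrate how the new pieces fit together, here is a minimal usage sketch (not part of the commit). It assumes the Wikipedia pages in examplePagesSet are reachable over the network, uses "philosophy" as an arbitrary example query, and restricts each page's outlinks to pages inside the downloaded set so that expand_pages never looks up an address missing from pagesIndex:

import nlp
from nlp import (examplePagesSet, loadPageHTML, initPages, findOutlinks,
                 onlyWikipediaURLS, determineInlinks, HITS)

# Build absolute URLs from the base URL plus the relative page names
addresses = [examplePagesSet[0] + page for page in examplePagesSet[1:]]

# Populate the module-level globals that the HITS helpers read
nlp.pagesContent = loadPageHTML(addresses)   # download and strip the HTML
nlp.pagesIndex = initPages(addresses)        # create a Page object per URL

# Determine the link structure: outlinks first, then inlinks derived from them
for page in nlp.pagesIndex.values():
    page.outlinks = [url for url in findOutlinks(page, handleURLs=onlyWikipediaURLS)
                     if url in nlp.pagesIndex]  # keep only links within the crawled set
for page in nlp.pagesIndex.values():
    page.inlinks = determineInlinks(page)

# Rank the pages for a query and print the five strongest authorities
ranked = HITS("philosophy")
for page in sorted(ranked.values(), key=lambda p: p.authority, reverse=True)[:5]:
    print(page.address, page.authority)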

tests/test_nlp.py

Lines changed: 119 additions & 2 deletions
@@ -1,10 +1,127 @@
 import pytest
-from nlp import *
-
+import nlp
+from nlp import loadPageHTML, stripRawHTML, determineInlinks, findOutlinks, onlyWikipediaURLS
+from nlp import expand_pages, relevant_pages, normalize, ConvergenceDetector, getInlinks
+from nlp import getOutlinks, Page, HITS
+from nlp import Rules, Lexicon
+# Clumsy imports because we want to access certain nlp.py globals explicitly, because
+# they are accessed by functions within nlp.py
 
 def test_rules():
     assert Rules(A="B C | D E") == {'A': [['B', 'C'], ['D', 'E']]}
 
 
 def test_lexicon():
     assert Lexicon(Art="the | a | an") == {'Art': ['the', 'a', 'an']}
+
+
+# ______________________________________________________________________________
+# Data Setup
+
+testHTML = """Keyword String 1: A man is a male human.
+Keyword String 2: Like most other male mammals, a man inherits an
+X from his mom and a Y from his dad.
+Links:
+href="https://google.com.au"
+< href="/wiki/TestThing" > href="/wiki/TestBoy"
+href="/wiki/TestLiving" href="/wiki/TestMan" >"""
+testHTML2 = "Nothing"
+
+pA = Page("A", 1, 6, ["B", "C", "E"], ["D"])
+pB = Page("B", 2, 5, ["E"], ["A", "C", "D"])
+pC = Page("C", 3, 4, ["B", "E"], ["A", "D"])
+pD = Page("D", 4, 3, ["A", "B", "C", "E"], [])
+pE = Page("E", 5, 2, [], ["A", "B", "C", "D", "F"])
+pF = Page("F", 6, 1, ["E"], [])
+pageDict = {pA.address: pA, pB.address: pB, pC.address: pC,
+            pD.address: pD, pE.address: pE, pF.address: pF}
+nlp.pagesIndex = pageDict
+nlp.pagesContent = {pA.address: testHTML, pB.address: testHTML2,
+                    pC.address: testHTML, pD.address: testHTML2,
+                    pE.address: testHTML, pF.address: testHTML2}
+
+# This test takes a long time (> 60 secs)
+# def test_loadPageHTML():
+#     # first format all the relative URLs with the base URL
+#     addresses = [examplePagesSet[0] + x for x in examplePagesSet[1:]]
+#     loadedPages = loadPageHTML(addresses)
+#     relURLs = ['Ancient_Greek', 'Ethics', 'Plato', 'Theology']
+#     fullURLs = ["https://en.wikipedia.org/wiki/" + x for x in relURLs]
+#     assert all(x in loadedPages for x in fullURLs)
+#     assert all(loadedPages.get(key, "") != "" for key in addresses)
+
+def test_stripRawHTML():
+    addr = "https://en.wikipedia.org/wiki/Ethics"
+    aPage = loadPageHTML([addr])
+    someHTML = aPage[addr]
+    strippedHTML = stripRawHTML(someHTML)
+    assert "<head>" not in strippedHTML and "</head>" not in strippedHTML
+
+def test_determineInlinks():
+    # TODO
+    assert True
+
+def test_findOutlinks_wiki():
+    testPage = pageDict[pA.address]
+    outlinks = findOutlinks(testPage, handleURLs=onlyWikipediaURLS)
+    assert "https://en.wikipedia.org/wiki/TestThing" in outlinks
+    assert "https://en.wikipedia.org/wiki/TestBoy" in outlinks
+    assert "https://google.com.au" not in outlinks
+# ______________________________________________________________________________
+# HITS Helper Functions
+
+def test_expand_pages():
+    pages = {k: pageDict[k] for k in ('F',)}
+    pagesTwo = {k: pageDict[k] for k in ('A', 'E')}
+    expanded_pages = expand_pages(pages)
+    assert all(x in expanded_pages for x in ['F', 'E'])
+    assert all(x not in expanded_pages for x in ['A', 'B', 'C', 'D'])
+    expanded_pages = expand_pages(pagesTwo)
+    assert all(x in expanded_pages for x in ['A', 'B', 'C', 'D', 'E', 'F'])
+
+def test_relevant_pages():
+    pages = relevant_pages("male")
+    assert all((x in pages.keys()) for x in ['A', 'C', 'E'])
+    assert all((x not in pages) for x in ['B', 'D', 'F'])
+
+def test_normalize():
+    normalize(pageDict)
+    expected_hub = [1/91, 2/91, 3/91, 4/91, 5/91, 6/91]  # works only for the sample data above
+    expected_auth = list(reversed(expected_hub))
+    assert len(expected_hub) == len(expected_auth) == len(nlp.pagesIndex)
+    assert expected_hub == [page.hub for addr, page in sorted(nlp.pagesIndex.items())]
+    assert expected_auth == [page.authority for addr, page in sorted(nlp.pagesIndex.items())]
+
+def test_detectConvergence():
+    # run the detector once to initialise its history
+    convergence = ConvergenceDetector()
+    convergence()
+    assert convergence()  # values haven't changed, so this should return True
+    # make a tiny increase/decrease to all values
+    for _, page in nlp.pagesIndex.items():
+        page.hub += 0.0003
+        page.authority += 0.0004
+    # retest with the new values. Should still return True
+    assert convergence()
+    for _, page in nlp.pagesIndex.items():
+        page.hub += 3000000
+        page.authority += 3000000
+    # retest with the new values. Should now return False
+    assert not convergence()
+
+def test_getInlinks():
+    inlnks = getInlinks(pageDict['A'])
+    assert sorted([page.address for page in inlnks]) == pageDict['A'].inlinks
+
+def test_getOutlinks():
+    outlnks = getOutlinks(pageDict['A'])
+    assert sorted([page.address for page in outlnks]) == pageDict['A'].outlinks
+
+def test_HITS():
+    # TODO
+    assert True  # leave for now
+
+if __name__ == '__main__':
+    pytest.main()
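
As a quick usage note (assuming pytest is installed and the command is run from the repository root so that nlp.py is importable), the new tests can be run with:

python -m pytest tests/test_nlp.py -q

Note that test_stripRawHTML fetches a live Wikipedia page through loadPageHTML, so it needs network access; the commented-out test_loadPageHTML is skipped by default because it downloads the whole example page set.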
