
Implementing HITS algorithm #244


Merged
merged 2 commits on Sep 7, 2016
177 changes: 177 additions & 0 deletions nlp.py
@@ -4,6 +4,8 @@
# from the third edition until this gets reviewed.)

from collections import defaultdict
import urllib.request
import re

# ______________________________________________________________________________
# Grammars and Lexicons
@@ -206,3 +208,178 @@ def CYK_parse(words, grammar):
P[X, start, length] = max(P[X, start, length],
P[Y, start, len1] * P[Z, start+len1, len2] * p)
return P


# ______________________________________________________________________________
# Page Ranking

# The first entry in the list is the base URL; the entries that follow are relative page URLs
examplePagesSet = ["https://en.wikipedia.org/wiki/", "Aesthetics", "Analytic_philosophy",
                   "Ancient_Greek", "Aristotle", "Astrology", "Atheism", "Baruch_Spinoza",
                   "Belief", "Bertrand_Russell", "Confucius", "Consciousness",
                   "Continental_philosophy", "Dialectic", "Eastern_Philosophy",
                   "Epistemology", "Ethics", "Existentialism", "Friedrich_Nietzsche",
                   "Idealism", "Immanuel_Kant", "List_of_political_philosophers", "Logic",
                   "Metaphysics", "Philosophers", "Philosophy", "Philosophy_of_mind", "Physics",
                   "Plato", "Political_philosophy", "Pythagoras", "Rationalism", "Social_philosophy",
                   "Socrates", "Subjectivity", "Theology", "Truth", "Western_philosophy"]


def loadPageHTML( addressList ):
"""Download HTML page content for every URL address passed as argument"""
contentDict = {}
for addr in addressList:
with urllib.request.urlopen(addr) as response:
raw_html = response.read().decode('utf-8')
            # Strip the raw HTML of unnecessary content: basically everything that isn't a link or text
html = stripRawHTML(raw_html)
contentDict[addr] = html
return contentDict

def initPages( addressList ):
"""Create a dictionary of pages from a list of URL addresses"""
pages = {}
for addr in addressList:
pages[addr] = Page(addr)
return pages

def stripRawHTML( raw_html ):
"""Remove the <head> section of the HTML which contains links to stylesheets etc.,
and remove all other unnessecary HTML"""
# TODO: Strip more out of the raw html
return re.sub("<head>.*?</head>", "", raw_html, flags=re.DOTALL) # remove <head> section

def determineInlinks( page ):
"""Given a set of pages that have their outlinks determined, we can fill
out a page's inlinks by looking through all other page's outlinks"""
inlinks = []
for addr, indexPage in pagesIndex.items():
if page.address == indexPage.address:
continue
elif page.address in indexPage.outlinks:
inlinks.append(addr)
return inlinks

def findOutlinks( page, handleURLs=None ):
"""Search a page's HTML content for URL links to other pages"""
urls = re.findall(r'href=[\'"]?([^\'" >]+)', pagesContent[page.address])
if handleURLs:
urls = handleURLs(urls)
return urls

def onlyWikipediaURLS( urls ):
"""Some example HTML page data is from wikipedia. This function converts
relative wikipedia links to full wikipedia URLs"""
wikiURLs = [url for url in urls if url.startswith('/wiki/')]
return ["https://en.wikipedia.org"+url for url in wikiURLs]


# ______________________________________________________________________________
# HITS Helper Functions

def expand_pages( pages ):
"""From Textbook: adds in every page that links to or is linked from one of
the relevant pages."""
expanded = {}
for addr,page in pages.items():
if addr not in expanded:
expanded[addr] = page
for inlink in page.inlinks:
if inlink not in expanded:
expanded[inlink] = pagesIndex[inlink]
for outlink in page.outlinks:
if outlink not in expanded:
expanded[outlink] = pagesIndex[outlink]
return expanded

def relevant_pages(query):
"""relevant pages are pages that contain the query in its entireity.
If a page's content contains the query it is returned by the function"""
relevant = {}
print("pagesContent in function: ", pagesContent)
for addr, page in pagesIndex.items():
if query.lower() in pagesContent[addr].lower():
relevant[addr] = page
return relevant

def normalize( pages ):
"""From the pseudocode: Normalize divides each page's score by the sum of
the squares of all pages' scores (separately for both the authority and hubs scores).
"""
    summed_hub = sum(page.hub ** 2 for page in pages.values())
    summed_auth = sum(page.authority ** 2 for page in pages.values())
    for page in pages.values():
        page.hub /= summed_hub
        page.authority /= summed_auth
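# Worked example (using the sample scores from the unit tests): for hub scores 1..6 the
# divisor is 1**2 + 2**2 + ... + 6**2 = 91, so the normalized hub scores become
# 1/91, 2/91, ..., 6/91; authority scores are normalized the same way with their own
# sum of squares.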

class ConvergenceDetector(object):
"""If the hub and authority values of the pages are no longer changing, we have
reached a convergence and further iterations will have no effect. This detects convergence
so that we can stop the HITS algorithm as early as possible."""
def __init__(self):
self.hub_history = None
self.auth_history = None

def __call__(self):
return self.detect()

def detect(self):
        curr_hubs = [page.hub for page in pagesIndex.values()]
        curr_auths = [page.authority for page in pagesIndex.values()]
        if self.hub_history is None:
            self.hub_history, self.auth_history = [], []
else:
diffsHub = [abs(x-y) for x, y in zip(curr_hubs,self.hub_history[-1])]
diffsAuth = [abs(x-y) for x, y in zip(curr_auths,self.auth_history[-1])]
aveDeltaHub = sum(diffsHub)/float(len(pagesIndex))
aveDeltaAuth = sum(diffsAuth)/float(len(pagesIndex))
if aveDeltaHub < 0.01 and aveDeltaAuth < 0.01: # may need tweaking
return True
if len(self.hub_history) > 2: # prevent list from getting long
del self.hub_history[0]
del self.auth_history[0]
        self.hub_history.append(list(curr_hubs))
        self.auth_history.append(list(curr_auths))
return False
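# Usage sketch (illustrative): the detector is stateful, so the first call only records
# the current scores and returns False; later calls compare against the previous snapshot.
#
#   detector = ConvergenceDetector()
#   while not detector():
#       ...  # update the hub and authority scores of the pages in pagesIndex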


def getInlinks( page ):
if not page.inlinks:
page.inlinks = determineInlinks(page)
return [p for addr, p in pagesIndex.items() if addr in page.inlinks ]

def getOutlinks( page ):
if not page.outlinks:
page.outlinks = findOutlinks(page)
return [p for addr, p in pagesIndex.items() if addr in page.outlinks]


# ______________________________________________________________________________
# HITS Algorithm

class Page(object):
def __init__(self, address, hub=0, authority=0, inlinks=None, outlinks=None):
self.address = address
self.hub = hub
self.authority = authority
self.inlinks = inlinks
self.outlinks = outlinks

pagesContent = {}  # maps a page's relative or absolute URL/location to its HTML content
pagesIndex = {}    # maps a page's URL/location to its Page object
convergence = ConvergenceDetector()  # assign the detector to a variable to mimic the pseudocode's syntax

def HITS(query):
"""The HITS algorithm for computing hubs and authorities with respect to a query."""
pages = expand_pages(relevant_pages(query)) # in order to 'map' faithfully to pseudocode we
    for p in pages.values():                    # won't pass the list of pages as an argument
        p.authority = 1
        p.hub = 1
while True: # repeat until... convergence
        for p in pages.values():
p.authority = sum(x.hub for x in getInlinks(p)) # p.authority ← ∑i Inlinki(p).Hub
p.hub = sum(x.authority for x in getOutlinks(p)) # p.hub ← ∑i Outlinki(p).Authority
normalize(pages)
if convergence():
break
return pages
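A rough end-to-end sketch of driving this module (the query "philosophy" and the top-5 ranking are illustrative; the outlink-filtering step is an assumption needed because expand_pages resolves every outlink through pagesIndex):

    import nlp

    addresses = [nlp.examplePagesSet[0] + p for p in nlp.examplePagesSet[1:]]
    nlp.pagesContent = nlp.loadPageHTML(addresses)  # downloads the example Wikipedia pages
    nlp.pagesIndex = nlp.initPages(addresses)
    for page in nlp.pagesIndex.values():
        # keep only outlinks that are themselves indexed, since expand_pages
        # looks every outlink up in pagesIndex
        page.outlinks = [url for url in nlp.findOutlinks(page, handleURLs=nlp.onlyWikipediaURLS)
                         if url in nlp.pagesIndex]
    for page in nlp.pagesIndex.values():
        page.inlinks = nlp.determineInlinks(page)  # needs all outlinks computed first
    ranked = nlp.HITS("philosophy")
    top5 = sorted(ranked.values(), key=lambda p: p.authority, reverse=True)[:5]
    print([p.address for p in top5])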
121 changes: 119 additions & 2 deletions tests/test_nlp.py
@@ -1,10 +1,127 @@
import pytest
from nlp import *

import nlp
from nlp import loadPageHTML, stripRawHTML, determineInlinks, findOutlinks, onlyWikipediaURLS
from nlp import expand_pages, relevant_pages, normalize, ConvergenceDetector, getInlinks
from nlp import getOutlinks, Page, HITS
from nlp import Rules, Lexicon
# Clumsy imports because we want to access certain nlp.py globals explicitly, because
# they are accessed by functions within nlp.py

def test_rules():
assert Rules(A="B C | D E") == {'A': [['B', 'C'], ['D', 'E']]}


def test_lexicon():
assert Lexicon(Art="the | a | an") == {'Art': ['the', 'a', 'an']}


# ______________________________________________________________________________
# Data Setup

testHTML = """Keyword String 1: A man is a male human.
Keyword String 2: Like most other male mammals, a man inherits an
X from his mom and a Y from his dad.
Links:
href="https://google.com.au"
< href="/wiki/TestThing" > href="/wiki/TestBoy"
href="/wiki/TestLiving" href="/wiki/TestMan" >"""
testHTML2 = "Nothing"

pA = Page("A", 1, 6, ["B","C","E"],["D"])
pB = Page("B", 2, 5, ["E"],["A","C","D"])
pC = Page("C", 3, 4, ["B","E"],["A","D"])
pD = Page("D", 4, 3, ["A","B","C","E"],[])
pE = Page("E", 5, 2, [],["A","B","C","D","F"])
pF = Page("F", 6, 1, ["E"],[])
pageDict = {pA.address:pA,pB.address:pB,pC.address:pC,
pD.address:pD,pE.address:pE,pF.address:pF}
nlp.pagesIndex = pageDict
nlp.pagesContent = {pA.address: testHTML, pB.address: testHTML2,
                    pC.address: testHTML, pD.address: testHTML2,
                    pE.address: testHTML, pF.address: testHTML2}

# This test takes a long time (> 60 secs)
# def test_loadPageHTML():
# # first format all the relative URLs with the base URL
# addresses = [examplePagesSet[0] + x for x in examplePagesSet[1:]]
# loadedPages = loadPageHTML(addresses)
# relURLs = ['Ancient_Greek','Ethics','Plato','Theology']
# fullURLs = ["https://en.wikipedia.org/wiki/"+x for x in relURLs]
# assert all(x in loadedPages for x in fullURLs)
# assert all(loadedPages.get(key,"") != "" for key in addresses)

def test_stripRawHTML():
addr = "https://en.wikipedia.org/wiki/Ethics"
aPage = loadPageHTML([addr])
someHTML = aPage[addr]
strippedHTML = stripRawHTML(someHTML)
assert "<head>" not in strippedHTML and "</head>" not in strippedHTML

def test_determineInlinks():
# TODO
assert True

def test_findOutlinks_wiki():
testPage = pageDict[pA.address]
outlinks = findOutlinks(testPage, handleURLs=onlyWikipediaURLS)
assert "https://en.wikipedia.org/wiki/TestThing" in outlinks
assert "https://en.wikipedia.org/wiki/TestThing" in outlinks
assert "https://google.com.au" not in outlinks
# ______________________________________________________________________________
# HITS Helper Functions

def test_expand_pages():
    pages = {k: pageDict[k] for k in ('F',)}
pagesTwo = {k: pageDict[k] for k in ('A','E')}
expanded_pages = expand_pages(pages)
assert all(x in expanded_pages for x in ['F','E'])
assert all(x not in expanded_pages for x in ['A','B','C','D'])
expanded_pages = expand_pages(pagesTwo)
assert all(x in expanded_pages for x in ['A','B','C','D','E','F'])

def test_relevant_pages():
pages = relevant_pages("male")
assert all((x in pages.keys()) for x in ['A','C','E'])
assert all((x not in pages) for x in ['B','D','F'])

def test_normalize():
normalize( pageDict )
expected_hub = [1/91,2/91,3/91,4/91,5/91,6/91] # Works only for sample data above
expected_auth = list(reversed(expected_hub))
assert len(expected_hub) == len(expected_auth) == len(nlp.pagesIndex)
assert expected_hub == [page.hub for addr,page in sorted(nlp.pagesIndex.items())]
assert expected_auth == [page.authority for addr,page in sorted(nlp.pagesIndex.items())]

def test_detectConvergence():
    # run the convergence detector once to initialise its history
convergence = ConvergenceDetector()
convergence()
assert convergence() # values haven't changed so should return True
# make tiny increase/decrease to all values
for _, page in nlp.pagesIndex.items():
page.hub += 0.0003
page.authority += 0.0004
# retest function with values. Should still return True
assert convergence()
for _, page in nlp.pagesIndex.items():
page.hub += 3000000
page.authority += 3000000
# retest function with values. Should now return false
assert not convergence()

def test_getInlinks():
    inlinks = getInlinks(pageDict['A'])
    assert sorted([page.address for page in inlinks]) == pageDict['A'].inlinks

def test_getOutlinks():
    outlinks = getOutlinks(pageDict['A'])
    assert sorted([page.address for page in outlinks]) == pageDict['A'].outlinks

def test_HITS():
# TODO
assert True # leave for now

if __name__ == '__main__':
pytest.main()