
Commit 53da94f: "updates"

Parent: 239a0ff

File tree: 3 files changed (+37, -53 lines)


08_basic_email_web_crawler.py

Lines changed: 20 additions & 24 deletions
@@ -7,40 +7,36 @@
 link_re = re.compile(r'href="(.*?)"')
 
 
-def crawl(url, maxlevel):
+def crawl(url):
 
     result = set()
 
-    while maxlevel > 0:
+    req = requests.get(url)
 
-        # Get the webpage
-        req = requests.get(url)
+    # Check if successful
+    if(req.status_code != 200):
+        return []
 
-        # Check if successful
-        if(req.status_code != 200):
-            return []
+    # Find links
+    links = link_re.findall(req.text)
 
-        # Find and follow all the links
-        links = link_re.findall(req.text)
-        for link in links:
-            # Get an absolute URL for a link
-            link = urlparse.urljoin(url, link)
+    print "\nFound {} links".format(len(links))
 
-            # Find all emails on current page
-            result.update(email_re.findall(req.text))
+    # Search links for emails
+    for link in links:
 
-        print "Crawled level: {}".format(maxlevel)
+        # Get an absolute URL for a link
+        link = urlparse.urljoin(url, link)
 
-        # new level
-        maxlevel -= 1
-
-        # recurse
-        crawl(link, maxlevel)
+        # Find all emails on current page
+        result.update(email_re.findall(req.text))
 
     return result
 
-emails = crawl('http://www.website_goes_here_dot_com', 2)
+if __name__ == '__main__':
+    emails = crawl('http://www.realpython.com')
 
-print "\nScrapped e-mail addresses:"
-for email in emails:
-    print email
+    print "\nScrapped e-mail addresses:"
+    for email in emails:
+        print email
+    print "\n"

09_basic_link_web_crawler.py

Lines changed: 15 additions & 27 deletions
@@ -6,39 +6,27 @@
 link_re = re.compile(r'href="(.*?)"')
 
 
-def crawl(url, maxlevel):
+def crawl(url):
 
-    result = set()
+    req = requests.get(url)
 
-    while maxlevel > 0:
+    # Check if successful
+    if(req.status_code != 200):
+        return []
 
-        # Get the webpage
-        req = requests.get(url)
+    # Find links
+    links = link_re.findall(req.text)
 
-        # Check if successful
-        if(req.status_code != 200):
-            return []
+    print "\nFound {} links".format(len(links))
 
-        # Find and follow all the links
-        links = link_re.findall(req.text)
-        for link in links:
-            # Get an absolute URL for a link
-            link = urlparse.urljoin(url, link)
-            # add links to result set
-            result.update(link)
+    # Search links for emails
+    for link in links:
 
-        print "Crawled level: {}".format(maxlevel)
+        # Get an absolute URL for a link
+        link = urlparse.urljoin(url, link)
 
-        # new level
-        maxlevel -= 1
+        print link
 
-        # recurse
-        crawl(link, maxlevel)
 
-    return result
-
-emails = crawl('http://www.website_goes_here_dot_com', 2)
-
-print "\nScrapped links:"
-for link in links:
-    print link
+if __name__ == '__main__':
+    crawl('http://www.realpython.com')
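The link crawler gets the same reshaping: no recursion, one page fetched, every href resolved and printed. A matching Python 3 sketch follows, again assuming only requests as a third-party dependency; the committed "# Search links for emails" comment appears to be copied over from the email crawler, so it is reworded here.

import re
from urllib.parse import urljoin

import requests

link_re = re.compile(r'href="(.*?)"')


def crawl(url):
    req = requests.get(url)

    # Bail out if the page did not load successfully
    if req.status_code != 200:
        return []

    # Find links on the fetched page
    links = link_re.findall(req.text)
    print("\nFound {} links".format(len(links)))

    # Resolve each href against the page URL and print it
    for link in links:
        print(urljoin(url, link))


if __name__ == '__main__':
    crawl('http://www.realpython.com')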

readme.md

Lines changed: 2 additions & 2 deletions
@@ -7,6 +7,6 @@
 1. **05_load_json_without_dupes.py**: load json, convert to dict, raise error if there is a duplicate key
 1. **06_execution_time.py**: class used for timing execution of code
 1. **07_benchmark_permissions_loading_django.py**: benchmark loading of permissions in Django
-1. **08_basic_email_web_crawler.py**: web crawler for grabbing emails from a website recursively
-1. **09_basic_link_web_crawler.py**: web crawler for grabbing links from a website recursively
+1. **08_basic_email_web_crawler.py**: web crawler for grabbing emails from a website
+1. **09_basic_link_web_crawler.py**: web crawler for grabbing links from a website
 1. **10_find_files_recursively.py**: recursively grab files from a directory
