Skip to content

Commit f0d7c85

Browse files
authored
Update scraper.py
1 parent 112ba7a commit f0d7c85

File tree

1 file changed

+10
-12
lines changed

1 file changed

+10
-12
lines changed

scraper.py

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,28 @@
11
import requests
22
from bs4 import BeautifulSoup
33

4-
proxy = {'http': 'http://SPusername:SPpassword@gate.smartproxy.com:7000'}
5-
url = 'http://books.toscrape.com/catalogue/page-1.html'
4+
proxy = {'http': 'http://username:password@gate.smartproxy.com:10000'} # Proxy authentication information
5+
url = 'http://books.toscrape.com/' # Website to make a GET request to
66

7-
r = requests.get(url, proxies=proxy)
8-
html = BeautifulSoup(r.content, 'html.parser')
7+
r = requests.get(url, proxies=proxy) # Make the GET request to a target URL using proxies
8+
html = BeautifulSoup(r.content, 'html.parser') # Parse the HTML
99

10-
all_books = html.find_all('article', class_='product_pod')
10+
all_books = html.find_all('article', class_='product_pod') # Find all article elements with the class "product_pod"
1111

12-
for book in all_books:
12+
for book in all_books: # Loop through each element and find the title, price, availability, and description
1313
title = book.h3.a['title']
1414
price = book.find('p', class_='price_color').text
1515
availability = book.find('p', class_ ='instock availability').text.strip()
1616
link_to_book = book.h3.a['href']
17+
link = "http://books.toscrape.com/{0}".format(link_to_book)
1718

18-
link = "http://books.toscrape.com/catalogue/{0}".format(link_to_book)
19-
20-
r2 = requests.get(link)
19+
r2 = requests.get(link, proxies=proxy) # Make a new request to the URL extracted earlier
2120
html2 = BeautifulSoup(r2.content, 'html.parser')
22-
2321
description = html2.find('p', class_='').text
2422

2523
print(title)
2624
print(price)
2725
print(availability)
28-
print("{0}...".format(description[:150]))
26+
print("{0}...".format(description[:150])) # Print only the first 150 characters of the description, always followed by an ellipsis
2927
print(link)
30-
print()
28+
print() # Print an empty line to separate each result

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

pFad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy