1
+
2
+ /**
3
+ * // This is the HtmlParser's API interface.
4
+ * // You should not implement it, or speculate about its implementation
5
+ * interface HtmlParser {
6
+ * public List<String> getUrls(String url) {}
7
+ * }
8
+ */
9
+ class Solution {
10
+ /**
11
+ * Solution is going to rely on the parallel streams to spin up parallel immutable tasks in a multithreaded fashion. This is ideal as it
12
+ * would use a default thread pool on its own. Alternatively, a thread pool can also be created for the same.
13
+ *
14
+ * We use a DFS mechanism for crawling all the web pages. For the DFS function to operate, we get all the pages of the url and recursively
15
+ * call the operation to recursively traverse all the child pages while maintaining a set to see if the url has already been crawled before.
16
+ *
17
+ * It uses the Java Streams to collect all the items in the collection and return it by recursively computing the rest of the items and then
18
+ * crawling it. This is a complete immutable functional approach, alternatively, it can also be done recursively, by using an additional
19
+ * storage for the resposnes and using it for the results.
20
+ */
21
+ public List <String > crawl (String startUrl , HtmlParser htmlParser ) {
22
+ final String hostName = this .getHostName (startUrl );
23
+ final Set <String > visitedUrls = Collections .synchronizedSet (new HashSet ());
24
+ return this .crawlUrl (startUrl , htmlParser , hostName , visitedUrls )
25
+ .collect (Collectors .toList ());
26
+ }
27
+
28
+ private Stream <String > crawlUrl (String startUrl ,
29
+ HtmlParser htmlParser ,
30
+ String domainName ,
31
+ Set <String > visitedUrls ) {
32
+
33
+ visitedUrls .add (startUrl );
34
+ final Stream <String > childStream = htmlParser .getUrlsQ (startUrl )
35
+ .parallelStream ()
36
+ .filter (url -> getHostName (url ).equals (domainName ))
37
+ .filter (url -> !visitedUrls .contains (url ))
38
+ .flatMap (url -> crawlUrl (url , htmlParser , domainName , visitedUrls ));
39
+
40
+ return Stream .concat (Stream .of (startUrl ), childStream );
41
+ }
42
+
43
+ private String getHostName (String url ) {
44
+ final int index = url .indexOf ("/" , 7 ); // Finding the indexes for the url post protocol.
45
+ return (index == -1 ) ? url : url .substring (0 , index );
46
+ }
47
+ }
0 commit comments