added web crawler code
Vishal M Yadav committed Dec 2, 2024
commit d26ec838e082759404f78469df696a8d67d10dcb
5 changes: 5 additions & 0 deletions pom.xml
@@ -20,6 +20,11 @@
<version>RELEASE</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.16.1</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
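The jsoup dependency added above is what the link extraction in this commit builds on. As a minimal, illustrative sketch of that usage (the class name and HTML snippet here are hypothetical, not part of the commit):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupExample {
    public static void main(String[] args) {
        // Parse an HTML snippet and print the href of every anchor tag,
        // using the same select("a[href]") pattern as the parsers below
        String html = "<html><body><a href=\"https://example.com\">link</a></body></html>";
        Document doc = Jsoup.parse(html);
        for (Element anchor : doc.select("a[href]")) {
            System.out.println(anchor.attr("href"));
        }
    }
}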
9 changes: 9 additions & 0 deletions src/main/java/com/gatomalvado/done/multithreadedwebcrawler/HtmlParser.java
@@ -0,0 +1,9 @@
package com.gatomalvado.done.multithreadedwebcrawler;

import java.util.List;

public interface HtmlParser {

List<String> parseHtml(String inputUrl);

}
18 changes: 18 additions & 0 deletions src/main/java/com/gatomalvado/done/multithreadedwebcrawler/Main.java
@@ -0,0 +1,18 @@
package com.gatomalvado.done.multithreadedwebcrawler;

import java.util.concurrent.ExecutionException;

public class Main {

public static void main(String[] args) throws ExecutionException, InterruptedException {
System.out.println("Hello Multithreaded Web Crawler!");
String inputUrl = "https://en.wikipedia.org/wiki/As_It_Was";

MultithreadedWebCrawler crawler = new MultithreadedWebCrawler(new SingleThreadedHtmlParser(), 10);
crawler.startCrawl(inputUrl);
crawler.showParsedUrls();
}



}
72 changes: 72 additions & 0 deletions src/main/java/com/gatomalvado/done/multithreadedwebcrawler/MultithreadedWebCrawler.java
@@ -0,0 +1,72 @@
package com.gatomalvado.done.multithreadedwebcrawler;

import java.util.ArrayDeque;
import java.util.Collections;
import java.util.Deque;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;

public class MultithreadedWebCrawler {

private final HtmlParser htmlParser;
private final Map<String, Boolean> map;
private final ExecutorService executorService;
private final int limit;

public MultithreadedWebCrawler(HtmlParser htmlParser, int limit) {
this.htmlParser = htmlParser;
this.map = new ConcurrentHashMap<>();
this.limit = limit;
this.executorService = Executors.newFixedThreadPool(12);
}

public void startCrawl(String inputUrl) throws ExecutionException, InterruptedException {
Future<List<String>> extractedUrls = executorService.submit(crawl(inputUrl));
Deque<Future<List<String>>> queue = new ArrayDeque<>();
queue.add(extractedUrls);
while (!queue.isEmpty()) {
if(map.size() >= limit) {
break;
}
            // pause each round before collecting results and submitting more work
            Thread.sleep(3000);

Future<List<String>> extractedUrlsFuture = queue.removeFirst();
List<String> parsedUrls = extractedUrlsFuture.get();
for(String parsedUrl : parsedUrls) {
if (!map.containsKey(parsedUrl)) {
Callable<List<String>> callable = crawl(parsedUrl);
queue.add(executorService.submit(callable));
}
}
}
executorService.shutdown();
        // give in-flight tasks up to one second to finish before returning
        executorService.awaitTermination(1, TimeUnit.SECONDS);
}

public void showParsedUrls() {
for(String key : map.keySet()) {
System.out.println(key);
}
}

    private Callable<List<String>> crawl(String url) {
        return () -> {
            // putIfAbsent marks the URL atomically, so two workers never parse the same page twice
            if (map.putIfAbsent(url, true) == null) {
                List<String> parsedUrls = htmlParser.parseHtml(url);
                return parsedUrls.stream()
                        .filter(u -> !map.containsKey(u))
                        .collect(Collectors.toUnmodifiableList());
            }
            return Collections.emptyList();
        };
    }

}
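Because MultithreadedWebCrawler depends only on the HtmlParser interface, it can be exercised without any network access. A minimal sketch under that assumption; FakeHtmlParser, FakeCrawlExample, and the .test URLs below are hypothetical and not part of the commit:

package com.gatomalvado.done.multithreadedwebcrawler;

import java.util.List;
import java.util.Map;

public class FakeCrawlExample {

    // Hypothetical parser backed by a fixed in-memory link graph instead of real HTTP requests
    static class FakeHtmlParser implements HtmlParser {
        private final Map<String, List<String>> links = Map.of(
                "https://a.test", List.of("https://b.test", "https://c.test"),
                "https://b.test", List.of("https://c.test"));

        @Override
        public List<String> parseHtml(String inputUrl) {
            return links.getOrDefault(inputUrl, List.of());
        }
    }

    public static void main(String[] args) throws Exception {
        MultithreadedWebCrawler crawler = new MultithreadedWebCrawler(new FakeHtmlParser(), 3);
        crawler.startCrawl("https://a.test");
        crawler.showParsedUrls(); // should print the three fake URLs, order not guaranteed
    }
}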
53 changes: 53 additions & 0 deletions src/main/java/com/gatomalvado/done/multithreadedwebcrawler/SingleThreadedHtmlParser.java
@@ -0,0 +1,53 @@
package com.gatomalvado.done.multithreadedwebcrawler;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.LinkedList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class SingleThreadedHtmlParser implements HtmlParser {

    @Override
    public List<String> parseHtml(String inputUrl) {
        String rawHtml = readUrl(inputUrl);
        return getUrlsFromWebsite(rawHtml, inputUrl);
    }

    private List<String> getUrlsFromWebsite(String rawHtml, String baseUrl) {
        List<String> urls = new LinkedList<>();
        // Passing the base URL lets abs:href resolve relative links such as /wiki/... to absolute URLs
        Document doc = Jsoup.parse(rawHtml, baseUrl);
        Elements elements = doc.select("a[href]");

        for (Element element : elements) {
            String link = element.attr("abs:href");
            if (!link.isEmpty()) {
                urls.add(link);
            }
        }

        return urls;
    }

    private String readUrl(String webLink) {
        // Read the raw HTML; try-with-resources closes the stream even if reading fails
        StringBuilder rawHtml = new StringBuilder();
        try {
            URL url = new URL(webLink);
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()))) {
                String inputLine;
                while ((inputLine = reader.readLine()) != null) {
                    rawHtml.append(inputLine);
                }
            }
        } catch (Exception e) {
            System.out.println("Error reading url: " + webLink);
        }

        return rawHtml.toString();
    }
}
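As a side note, jsoup can also perform the HTTP fetch itself, which would make the manual readUrl step above unnecessary. A hedged sketch of that variant; JsoupConnectHtmlParser is hypothetical and not what this commit uses:

package com.gatomalvado.done.multithreadedwebcrawler;

import java.util.LinkedList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupConnectHtmlParser implements HtmlParser {

    @Override
    public List<String> parseHtml(String inputUrl) {
        List<String> urls = new LinkedList<>();
        try {
            // connect(...).get() fetches and parses in one step and keeps the base URI,
            // so abs:href resolves relative links to absolute URLs
            Document doc = Jsoup.connect(inputUrl).get();
            for (Element element : doc.select("a[href]")) {
                String link = element.attr("abs:href");
                if (!link.isEmpty()) {
                    urls.add(link);
                }
            }
        } catch (Exception e) {
            System.out.println("Error reading url: " + inputUrl);
        }
        return urls;
    }
}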
12 changes: 12 additions & 0 deletions src/main/java/com/gatomalvado/done/webcrawler/Main.java
@@ -0,0 +1,12 @@
package com.gatomalvado.done.webcrawler;

public class Main {

public static void main(String[] args) throws InterruptedException {
System.out.println("Hello Simple Webcrawler!");
WebCrawler webCrawler = new WebCrawler();
webCrawler.crawl("https://en.wikipedia.org/wiki/As_It_Was");
System.out.println(webCrawler.getDiscoveredWebsites());
}

}
96 changes: 96 additions & 0 deletions src/main/java/com/gatomalvado/done/webcrawler/WebCrawler.java
@@ -0,0 +1,96 @@
package com.gatomalvado.done.webcrawler;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import lombok.Getter;

public class WebCrawler {

private Queue<String> queue;

@Getter
private Set<String> discoveredWebsites;

private int websitesProcessed;

public WebCrawler() {
this.queue = new LinkedList<>();
this.discoveredWebsites = new HashSet<>();
this.websitesProcessed = 0;
}

    public void crawl(String seedUrl) throws InterruptedException {
        this.queue.offer(seedUrl);
        Thread.sleep(3000);
        while (!this.queue.isEmpty()) {
            String currentUrl = this.queue.poll();
            // Set.add returns false for URLs already processed, so duplicates in the queue are skipped
            if (!this.discoveredWebsites.add(currentUrl)) {
                continue;
            }

            String rawHtml = readUrl(currentUrl);
            if ("".equals(rawHtml)) {
                continue;
            }

            List<String> urlsParsed = getUrlsFromWebsite(rawHtml, currentUrl);

            for (String websiteUrl : urlsParsed) {
                if (!discoveredWebsites.contains(websiteUrl)) {
                    queue.add(websiteUrl);
                }
            }

            this.websitesProcessed++;

            if (this.websitesProcessed == 10000) {
                break;
            }
        }
    }

    private List<String> getUrlsFromWebsite(String rawHtml, String baseUrl) {
        List<String> urls = new LinkedList<>();
        // Passing the base URL lets abs:href resolve relative links to absolute URLs
        Document doc = Jsoup.parse(rawHtml, baseUrl);
        Elements elements = doc.select("a[href]");

        for (Element element : elements) {
            String link = element.attr("abs:href");
            if (!link.isEmpty()) {
                urls.add(link);
            }
        }

        return urls;
    }

    private String readUrl(String webLink) {
        // Read the raw HTML; try-with-resources closes the stream even if reading fails
        StringBuilder rawHtml = new StringBuilder();
        try {
            URL url = new URL(webLink);
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()))) {
                String inputLine;
                while ((inputLine = reader.readLine()) != null) {
                    rawHtml.append(inputLine);
                }
            }
        } catch (Exception e) {
            System.out.println("Error reading url: " + webLink);
        }

        return rawHtml.toString();
    }

}

This file was deleted.

9 changes: 0 additions & 9 deletions src/main/java/com/gatomalvado/todo/webcrawler/Main.java

This file was deleted.