Skip to content

Commit

Permalink
Merge pull request #1 from yennanliu/Scraping-dev-001-code-refine
Browse files Browse the repository at this point in the history
Scraping-dev-001-code-refine
  • Loading branch information
yennanliu authored Jul 9, 2024
2 parents b81528b + ab04e26 commit 235f335
Show file tree
Hide file tree
Showing 12 changed files with 863 additions and 628 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,40 +2,51 @@

import com.yen.scrpe.Task.PokemonCollectTask;
import com.yen.scrpe.Task.ScrapeTaskFactory;
import com.yen.scrpe.Task.ScrapeTaskFactory2;
import com.yen.scrpe.service.ScrapeService;
import com.yen.scrpe.service.ScrapeServiceMultiThread;
import com.yen.scrpe.service.ScrapeServiceMultiThreadV2Gpt;

import java.io.IOException;

/**
* // https://www.zenrows.com/blog/web-scraping-java#java-web-crawling
* // data source : https://scrapeme.live/shop/
*
* // https://www.zenrows.com/blog/web-scraping-java#java-web-crawling // data source :
* https://scrapeme.live/shop/
*/
public class ScrappingApplication {

public static void main(String[] args) throws IOException, InterruptedException {
public static void main(String[] args) throws IOException, InterruptedException {

Long start = System.currentTimeMillis();
Long start = System.currentTimeMillis();

// to limit the number to scrape to 5
int LIMIT = 3; // 50;
// upper bound on the number of scrape iterations performed by the task
int LIMIT = 10; // 50;

// ScrapeService scrapeService = new ScrapeService();
// PokemonCollectTask pokemonCollectTask = new PokemonCollectTask(scrapeService);
// pokemonCollectTask.run(LIMIT);
/** V1 : single thread (original code ) */
// ScrapeService scrapeService = new ScrapeService();
// PokemonCollectTask pokemonCollectTask = new PokemonCollectTask(scrapeService);
// pokemonCollectTask.run(LIMIT);
//
// ScrapeTaskFactory scrapeTaskFactory = new ScrapeTaskFactory(scrapeService, pokemonCollectTask, LIMIT);
// scrapeTaskFactory.run();

// ScrapeTaskFactory scrapeTaskFactory = new ScrapeTaskFactory(scrapeService, PokemonCollectTask, LIMIT);
// scrapeTaskFactory.run();

ScrapeServiceMultiThread scrapeServiceMultiThread = new ScrapeServiceMultiThread();
scrapeServiceMultiThread.testRun();
/** V2 : multi thread (gpt) */
ScrapeServiceMultiThreadV2Gpt scrapeService = new ScrapeServiceMultiThreadV2Gpt();
PokemonCollectTask pokemonCollectTask = new PokemonCollectTask(scrapeService);
pokemonCollectTask.run(LIMIT);

// System.out.println("pokemonProducts.size() = " + pokemonCollectTask.getPokemonProducts().size());
// System.out.println("pokemonProducts = " + pokemonCollectTask.getPokemonProducts());
ScrapeTaskFactory2 scrapeTaskFactory = new ScrapeTaskFactory2(scrapeService, pokemonCollectTask, LIMIT);
scrapeTaskFactory.run();

Long end = System.currentTimeMillis();
System.out.println("-----> Total duration = " + ( end - start));
}
// ScrapeServiceMultiThread scrapeServiceMultiThread = new
// ScrapeServiceMultiThread();
// scrapeServiceMultiThread.testRun();

}
// System.out.println("pokemonProducts.size() = " +
// pokemonCollectTask.getPokemonProducts().size());
// System.out.println("pokemonProducts = " + pokemonCollectTask.getPokemonProducts());

Long end = System.currentTimeMillis();
System.out.println("-----> Total duration = " + (end - start));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@

public interface BaseScrapeTask {

public void run(int limit) throws IOException;
void run(int limit) throws IOException, InterruptedException;
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import com.yen.scrpe.model.PokemonProduct;
import com.yen.scrpe.service.BaseScrapeService;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
Expand All @@ -11,81 +10,81 @@

public class PokemonCollectTask implements BaseScrapeTask {

// attr
//int limit;
// attr
// int limit;

private BaseScrapeService scrapeService;
// initializing the list of Java object to store
// the scraped data
private List<PokemonProduct> pokemonProducts;
// initializing the set of web page urls
private Set<String> pagesDiscovered; // discovered while crawling the target website
// initializing the queue of urls to scrape
private List<String> pagesToScrape;
private BaseScrapeService scrapeService;
// initializing the list of Java object to store
// the scraped data
private List<PokemonProduct> pokemonProducts;
// initializing the set of web page urls
private Set<String> pagesDiscovered; // discovered while crawling the target website
// initializing the queue of urls to scrape
private List<String> pagesToScrape;

// constructor
// PokemonCollectTaskV1.run(scrapeService, pagesToScrape, pokemonProducts, pagesDiscovered, LIMIT);
public PokemonCollectTask(){
// constructor
// PokemonCollectTaskV1.run(scrapeService, pagesToScrape, pokemonProducts, pagesDiscovered,
// LIMIT);
public PokemonCollectTask() {}

}
public PokemonCollectTask(BaseScrapeService scrapeService) {

public PokemonCollectTask(BaseScrapeService scrapeService){
this.scrapeService = scrapeService;
this.pokemonProducts = new ArrayList<>();
this.pagesDiscovered = new HashSet<>();
this.pagesToScrape = new ArrayList<>();

this.scrapeService = scrapeService;
this.pokemonProducts = new ArrayList<>();
this.pagesDiscovered = new HashSet<>();
this.pagesToScrape = new ArrayList<>();
// initializing the scraping queue with the
this.pagesToScrape.add("https://scrapeme.live/shop/page/1/");
}
// initializing the scraping queue
this.pagesToScrape.add("https://scrapeme.live/shop/page/1/");
this.pagesToScrape.add("https://scrapeme.live/shop/page/2/");
}

// getter, setter
public BaseScrapeService getScrapeService() {
return scrapeService;
}
// getter, setter
public BaseScrapeService getScrapeService() {
return scrapeService;
}

public void setScrapeService(BaseScrapeService scrapeService) {
this.scrapeService = scrapeService;
}
public void setScrapeService(BaseScrapeService scrapeService) {
this.scrapeService = scrapeService;
}

public List<PokemonProduct> getPokemonProducts() {
return pokemonProducts;
}
public List<PokemonProduct> getPokemonProducts() {
return pokemonProducts;
}

public void setPokemonProducts(List<PokemonProduct> pokemonProducts) {
this.pokemonProducts = pokemonProducts;
}
public void setPokemonProducts(List<PokemonProduct> pokemonProducts) {
this.pokemonProducts = pokemonProducts;
}

public Set<String> getPagesDiscovered() {
return pagesDiscovered;
}
public Set<String> getPagesDiscovered() {
return pagesDiscovered;
}

public void setPagesDiscovered(Set<String> pagesDiscovered) {
this.pagesDiscovered = pagesDiscovered;
}
public void setPagesDiscovered(Set<String> pagesDiscovered) {
this.pagesDiscovered = pagesDiscovered;
}

public List<String> getPagesToScrape() {
return pagesToScrape;
}
public List<String> getPagesToScrape() {
return pagesToScrape;
}

public void setPagesToScrape(List<String> pagesToScrape) {
this.pagesToScrape = pagesToScrape;
}
public void setPagesToScrape(List<String> pagesToScrape) {
this.pagesToScrape = pagesToScrape;
}

// method
public void run(int limit) throws IOException {
// method
public void run(int limit) throws IOException, InterruptedException {

int i = 0;
int i = 0;

while (!this.pagesToScrape.isEmpty() && i < limit) {
System.out.println(">>> i = " + i);
while (!this.pagesToScrape.isEmpty() && i < limit) {
System.out.println(">>> i = " + i);

/** help func*/
this.scrapeService.scrapeProductPage(pokemonProducts, pagesDiscovered, pagesToScrape, i);
/** help func */
this.scrapeService.scrapeProductPage(pokemonProducts, pagesDiscovered, pagesToScrape, i);

// incrementing the iteration number
i++;
}
// incrementing the iteration number
i++;
}

}
}
Original file line number Diff line number Diff line change
@@ -1,49 +1,52 @@
package com.yen.scrpe.Task;

import com.yen.scrpe.service.BaseScrapeService;

import java.io.IOException;

/** Factory for constructing scraping job
/**
* Factory for constructing scraping job
*
* Design pattern : Factory
* <p>Design pattern : Factory
*
* - https://www.runoob.com/design-pattern/factory-pattern.html
* <p>- https://www.runoob.com/design-pattern/factory-pattern.html
*/
public class ScrapeTaskFactory {

// attr
private BaseScrapeService scrapeService;
// attr
private BaseScrapeService scrapeService;

private BaseScrapeTask scrapeTask;
private BaseScrapeTask scrapeTask;

private String jobName;
private String jobName;

private int limit;
private int limit;

// constructor
public ScrapeTaskFactory(){
// constructor
public ScrapeTaskFactory() {}

}
public ScrapeTaskFactory(
BaseScrapeService scrapeService, BaseScrapeTask scrapeTask, Integer limit) {

public ScrapeTaskFactory(BaseScrapeService scrapeService, BaseScrapeTask scrapeTask, Integer limit){
this.scrapeService = scrapeService;
this.scrapeTask = scrapeTask;
this.limit = limit;
}

this.scrapeService = scrapeService;
this.scrapeTask = scrapeTask;
this.limit = limit;
}
// method
public void run() throws IOException, InterruptedException {

// method
public void run() throws IOException {
// this.jobName = "PokemonCollectTask";
// System.out.println("this.jobName = " + this.jobName);

switch (this.jobName){
case "PokemonCollectTask":
// pokemonCollectTask.run(LIMIT);
this.scrapeTask.run(this.limit);
default:
throw new RuntimeException("Not a valid task name :" + this.scrapeTask.toString());
this.scrapeTask.run(this.limit);

}
}
// switch (this.jobName) {
// case "PokemonCollectTask":
// // pokemonCollectTask.run(LIMIT);
// this.scrapeTask.run(this.limit);
// default:
// throw new RuntimeException("Not a valid task name :" + this.scrapeTask.toString());
// }
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package com.yen.scrpe.Task;

import com.yen.scrpe.Task.PokemonCollectTask;
import com.yen.scrpe.model.PokemonProduct;
import com.yen.scrpe.service.ScrapeServiceMultiThreadV2Gpt;

import java.io.IOException;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

/**
 * Runs a {@link PokemonCollectTask} that is backed by a {@link ScrapeServiceMultiThreadV2Gpt} and
 * prints the products the task collected.
 *
 * <p>The task owns its own product list and page queue (see its getters), so this class does not
 * keep separate collections; it only drives the task and reports the task's results.
 */
public class ScrapeTaskFactory2 {

  private final ScrapeServiceMultiThreadV2Gpt scrapeService;
  private final PokemonCollectTask pokemonCollectTask;
  private final int limit;

  /**
   * @param scrapeService service whose {@code shutdown()} is invoked once the run finishes
   * @param pokemonCollectTask task to execute
   * @param limit value forwarded to {@link PokemonCollectTask#run(int)}
   */
  public ScrapeTaskFactory2(
      ScrapeServiceMultiThreadV2Gpt scrapeService,
      PokemonCollectTask pokemonCollectTask,
      int limit) {
    this.scrapeService = scrapeService;
    this.pokemonCollectTask = pokemonCollectTask;
    this.limit = limit;
  }

  /**
   * Executes the task, prints every collected product, and shuts the scrape service down.
   *
   * @throws IOException if the task fails with an I/O error
   * @throws InterruptedException if the task is interrupted while running
   */
  public void run() throws IOException, InterruptedException {
    try {
      pokemonCollectTask.run(limit);

      System.out.println("Scraping completed.");
      System.out.println("Collected Pokemon Products:");
      // Report what the task actually gathered; a list local to this method would never be
      // filled because the task writes into its own collection.
      for (PokemonProduct product : pokemonCollectTask.getPokemonProducts()) {
        System.out.println(product);
      }
    } finally {
      // Shut the service down even when the task throws.
      scrapeService.shutdown();
    }
  }
}
Loading

0 comments on commit 235f335

Please sign in to comment.