문제
JobFieldMap 개인 프로젝트를 진행하던 중 크롤링에 시간이 오래 걸리는 문제가 발생하였다. 실제로 서비스화된다면 페이지 내용을 크롤링하는 데 1분이 넘게 걸려 사용자가 불편을 겪을 수 있다.
기존 코드는 아래와 같다.
(변경 이전 크롤링 코드)
/**
 * Crawls the Wanted job listing sequentially with a single browser and persists the result.
 *
 * <p>For each of the first (at most) eight job cards on the listing page, the card's
 * company name, position, reward and link are read, the linked detail page is opened in
 * the same browser, and the job description (truncated to 2000 characters) and the work
 * location are collected. Every collected posting is then saved through
 * {@code companyRepository}.
 *
 * <p>A {@code StaleElementReferenceException} on a card is retried up to three times,
 * reloading the listing each time so the retry works on a fresh element. If the retries
 * are exhausted, or an element cannot be found, crawling stops and whatever was collected
 * so far is still saved. Any other exception propagates to the caller; the browser is
 * closed in every case.
 *
 * @return the crawled job postings, in listing order
 */
public List<CompanyDTO> crollingAndSave() {
    final String listUrl = "https://www.wanted.co.kr/wdlist/518?country=kr&job_sort=job.recommend_order&years=-1&locations=all";
    final By cardSelector = By.cssSelector("li.Card_Card__WdaEk > div");
    final By detailSelector = By.cssSelector("article.JobDescription_JobDescription__dq8G5 > div > div");
    final int maxCards = 8;
    final int maxAttempts = 3;

    Path path = Paths.get(System.getProperty("user.dir"), "src/main/resources/chromedriver.exe");
    System.setProperty("webdriver.chrome.driver", path.toString());
    ChromeOptions options = new ChromeOptions();
    options.addArguments("--start-maximized");
    options.addArguments("--disable-popup-blocking");
    options.addArguments("--disable-default-apps");

    List<CompanyDTO> jobCards = new ArrayList<>();
    ChromeDriver driver = new ChromeDriver(options);
    try {
        WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(20));
        cards:
        for (int i = 0; i < maxCards; i++) {
            int attempt = 0;
            while (true) {
                try {
                    // Reload the listing on every attempt: the previous iteration left the
                    // browser on a detail page, and a retry needs a fresh (non-stale) card.
                    driver.get(listUrl);
                    List<WebElement> elements = wait.until(
                            ExpectedConditions.visibilityOfAllElementsLocatedBy(cardSelector));
                    if (i >= elements.size()) {
                        // Fewer cards than maxCards were rendered; nothing left to crawl.
                        break cards;
                    }
                    WebElement jobCardElement = elements.get(i);

                    WebElement outside = jobCardElement.findElement(By.cssSelector("a"));
                    String companyname = outside.getAttribute("data-company-name");
                    String recruitPosition = outside.getAttribute("data-position-name");
                    String source = outside.getAttribute("href");
                    String reward = jobCardElement.findElement(By.cssSelector("a > div > div > span")).getText();

                    driver.get(source);
                    wait.until(ExpectedConditions.visibilityOfElementLocated(detailSelector));
                    StringBuilder workDetailBuilder = new StringBuilder();
                    for (WebElement insideDivElement : driver.findElements(detailSelector)) {
                        workDetailBuilder.append(
                                insideDivElement.findElement(By.cssSelector("div > p > span")).getText());
                    }
                    // Cap the description at 2000 characters.
                    String workDetail = workDetailBuilder.length() > 2000
                            ? workDetailBuilder.substring(0, 2000)
                            : workDetailBuilder.toString();
                    String locationName = driver.findElement(By.cssSelector("article.JobWorkPlace_JobWorkPlace__Q6Gml > div > div.JobWorkPlace_JobWorkPlace__map__location____MvP > span")).getText();

                    CompanyDTO dto = new CompanyDTO();
                    dto.setCompanyname(companyname);
                    dto.setRecruitPosition(recruitPosition);
                    dto.setReward(reward);
                    dto.setSource(source);
                    dto.setWorkDetail(workDetail);
                    dto.setLocationName(locationName);
                    jobCards.add(dto);
                    // Card done; no need to navigate back because the next iteration
                    // reloads the listing anyway.
                    break;
                } catch (StaleElementReferenceException e) {
                    attempt++;
                    if (attempt >= maxAttempts) {
                        throw e; // retries exhausted; handled by the outer catch
                    }
                }
            }
        }
    } catch (StaleElementReferenceException | NoSuchElementException e) {
        // Stop crawling but keep (and save) everything collected so far.
        System.err.println("Crawling stopped early after " + jobCards.size() + " cards: " + e);
    } finally {
        // Always end the browser session, even when an unexpected exception propagates.
        driver.quit();
    }

    // Persist to the database.
    for (CompanyDTO dto : jobCards) {
        CompanyEntity entity = convertDtoToEntity(dto);
        companyRepository.save(entity);
    }
    return jobCards;
}
/**
 * Builds a new {@code CompanyEntity} carrying the same field values as the given DTO.
 *
 * @param dto the crawled job posting to copy from
 * @return a freshly created entity with all six fields copied over
 */
private CompanyEntity convertDtoToEntity(CompanyDTO dto) {
    final CompanyEntity converted = new CompanyEntity();

    // Values read from the listing card.
    converted.setSource(dto.getSource());
    converted.setReward(dto.getReward());
    converted.setRecruitPosition(dto.getRecruitPosition());
    converted.setCompanyname(dto.getCompanyname());

    // Values read from the job detail page.
    converted.setLocationName(dto.getLocationName());
    converted.setWorkDetail(dto.getWorkDetail());

    return converted;
}
위 코드내용을 정리해서 말하자면,
원티드 웹페이지에서 여러 회사의 개발 직군 채용 공고가 나열된 목록 페이지를 driver.get()으로 받고,
각 채용 공고의 상세 페이지를 다시 driver.get()으로 받아서 세부 내용을 가져온다.
시간이 너무 많이 걸리는 문제를 해결하기 위해 서칭을 하다가
크롤링 시간 단축 블로그 글을 보게되었고,
크롤링 시 각 회사의 채용 공고 상세 페이지에 동시에 접속하도록 병렬 형태로 로직을 짜야겠다고 생각하였다.
병렬 처리를 통해 이 코드의 성능을 향상시키려면 각 jobCardElement에 대한 크롤링 작업을 별도의 스레드에서 수행하도록 해야 했고, Java에서 병렬 처리를 위해 CompletableFuture와 ExecutorService를 활용하였다.
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
/**
 * Crawls the Wanted job listing, fetching the job detail pages in parallel, and persists
 * the result.
 *
 * <p>The listing page is loaded once in a single browser to read each card's company
 * name, position, reward and detail link. Each detail page is then crawled on a worker
 * thread (pool of 8) using its own browser instance, because a WebDriver session must not
 * be shared between threads. A card or detail page that fails is logged and skipped; the
 * remaining postings are still returned and saved through {@code companyRepository}.
 *
 * @return the successfully crawled job postings, in listing order
 */
public List<CompanyDTO> crollingAndSave() {
    Path path = Paths.get(System.getProperty("user.dir"), "src/main/resources/chromedriver.exe");
    System.setProperty("webdriver.chrome.driver", path.toString());
    ChromeOptions options = new ChromeOptions();
    options.addArguments("--start-maximized");
    options.addArguments("--disable-popup-blocking");
    options.addArguments("--disable-default-apps");

    // Step 1: read the listing page once. Only plain strings are carried over to the
    // workers, so no WebElement crosses a thread or browser boundary.
    List<CompanyDTO> listedCards = new ArrayList<>();
    WebDriver driver = new ChromeDriver(options);
    try {
        WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(20));
        driver.get("https://www.wanted.co.kr/wdlist/518?country=kr&job_sort=job.recommend_order&years=-1&locations=all");
        List<WebElement> elements = wait.until(
                ExpectedConditions.visibilityOfAllElementsLocatedBy(By.cssSelector("li.Card_Card__WdaEk > div"))
        );
        for (WebElement jobCardElement : elements) {
            try {
                WebElement outside = jobCardElement.findElement(By.cssSelector("a"));
                CompanyDTO card = new CompanyDTO();
                card.setCompanyname(outside.getAttribute("data-company-name"));
                card.setRecruitPosition(outside.getAttribute("data-position-name"));
                card.setSource(outside.getAttribute("href"));
                card.setReward(jobCardElement.findElement(By.cssSelector("a > div > div > span")).getText());
                listedCards.add(card);
            } catch (RuntimeException e) {
                // A malformed card must not abort the whole crawl; skip just this one.
                e.printStackTrace();
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // The listing browser is no longer needed once the links are extracted.
        driver.quit();
    }

    // Step 2: crawl the detail pages in parallel, one browser per task.
    List<CompanyDTO> jobCards = new ArrayList<>();
    ExecutorService executorService = Executors.newFixedThreadPool(8);
    try {
        List<CompletableFuture<CompanyDTO>> futures = new ArrayList<>();
        for (CompanyDTO card : listedCards) {
            futures.add(CompletableFuture.supplyAsync(() -> crawlJobDetail(options, card), executorService));
        }
        // Joining in submission order keeps the result in listing order without locking.
        for (CompletableFuture<CompanyDTO> future : futures) {
            CompanyDTO dto = future.join();
            if (dto != null) {
                jobCards.add(dto);
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        executorService.shutdown();
        try {
            if (!executorService.awaitTermination(60, TimeUnit.SECONDS)) {
                executorService.shutdownNow();
            }
        } catch (InterruptedException ex) {
            executorService.shutdownNow();
            // Preserve the interrupt so callers further up can react to it.
            Thread.currentThread().interrupt();
        }
    }

    // Persist to the database.
    for (CompanyDTO dto : jobCards) {
        CompanyEntity entity = convertDtoToEntity(dto);
        companyRepository.save(entity);
    }
    return jobCards;
}

/**
 * Opens the detail page of one job posting in a dedicated browser and fills in the
 * description (truncated to 2000 characters) and the work location.
 *
 * @param options Chrome options used to start the worker's own browser
 * @param card    posting whose {@code source} link is already set; updated in place
 * @return the same {@code card} with detail fields filled in, or {@code null} if the
 *         detail page could not be crawled
 */
private CompanyDTO crawlJobDetail(ChromeOptions options, CompanyDTO card) {
    WebDriver localDriver = null;
    try {
        localDriver = new ChromeDriver(options);
        WebDriverWait localWait = new WebDriverWait(localDriver, Duration.ofSeconds(20));
        localDriver.get(card.getSource());
        localWait.until(ExpectedConditions.visibilityOfElementLocated(By.cssSelector("article.JobDescription_JobDescription__dq8G5 > div > div")));

        StringBuilder workDetailBuilder = new StringBuilder();
        List<WebElement> insideElements1 = localDriver.findElements(By.cssSelector("article.JobDescription_JobDescription__dq8G5 > div > div"));
        for (WebElement insideDivElement : insideElements1) {
            workDetailBuilder.append(insideDivElement.findElement(By.cssSelector("div > p > span")).getText());
        }
        String workDetail = workDetailBuilder.length() > 2000
                ? workDetailBuilder.substring(0, 2000)
                : workDetailBuilder.toString();

        WebElement locationInfo = localDriver.findElement(By.cssSelector("article.JobWorkPlace_JobWorkPlace__Q6Gml > div > div.JobWorkPlace_JobWorkPlace__map__location____MvP > span"));

        card.setWorkDetail(workDetail);
        card.setLocationName(locationInfo.getText());
        return card;
    } catch (Exception e) {
        // One failing posting is reported and skipped so the others still complete.
        e.printStackTrace();
        return null;
    } finally {
        // WebDriver is not AutoCloseable, and quit() (not close()) is what ends the
        // session and the browser process, so release it explicitly.
        if (localDriver != null) {
            localDriver.quit();
        }
    }
}
'spring' 카테고리의 다른 글
[인프런 워밍업 클럽 1기/BE] 3번째 발자국 (0) | 2024.05.19 |
---|---|
[인프런 워밍업 클럽 7차 과제-5/16] 백엔드 (0) | 2024.05.16 |
[인프런 워밍업 클럽 6차 과제-5/11] 백엔드 (0) | 2024.05.11 |
[인프런 워밍업 클럽 4차 과제-5/7] 백엔드 (1) | 2024.05.07 |
[인프런 워밍업 클럽 3차 과제-5/3] (0) | 2024.05.03 |