diff --git a/helpmehelpapplication/src/main/java/ntnu/systemutvikling/team6/scraper/URLCharityScraper.java b/helpmehelpapplication/src/main/java/ntnu/systemutvikling/team6/scraper/URLCharityScraper.java index 53aa2dc..1aab6ae 100644 --- a/helpmehelpapplication/src/main/java/ntnu/systemutvikling/team6/scraper/URLCharityScraper.java +++ b/helpmehelpapplication/src/main/java/ntnu/systemutvikling/team6/scraper/URLCharityScraper.java @@ -2,6 +2,7 @@ import java.time.Duration; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import org.openqa.selenium.By; import org.openqa.selenium.WebDriver; @@ -12,141 +13,178 @@ import org.openqa.selenium.support.ui.WebDriverWait; public class URLCharityScraper { - ChromeOptions options; - String url; + private final String url; + private final WebDriver driver; + private String description; + private String logoURL; + private final List categories; + private final List keyValues; + + // Used for production public URLCharityScraper(String url) { - this.options = new ChromeOptions(); + this.categories = new ArrayList<>(); + this.keyValues = new ArrayList<>(); + ChromeOptions options = new ChromeOptions(); options.addArguments("--headless=new"); options.addArguments("--window-size=1920,1080"); options.addArguments("--disable-gpu"); options.addArguments("--no-sandbox"); options.addArguments("--disable-dev-shm-usage"); this.url = url; + this.driver = new ChromeDriver(options); + } + + // Used for testing + public URLCharityScraper(String url, WebDriver driver) { + this.categories = new ArrayList<>(); + this.keyValues = new ArrayList<>(); + this.url = url; + this.driver = driver; + } + + private void quitDriver() { + if (driver instanceof ChromeDriver) { + driver.quit(); + } } - public String updateDescription() { - WebDriver driver = new ChromeDriver(options); - WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30)); - StringBuilder descriptionString = new StringBuilder(); + private void updateDescription() { + WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30)); + StringBuilder descriptionString = new StringBuilder(); - try { driver.get(this.url); wait.until( - ExpectedConditions.numberOfElementsToBeMoreThan(By.cssSelector(".information div"), 0)); + ExpectedConditions.numberOfElementsToBeMoreThan(By.cssSelector(".information div"), 0)); + + List firstDescription = driver.findElements(By.cssSelector(".information div p")); - // Check for if description is long and contains a "read more" link - List doesReadMoreExist = driver.findElements(By.cssSelector("a.read-more")); + for (WebElement element : firstDescription) { + if (element.getText().isBlank()) { + continue; + } + descriptionString.append(element.getText()).append("\n\n"); + } + // Check for if description is long and contains a "read more" link + List doesReadMoreExist = driver.findElements(By.cssSelector("a.read-more")); - if (!doesReadMoreExist.isEmpty()) { - WebElement descReadMore = driver.findElement(By.cssSelector("a.read-more")); - descReadMore.click(); - wait.until(ExpectedConditions.visibilityOfElementLocated(By.cssSelector(".extra-info"))); - } + if (!doesReadMoreExist.isEmpty()) { + WebElement descReadMore = driver.findElement(By.cssSelector("a.read-more")); + descReadMore.click(); + wait.until(ExpectedConditions.visibilityOfElementLocated(By.cssSelector(".extra-info"))); + } - List description = driver.findElements(By.cssSelector(".information div")); + List extraDescription = driver.findElements(By.cssSelector(".extra-info p")); - for (WebElement element : description) { - descriptionString.append(element.getText()); + for (WebElement element : extraDescription) { + if (element.getText().isBlank()) { + continue; + } + descriptionString.append(element.getText()).append("\n\n"); } - } finally { - driver.quit(); - } - return descriptionString.toString(); + this.description = descriptionString.toString(); } - public String updateLogo() { - WebDriver driver = new ChromeDriver(options); - WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30)); - String logoURL; + private void updateLogo() { + WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30)); - try { - driver.get(this.url); + driver.get(this.url); - wait.until(ExpectedConditions.visibilityOfElementLocated(By.cssSelector(".logo > img"))); + wait.until(ExpectedConditions.visibilityOfElementLocated(By.cssSelector(".logo > img"))); - WebElement logo = driver.findElement(By.cssSelector(".logo > img")); + WebElement logo = driver.findElement(By.cssSelector(".logo > img")); - logoURL = logo.getAttribute("src"); - } finally { - driver.quit(); - } - return logoURL; + this.logoURL = logo.getAttribute("src"); } - public List updateCategories() { - WebDriver driver = new ChromeDriver(options); + private void updateCategories() { WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30)); - List categoriesList = new ArrayList<>(); - - try { - driver.get(this.url); - wait.until(ExpectedConditions.visibilityOfElementLocated(By.cssSelector(".tag-label"))); + driver.get(this.url); - List categories = driver.findElements(By.cssSelector(".tag-label")); + wait.until(ExpectedConditions.visibilityOfElementLocated(By.cssSelector(".tag-label"))); - for (WebElement element : categories) { - categoriesList.add(element.getText()); - } + List categories = driver.findElements(By.cssSelector(".tag-label")); - } finally { - driver.quit(); + for (WebElement element : categories) { + this.categories.add(element.getText()); } - - return categoriesList; } - public List updateKeyNumbers() { - WebDriver driver = new ChromeDriver(options); - WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30)); - String percentage; - WebElement element; - List keyNumbersList = new ArrayList<>(); + private void updateKeyNumbers() { + WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30)); + String percentage; + WebElement element; - try { - driver.get(this.url); + driver.get(this.url); - wait.until(ExpectedConditions.visibilityOfElementLocated(By.xpath( - "//li[.//h2[normalize-space()='Innsamlingsprosent']]//div[@class='graph']"))); + wait.until( + ExpectedConditions.visibilityOfElementLocated( + By.xpath( + "//li[.//h2[normalize-space()='Innsamlingsprosent']]//div[@class='graph']"))); - element = driver.findElement( - By.xpath("//li[.//h2[normalize-space()='Innsamlingsprosent']]//div[@class='graph']") - ); + element = + driver.findElement( + By.xpath("//li[.//h2[normalize-space()='Innsamlingsprosent']]//div[@class='graph']")); - percentage = element.getAttribute("data-percentage"); + percentage = element.getAttribute("data-percentage"); - keyNumbersList.add(percentage); + this.keyValues.add(percentage); - wait.until(ExpectedConditions.visibilityOfElementLocated(By.xpath( - "//li[.//h2[normalize-space()='Administrasjonsprosent']]//div[@class='graph']"))); + wait.until( + ExpectedConditions.visibilityOfElementLocated( + By.xpath( + "//li[.//h2[normalize-space()='Administrasjonsprosent']]//div[@class='graph']"))); - element = driver.findElement( - By.xpath("//li[.//h2[normalize-space()='Administrasjonsprosent']]//div[@class='graph']") - ); + element = + driver.findElement( + By.xpath( + "//li[.//h2[normalize-space()='Administrasjonsprosent']]//div[@class='graph']")); - percentage = element.getAttribute("data-percentage"); + percentage = element.getAttribute("data-percentage"); - keyNumbersList.add(percentage); + this.keyValues.add(percentage); - wait.until(ExpectedConditions.visibilityOfElementLocated(By.xpath( - "//li[.//h2[normalize-space()='Formålsprosent']]//div[@class='graph']"))); + wait.until( + ExpectedConditions.visibilityOfElementLocated( + By.xpath("//li[.//h2[normalize-space()='Formålsprosent']]//div[@class='graph']"))); - element = driver.findElement( - By.xpath("//li[.//h2[normalize-space()='Formålsprosent']]//div[@class='graph']") - ); + element = + driver.findElement( + By.xpath("//li[.//h2[normalize-space()='Formålsprosent']]//div[@class='graph']")); - percentage = element.getAttribute("data-percentage"); + percentage = element.getAttribute("data-percentage"); - keyNumbersList.add(percentage); + this.keyValues.add(percentage); + } + public void scrapeCharityPage() { + try { + this.updateDescription(); + this.updateLogo(); + this.updateCategories(); + this.updateKeyNumbers(); } finally { - driver.quit(); + this.quitDriver(); } + } + + public String getDescription() { + return this.description; + } + + public String getLogoURL() { + return this.logoURL; + } + + public List getCategories() { + return Collections.unmodifiableList(this.categories); + } - return keyNumbersList; + public List getKeyValues() { + return Collections.unmodifiableList(this.keyValues); } }