diff --git a/helpmehelpapplication/src/main/java/ntnu/systemutvikling/team6/scraper/URLCharityScraper.java b/helpmehelpapplication/src/main/java/ntnu/systemutvikling/team6/scraper/URLCharityScraper.java index 1aab6ae..7c5f88a 100644 --- a/helpmehelpapplication/src/main/java/ntnu/systemutvikling/team6/scraper/URLCharityScraper.java +++ b/helpmehelpapplication/src/main/java/ntnu/systemutvikling/team6/scraper/URLCharityScraper.java @@ -12,6 +12,10 @@ import org.openqa.selenium.support.ui.ExpectedConditions; import org.openqa.selenium.support.ui.WebDriverWait; +/** + * Class for scraping the description, URL of the logo, string of categories, and key values of the charities + * registered in IK. + */ public class URLCharityScraper { private final String url; private final WebDriver driver; @@ -20,171 +24,256 @@ public class URLCharityScraper { private final List categories; private final List keyValues; - - // Used for production - public URLCharityScraper(String url) { - this.categories = new ArrayList<>(); - this.keyValues = new ArrayList<>(); - ChromeOptions options = new ChromeOptions(); - options.addArguments("--headless=new"); - options.addArguments("--window-size=1920,1080"); - options.addArguments("--disable-gpu"); - options.addArguments("--no-sandbox"); - options.addArguments("--disable-dev-shm-usage"); - this.url = url; - this.driver = new ChromeDriver(options); + /** + * Constructor used for production code. + * + *

It initializes the lists used for categories and key values, and defines the options used + * for the headless Selenium Chrome browser that does the scraping.

+ * + * @param url the URL for the charity's webpage on IK + */ + public URLCharityScraper(String url) { + this.categories = new ArrayList<>(); + this.keyValues = new ArrayList<>(); + + ChromeOptions options = new ChromeOptions(); + options.addArguments("--headless=new"); + options.addArguments("--window-size=1920,1080"); + options.addArguments("--disable-gpu"); + options.addArguments("--no-sandbox"); + options.addArguments("--disable-dev-shm-usage"); + + this.url = url; + this.driver = new ChromeDriver(options); } - // Used for testing + /** + * Constructor used for testing. + * + *

It accepts both a URL (used only as a placeholder here) and a {@link WebDriver} as parameters. The WebDriver is + * passed in so that tests can supply their own driver.

+ * + * @param url the URL for the charity's webpage on IK (for this constructor it should not be a real URL) + * @param driver the {@code WebDriver} object used for scraping + */ public URLCharityScraper(String url, WebDriver driver) { - this.categories = new ArrayList<>(); - this.keyValues = new ArrayList<>(); - this.url = url; - this.driver = driver; + this.categories = new ArrayList<>(); + this.keyValues = new ArrayList<>(); + this.url = url; + this.driver = driver; } - private void quitDriver() { - if (driver instanceof ChromeDriver) { - driver.quit(); - } - } - - private void updateDescription() { - WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30)); - StringBuilder descriptionString = new StringBuilder(); - - driver.get(this.url); - - wait.until( - ExpectedConditions.numberOfElementsToBeMoreThan(By.cssSelector(".information div"), 0)); - - List firstDescription = driver.findElements(By.cssSelector(".information div p")); - - for (WebElement element : firstDescription) { - if (element.getText().isBlank()) { - continue; - } - descriptionString.append(element.getText()).append("\n\n"); - } - // Check for if description is long and contains a "read more" link - List doesReadMoreExist = driver.findElements(By.cssSelector("a.read-more")); - - if (!doesReadMoreExist.isEmpty()) { - WebElement descReadMore = driver.findElement(By.cssSelector("a.read-more")); - descReadMore.click(); - wait.until(ExpectedConditions.visibilityOfElementLocated(By.cssSelector(".extra-info"))); - } + /** + * Creates a {@link WebDriverWait} object for halting scraping until the correct pre-conditions are met. 
+ * + * @return the {@code WebDriverWait} object to be used in the methods + */ + protected WebDriverWait createWait() { + return new WebDriverWait(driver, Duration.ofSeconds(30)); + } - List extraDescription = driver.findElements(By.cssSelector(".extra-info p")); + /** + * Calls the {@code findElements} method from the {@code WebDriver} object and returns a list of the returned + * {@link WebElement} objects. + * + * @param by a selector for {@code WebElement} objects + * @return a list of found {@code WebElement} objects matching the given selector + */ + protected List findElements(By by) { + return driver.findElements(by); + } - for (WebElement element : extraDescription) { - if (element.getText().isBlank()) { - continue; - } - descriptionString.append(element.getText()).append("\n\n"); - } + /** + * Calls the {@code findElement} method from the {@code WebDriver} object and returns the first matching + * {@code WebElement} object. + * + * @param by a selector for {@code WebElement} objects + * @return the first {@code WebElement} object matching the given selector + */ + protected WebElement findElement(By by) { + return driver.findElement(by); + } + /** + * Quits the driver instance, making it unusable. + */ + protected void closeDriver() { + driver.quit(); + } - this.description = descriptionString.toString(); - } + /** + * Scrapes the URL for the paragraphs containing the description of the charity. 
+ */ + protected void updateDescription() { + WebDriverWait wait = createWait(); + StringBuilder descriptionString = new StringBuilder(); - private void updateLogo() { - WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30)); + wait.until(ExpectedConditions.numberOfElementsToBeMoreThan( + By.cssSelector(".information div"), 0)); - driver.get(this.url); + List firstDescription = + findElements(By.cssSelector(".information div p")); - wait.until(ExpectedConditions.visibilityOfElementLocated(By.cssSelector(".logo > img"))); + for (WebElement element : firstDescription) { + if (!element.getText().isBlank()) { + descriptionString.append(element.getText()).append("\n\n"); + } + } - WebElement logo = driver.findElement(By.cssSelector(".logo > img")); + List readMoreLinks = + findElements(By.cssSelector("a.read-more")); - this.logoURL = logo.getAttribute("src"); - } + if (!readMoreLinks.isEmpty()) { + WebElement readMore = findElement(By.cssSelector("a.read-more")); + readMore.click(); - private void updateCategories() { - WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30)); + wait.until(ExpectedConditions.visibilityOfElementLocated( + By.cssSelector(".extra-info"))); + } - driver.get(this.url); + List extraDescription = + findElements(By.cssSelector(".extra-info p")); - wait.until(ExpectedConditions.visibilityOfElementLocated(By.cssSelector(".tag-label"))); + for (WebElement element : extraDescription) { + if (!element.getText().isBlank()) { + descriptionString.append(element.getText()).append("\n\n"); + } + } - List categories = driver.findElements(By.cssSelector(".tag-label")); + this.description = descriptionString.toString(); + } - for (WebElement element : categories) { - this.categories.add(element.getText()); - } - } + /** + * Scrapes the URL for the image URL of the logo for the charity. 
+ */ + void updateLogo() { + WebDriverWait wait = createWait(); - private void updateKeyNumbers() { - WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30)); - String percentage; - WebElement element; + wait.until(ExpectedConditions.visibilityOfElementLocated( + By.cssSelector(".logo > img"))); - driver.get(this.url); + WebElement logo = findElement(By.cssSelector(".logo > img")); + this.logoURL = logo.getAttribute("src"); + } - wait.until( - ExpectedConditions.visibilityOfElementLocated( - By.xpath( - "//li[.//h2[normalize-space()='Innsamlingsprosent']]//div[@class='graph']"))); + /** + * Scrapes the URL for the category labels containing the categories for the charity. + */ + void updateCategories() { + WebDriverWait wait = createWait(); - element = - driver.findElement( - By.xpath("//li[.//h2[normalize-space()='Innsamlingsprosent']]//div[@class='graph']")); + wait.until(ExpectedConditions.visibilityOfElementLocated( + By.cssSelector(".tag-label"))); - percentage = element.getAttribute("data-percentage"); + List elements = + findElements(By.cssSelector(".tag-label")); - this.keyValues.add(percentage); + for (WebElement element : elements) { + this.categories.add(element.getText()); + } + } - wait.until( - ExpectedConditions.visibilityOfElementLocated( - By.xpath( - "//li[.//h2[normalize-space()='Administrasjonsprosent']]//div[@class='graph']"))); + /** + * Scrapes the URL for the statistics of the charity; the percentage collected, the percentage that goes to the + * administration, and the percentage that is put towards the cause. 
+ */ + void updateKeyValues() { + WebDriverWait wait = createWait(); - element = - driver.findElement( - By.xpath( - "//li[.//h2[normalize-space()='Administrasjonsprosent']]//div[@class='graph']")); + String percentage; + WebElement element; - percentage = element.getAttribute("data-percentage"); + wait.until(ExpectedConditions.visibilityOfElementLocated( + By.xpath("//li[.//h2[normalize-space()='Innsamlingsprosent']]//div[@class='graph']"))); - this.keyValues.add(percentage); + element = findElement(By.xpath( + "//li[.//h2[normalize-space()='Innsamlingsprosent']]//div[@class='graph']")); + percentage = element.getAttribute("data-percentage"); + this.keyValues.add(percentage); - wait.until( - ExpectedConditions.visibilityOfElementLocated( - By.xpath("//li[.//h2[normalize-space()='Formålsprosent']]//div[@class='graph']"))); + wait.until(ExpectedConditions.visibilityOfElementLocated( + By.xpath("//li[.//h2[normalize-space()='Administrasjonsprosent']]//div[@class='graph']"))); - element = - driver.findElement( - By.xpath("//li[.//h2[normalize-space()='Formålsprosent']]//div[@class='graph']")); + element = findElement(By.xpath( + "//li[.//h2[normalize-space()='Administrasjonsprosent']]//div[@class='graph']")); + percentage = element.getAttribute("data-percentage"); + this.keyValues.add(percentage); - percentage = element.getAttribute("data-percentage"); + wait.until(ExpectedConditions.visibilityOfElementLocated( + By.xpath("//li[.//h2[normalize-space()='Formålsprosent']]//div[@class='graph']"))); - this.keyValues.add(percentage); - } + element = findElement(By.xpath( + "//li[.//h2[normalize-space()='Formålsprosent']]//div[@class='graph']")); + percentage = element.getAttribute("data-percentage"); + this.keyValues.add(percentage); + } - public void scrapeCharityPage() { - try { - this.updateDescription(); - this.updateLogo(); - this.updateCategories(); - this.updateKeyNumbers(); - } finally { - this.quitDriver(); - } - } + /** + * Runs all the scraper methods at once, 
updating the object's fields. + */ + public void scrapeCharityPage() { + try { + driver.get(this.url); + + updateDescription(); + updateLogo(); + updateCategories(); + updateKeyValues(); + + } finally { + closeDriver(); + } + } - public String getDescription() { - return this.description; - } + /** + * Returns the description of the charity. + * + * @return a String containing the description of the charity. + */ + public String getDescription() { + return description; + } - public String getLogoURL() { - return this.logoURL; - } + /** + * Returns the URL of the logo for the charity. + * + * @return a String containing the URL for the logo of the charity. + */ + public String getLogoURL() { + return logoURL; + } - public List getCategories() { - return Collections.unmodifiableList(this.categories); - } + /** + * Returns a String of the categories for the charity with ',' as a delimiter. + * + * @return a comma-separated String of the categories for the charity + */ + public String getCategories() { + StringBuilder categoriesString = new StringBuilder(); + + for (int i = 0; i < this.categories.size(); i++) { + categoriesString.append(this.categories.get(i)); + if (i < this.categories.size() - 1) { + categoriesString.append(","); + } + } + return categoriesString.toString(); + } - public List getKeyValues() { - return Collections.unmodifiableList(this.keyValues); - } -} + /** + * Returns a String of the charity's key value percentages, as verified by IK, with ':' as a delimiter. + * + * @return a colon-separated String of the key values for the charity + */ + public String getKeyValues() { + StringBuilder keyValuesString = new StringBuilder(); + + for (int i = 0; i < this.keyValues.size(); i++) { + keyValuesString.append(this.keyValues.get(i)); + if (i < this.keyValues.size() - 1) { + keyValuesString.append(":"); + } + } + return keyValuesString.toString(); + } +} \ No newline at end of file