Skip to content

Commit

Permalink
Updated URLCharityScraper
Browse files Browse the repository at this point in the history
Made the method that fetches the description more robust; it now spaces paragraphs correctly. Made the scraping methods private and added a public method that runs them all. Restructured the class for easier testing with Mockito.
  • Loading branch information
roaraf committed Apr 7, 2026
1 parent e6efe6f commit 017e820
Showing 1 changed file with 120 additions and 82 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import java.time.Duration;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
Expand All @@ -12,141 +13,178 @@
import org.openqa.selenium.support.ui.WebDriverWait;

public class URLCharityScraper {
ChromeOptions options;
String url;
private final String url;
private final WebDriver driver;
private String description;
private String logoURL;
private final List<String> categories;
private final List<String> keyValues;


// Used for production
public URLCharityScraper(String url) {
this.options = new ChromeOptions();
this.categories = new ArrayList<>();
this.keyValues = new ArrayList<>();
ChromeOptions options = new ChromeOptions();
options.addArguments("--headless=new");
options.addArguments("--window-size=1920,1080");
options.addArguments("--disable-gpu");
options.addArguments("--no-sandbox");
options.addArguments("--disable-dev-shm-usage");
this.url = url;
this.driver = new ChromeDriver(options);
}

// Used for testing
public URLCharityScraper(String url, WebDriver driver) {
this.categories = new ArrayList<>();
this.keyValues = new ArrayList<>();
this.url = url;
this.driver = driver;
}

private void quitDriver() {
if (driver instanceof ChromeDriver) {
driver.quit();
}
}

public String updateDescription() {
WebDriver driver = new ChromeDriver(options);
WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30));
StringBuilder descriptionString = new StringBuilder();
private void updateDescription() {
WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30));
StringBuilder descriptionString = new StringBuilder();

try {
driver.get(this.url);

wait.until(
ExpectedConditions.numberOfElementsToBeMoreThan(By.cssSelector(".information div"), 0));
ExpectedConditions.numberOfElementsToBeMoreThan(By.cssSelector(".information div"), 0));

List<WebElement> firstDescription = driver.findElements(By.cssSelector(".information div p"));

// Check for if description is long and contains a "read more" link
List<WebElement> doesReadMoreExist = driver.findElements(By.cssSelector("a.read-more"));
for (WebElement element : firstDescription) {
if (element.getText().isBlank()) {
continue;
}
descriptionString.append(element.getText()).append("\n\n");
}
// Check for if description is long and contains a "read more" link
List<WebElement> doesReadMoreExist = driver.findElements(By.cssSelector("a.read-more"));

if (!doesReadMoreExist.isEmpty()) {
WebElement descReadMore = driver.findElement(By.cssSelector("a.read-more"));
descReadMore.click();
wait.until(ExpectedConditions.visibilityOfElementLocated(By.cssSelector(".extra-info")));
}
if (!doesReadMoreExist.isEmpty()) {
WebElement descReadMore = driver.findElement(By.cssSelector("a.read-more"));
descReadMore.click();
wait.until(ExpectedConditions.visibilityOfElementLocated(By.cssSelector(".extra-info")));
}

List<WebElement> description = driver.findElements(By.cssSelector(".information div"));
List<WebElement> extraDescription = driver.findElements(By.cssSelector(".extra-info p"));

for (WebElement element : description) {
descriptionString.append(element.getText());
for (WebElement element : extraDescription) {
if (element.getText().isBlank()) {
continue;
}
descriptionString.append(element.getText()).append("\n\n");
}

} finally {
driver.quit();
}

return descriptionString.toString();
this.description = descriptionString.toString();
}

public String updateLogo() {
WebDriver driver = new ChromeDriver(options);
WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30));
String logoURL;
private void updateLogo() {
WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30));

try {
driver.get(this.url);
driver.get(this.url);

wait.until(ExpectedConditions.visibilityOfElementLocated(By.cssSelector(".logo > img")));
wait.until(ExpectedConditions.visibilityOfElementLocated(By.cssSelector(".logo > img")));

WebElement logo = driver.findElement(By.cssSelector(".logo > img"));
WebElement logo = driver.findElement(By.cssSelector(".logo > img"));

logoURL = logo.getAttribute("src");
} finally {
driver.quit();
}
return logoURL;
this.logoURL = logo.getAttribute("src");
}

public List<String> updateCategories() {
WebDriver driver = new ChromeDriver(options);
private void updateCategories() {
WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30));
List<String> categoriesList = new ArrayList<>();

try {
driver.get(this.url);

wait.until(ExpectedConditions.visibilityOfElementLocated(By.cssSelector(".tag-label")));
driver.get(this.url);

List<WebElement> categories = driver.findElements(By.cssSelector(".tag-label"));
wait.until(ExpectedConditions.visibilityOfElementLocated(By.cssSelector(".tag-label")));

for (WebElement element : categories) {
categoriesList.add(element.getText());
}
List<WebElement> categories = driver.findElements(By.cssSelector(".tag-label"));

} finally {
driver.quit();
for (WebElement element : categories) {
this.categories.add(element.getText());
}

return categoriesList;
}

public List<String> updateKeyNumbers() {
WebDriver driver = new ChromeDriver(options);
WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30));
String percentage;
WebElement element;
List<String> keyNumbersList = new ArrayList<>();
private void updateKeyNumbers() {
WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30));
String percentage;
WebElement element;

try {
driver.get(this.url);
driver.get(this.url);

wait.until(ExpectedConditions.visibilityOfElementLocated(By.xpath(
"//li[.//h2[normalize-space()='Innsamlingsprosent']]//div[@class='graph']")));
wait.until(
ExpectedConditions.visibilityOfElementLocated(
By.xpath(
"//li[.//h2[normalize-space()='Innsamlingsprosent']]//div[@class='graph']")));

element = driver.findElement(
By.xpath("//li[.//h2[normalize-space()='Innsamlingsprosent']]//div[@class='graph']")
);
element =
driver.findElement(
By.xpath("//li[.//h2[normalize-space()='Innsamlingsprosent']]//div[@class='graph']"));

percentage = element.getAttribute("data-percentage");
percentage = element.getAttribute("data-percentage");

keyNumbersList.add(percentage);
this.keyValues.add(percentage);

wait.until(ExpectedConditions.visibilityOfElementLocated(By.xpath(
"//li[.//h2[normalize-space()='Administrasjonsprosent']]//div[@class='graph']")));
wait.until(
ExpectedConditions.visibilityOfElementLocated(
By.xpath(
"//li[.//h2[normalize-space()='Administrasjonsprosent']]//div[@class='graph']")));

element = driver.findElement(
By.xpath("//li[.//h2[normalize-space()='Administrasjonsprosent']]//div[@class='graph']")
);
element =
driver.findElement(
By.xpath(
"//li[.//h2[normalize-space()='Administrasjonsprosent']]//div[@class='graph']"));

percentage = element.getAttribute("data-percentage");
percentage = element.getAttribute("data-percentage");

keyNumbersList.add(percentage);
this.keyValues.add(percentage);

wait.until(ExpectedConditions.visibilityOfElementLocated(By.xpath(
"//li[.//h2[normalize-space()='Formålsprosent']]//div[@class='graph']")));
wait.until(
ExpectedConditions.visibilityOfElementLocated(
By.xpath("//li[.//h2[normalize-space()='Formålsprosent']]//div[@class='graph']")));

element = driver.findElement(
By.xpath("//li[.//h2[normalize-space()='Formålsprosent']]//div[@class='graph']")
);
element =
driver.findElement(
By.xpath("//li[.//h2[normalize-space()='Formålsprosent']]//div[@class='graph']"));

percentage = element.getAttribute("data-percentage");
percentage = element.getAttribute("data-percentage");

keyNumbersList.add(percentage);
this.keyValues.add(percentage);
}

public void scrapeCharityPage() {
try {
this.updateDescription();
this.updateLogo();
this.updateCategories();
this.updateKeyNumbers();
} finally {
driver.quit();
this.quitDriver();
}
}

public String getDescription() {
return this.description;
}

public String getLogoURL() {
return this.logoURL;
}

public List<String> getCategories() {
return Collections.unmodifiableList(this.categories);
}

return keyNumbersList;
public List<String> getKeyValues() {
return Collections.unmodifiableList(this.keyValues);
}
}

0 comments on commit 017e820

Please sign in to comment.