Skip to content

Commit

Permalink
Updated URLCharityScraper
Browse files Browse the repository at this point in the history
Edited methods to return strings instead for easier uploading to database.
  • Loading branch information
roaraf committed Apr 7, 2026
1 parent 017e820 commit 358aeb4
Showing 1 changed file with 219 additions and 130 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;

/**
* Class for scraping the description, URL of the logo, string of categories, and key values of the charities
* registered in IK.
*/
public class URLCharityScraper {
private final String url;
private final WebDriver driver;
Expand All @@ -20,171 +24,256 @@ public class URLCharityScraper {
private final List<String> categories;
private final List<String> keyValues;


// Used for production
public URLCharityScraper(String url) {
this.categories = new ArrayList<>();
this.keyValues = new ArrayList<>();
ChromeOptions options = new ChromeOptions();
options.addArguments("--headless=new");
options.addArguments("--window-size=1920,1080");
options.addArguments("--disable-gpu");
options.addArguments("--no-sandbox");
options.addArguments("--disable-dev-shm-usage");
this.url = url;
this.driver = new ChromeDriver(options);
/**
 * Creates a scraper for production use, backed by its own Chrome browser instance.
 *
 * <p>The category and key-value lists start out empty. The browser is started with the
 * {@code --headless=new}, {@code --window-size=1920,1080}, {@code --disable-gpu},
 * {@code --no-sandbox} and {@code --disable-dev-shm-usage} arguments.</p>
 *
 * @param url the URL for the charity's webpage on IK
 */
public URLCharityScraper(String url) {
    this.url = url;
    this.categories = new ArrayList<>();
    this.keyValues = new ArrayList<>();

    // All browser flags are passed in one call; the order matches the original one-per-call sequence.
    ChromeOptions chromeOptions = new ChromeOptions();
    chromeOptions.addArguments(
        "--headless=new",
        "--window-size=1920,1080",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage");

    this.driver = new ChromeDriver(chromeOptions);
}

// Used for testing
/**
* Constructor used for testing.
*
* <p>It accepts both a url (in this case used as a dud) and a {@link WebDriver} as parameters. The WebDriver is
* passed to make testing easier.</p>
*
* @param url the URL for the charity's webpage on IK (for this constructor it should not be a real URL)
* @param driver the {@code WebDriver} object used for scraping
*/
public URLCharityScraper(String url, WebDriver driver) {
this.categories = new ArrayList<>();
this.keyValues = new ArrayList<>();
this.url = url;
this.driver = driver;
this.categories = new ArrayList<>();
this.keyValues = new ArrayList<>();
this.url = url;
this.driver = driver;
}

private void quitDriver() {
if (driver instanceof ChromeDriver) {
driver.quit();
}
}

private void updateDescription() {
WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30));
StringBuilder descriptionString = new StringBuilder();

driver.get(this.url);

wait.until(
ExpectedConditions.numberOfElementsToBeMoreThan(By.cssSelector(".information div"), 0));

List<WebElement> firstDescription = driver.findElements(By.cssSelector(".information div p"));

for (WebElement element : firstDescription) {
if (element.getText().isBlank()) {
continue;
}
descriptionString.append(element.getText()).append("\n\n");
}
// Check for if description is long and contains a "read more" link
List<WebElement> doesReadMoreExist = driver.findElements(By.cssSelector("a.read-more"));

if (!doesReadMoreExist.isEmpty()) {
WebElement descReadMore = driver.findElement(By.cssSelector("a.read-more"));
descReadMore.click();
wait.until(ExpectedConditions.visibilityOfElementLocated(By.cssSelector(".extra-info")));
}
/**
* Creates a {@link WebDriverWait} bound to this scraper's driver with a 30-second timeout.
*
* <p>The scraping methods use the returned object to pause until the elements they need are present or
* visible on the page.</p>
*
* @return a new {@code WebDriverWait} for this scraper's {@code WebDriver}
*/
protected WebDriverWait createWait() {
return new WebDriverWait(driver, Duration.ofSeconds(30));
}

List<WebElement> extraDescription = driver.findElements(By.cssSelector(".extra-info p"));
/**
* Looks up every element matching the given selector by delegating to the {@code findElements} method of this
* scraper's {@code WebDriver}.
*
* @param by a selector for {@link WebElement} objects
* @return the list of {@code WebElement} objects returned by the driver for the given selector
*/
protected List<WebElement> findElements(By by) {
return driver.findElements(by);
}

for (WebElement element : extraDescription) {
if (element.getText().isBlank()) {
continue;
}
descriptionString.append(element.getText()).append("\n\n");
}
/**
* Looks up a single element matching the given selector by delegating to the {@code findElement} method of this
* scraper's {@code WebDriver}.
*
* @param by a selector for a {@code WebElement} object
* @return the single {@code WebElement} returned by the driver for the given selector
*/
protected WebElement findElement(By by) {
return driver.findElement(by);
}

/**
* Quits the driver instance, making it unusable.
*
* <p>The driver is quit unconditionally, so this also applies to a driver that was supplied through the
* two-argument constructor.</p>
*/
protected void closeDriver() {
driver.quit();
}

this.description = descriptionString.toString();
}
/**
* Scrapes the URL for the paragraphs containing the description of the charity.
*/
protected void updateDescription() {
WebDriverWait wait = createWait();
StringBuilder descriptionString = new StringBuilder();

private void updateLogo() {
WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30));
wait.until(ExpectedConditions.numberOfElementsToBeMoreThan(
By.cssSelector(".information div"), 0));

driver.get(this.url);
List<WebElement> firstDescription =
findElements(By.cssSelector(".information div p"));

wait.until(ExpectedConditions.visibilityOfElementLocated(By.cssSelector(".logo > img")));
for (WebElement element : firstDescription) {
if (!element.getText().isBlank()) {
descriptionString.append(element.getText()).append("\n\n");
}
}

WebElement logo = driver.findElement(By.cssSelector(".logo > img"));
List<WebElement> readMoreLinks =
findElements(By.cssSelector("a.read-more"));

this.logoURL = logo.getAttribute("src");
}
if (!readMoreLinks.isEmpty()) {
WebElement readMore = findElement(By.cssSelector("a.read-more"));
readMore.click();

private void updateCategories() {
WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30));
wait.until(ExpectedConditions.visibilityOfElementLocated(
By.cssSelector(".extra-info")));
}

driver.get(this.url);
List<WebElement> extraDescription =
findElements(By.cssSelector(".extra-info p"));

wait.until(ExpectedConditions.visibilityOfElementLocated(By.cssSelector(".tag-label")));
for (WebElement element : extraDescription) {
if (!element.getText().isBlank()) {
descriptionString.append(element.getText()).append("\n\n");
}
}

List<WebElement> categories = driver.findElements(By.cssSelector(".tag-label"));
this.description = descriptionString.toString();
}

for (WebElement element : categories) {
this.categories.add(element.getText());
}
}
/**
* Scrapes the URL for the image URL of the logo for the charity.
*/
void updateLogo() {
WebDriverWait wait = createWait();

private void updateKeyNumbers() {
WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30));
String percentage;
WebElement element;
wait.until(ExpectedConditions.visibilityOfElementLocated(
By.cssSelector(".logo > img")));

driver.get(this.url);
WebElement logo = findElement(By.cssSelector(".logo > img"));
this.logoURL = logo.getAttribute("src");
}

wait.until(
ExpectedConditions.visibilityOfElementLocated(
By.xpath(
"//li[.//h2[normalize-space()='Innsamlingsprosent']]//div[@class='graph']")));
/**
* Scrapes the URL for the category labels containing the categories for the charity.
*/
void updateCategories() {
WebDriverWait wait = createWait();

element =
driver.findElement(
By.xpath("//li[.//h2[normalize-space()='Innsamlingsprosent']]//div[@class='graph']"));
wait.until(ExpectedConditions.visibilityOfElementLocated(
By.cssSelector(".tag-label")));

percentage = element.getAttribute("data-percentage");
List<WebElement> elements =
findElements(By.cssSelector(".tag-label"));

this.keyValues.add(percentage);
for (WebElement element : elements) {
this.categories.add(element.getText());
}
}

wait.until(
ExpectedConditions.visibilityOfElementLocated(
By.xpath(
"//li[.//h2[normalize-space()='Administrasjonsprosent']]//div[@class='graph']")));
/**
* Scrapes the URL for the statistics of the charity; the percentage collected, the percentage that goes to the
* administration, and the percentage that is put towards the cause.
*/
void updateKeyValues() {
WebDriverWait wait = createWait();

element =
driver.findElement(
By.xpath(
"//li[.//h2[normalize-space()='Administrasjonsprosent']]//div[@class='graph']"));
String percentage;
WebElement element;

percentage = element.getAttribute("data-percentage");
wait.until(ExpectedConditions.visibilityOfElementLocated(
By.xpath("//li[.//h2[normalize-space()='Innsamlingsprosent']]//div[@class='graph']")));

this.keyValues.add(percentage);
element = findElement(By.xpath(
"//li[.//h2[normalize-space()='Innsamlingsprosent']]//div[@class='graph']"));
percentage = element.getAttribute("data-percentage");
this.keyValues.add(percentage);

wait.until(
ExpectedConditions.visibilityOfElementLocated(
By.xpath("//li[.//h2[normalize-space()='Formålsprosent']]//div[@class='graph']")));
wait.until(ExpectedConditions.visibilityOfElementLocated(
By.xpath("//li[.//h2[normalize-space()='Administrasjonsprosent']]//div[@class='graph']")));

element =
driver.findElement(
By.xpath("//li[.//h2[normalize-space()='Formålsprosent']]//div[@class='graph']"));
element = findElement(By.xpath(
"//li[.//h2[normalize-space()='Administrasjonsprosent']]//div[@class='graph']"));
percentage = element.getAttribute("data-percentage");
this.keyValues.add(percentage);

percentage = element.getAttribute("data-percentage");
wait.until(ExpectedConditions.visibilityOfElementLocated(
By.xpath("//li[.//h2[normalize-space()='Formålsprosent']]//div[@class='graph']")));

this.keyValues.add(percentage);
}
element = findElement(By.xpath(
"//li[.//h2[normalize-space()='Formålsprosent']]//div[@class='graph']"));
percentage = element.getAttribute("data-percentage");
this.keyValues.add(percentage);
}

public void scrapeCharityPage() {
try {
this.updateDescription();
this.updateLogo();
this.updateCategories();
this.updateKeyNumbers();
} finally {
this.quitDriver();
}
}
/**
* Loads the charity page once and runs all the scraper methods against it, storing the results in this object's
* fields.
*
* <p>The driver is closed in a {@code finally} block, so it is quit whether or not the scraping succeeds.</p>
*/
public void scrapeCharityPage() {
try {
// The page is loaded a single time here; the update methods below do not navigate themselves.
driver.get(this.url);

updateDescription();
updateLogo();
updateCategories();
updateKeyValues();

} finally {
// Always release the browser, even when one of the steps above throws.
closeDriver();
}
}

public String getDescription() {
return this.description;
}
/**
* Returns the description of the charity as assembled by {@code updateDescription}.
*
* <p>Each non-blank paragraph that was scraped is followed by two newline characters.</p>
*
* @return a String containing the description of the charity
*/
public String getDescription() {
return description;
}

public String getLogoURL() {
return this.logoURL;
}
/**
* Returns the URL of the logo for the charity, as read by {@code updateLogo} from the {@code src} attribute of
* the logo image.
*
* @return a String containing the URL for the logo of the charity
*/
public String getLogoURL() {
return logoURL;
}

public List<String> getCategories() {
return Collections.unmodifiableList(this.categories);
}
/**
 * Returns the scraped categories of the charity as a single String, using ',' as the delimiter.
 *
 * <p>The categories appear in the order they were scraped, with no leading or trailing delimiter. If no
 * categories have been collected, the result is the empty String.</p>
 *
 * @return the categories for the charity joined by ','
 */
public String getCategories() {
    return String.join(",", this.categories);
}

public List<String> getKeyValues() {
return Collections.unmodifiableList(this.keyValues);
}
}
/**
 * Returns the scraped key value percentages of the charity as a single String, using ':' as the delimiter.
 *
 * <p>The values appear in the order they were scraped, with no leading or trailing delimiter. If no key
 * values have been collected, the result is the empty String.</p>
 *
 * @return the key values for the charity joined by ':'
 */
public String getKeyValues() {
    return String.join(":", this.keyValues);
}
}

0 comments on commit 358aeb4

Please sign in to comment.