-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix[merge]: resolve conflicts and fix method name mismatches
- Loading branch information
Showing
40 changed files
with
1,432 additions
and
474 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
115 changes: 115 additions & 0 deletions
115
src/main/java/edu/group5/app/model/organization/OrganizationScraper.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,115 @@ | ||
| package edu.group5.app.model.organization; | ||
|
|
||
| import org.jsoup.Jsoup; | ||
| import org.jsoup.nodes.Document; | ||
| import org.jsoup.nodes.Element; | ||
| import org.jsoup.nodes.TextNode; | ||
| import org.jsoup.select.Elements; | ||
|
|
||
| import java.util.stream.Collectors; | ||
|
|
||
| import java.util.HashMap; | ||
| import java.util.Map; | ||
|
|
||
| /** | ||
| * Handles web scraping of organization information from Innsamlingskontrollen. | ||
| * Responsible for fetching logos and descriptions from organization pages. | ||
| * All results are cached to avoid redundant network requests. | ||
| */ | ||
| public class OrganizationScraper { | ||
| private final Map<String, String> logoCache = new HashMap<>(); | ||
| private final Map<String, String> descriptionCache = new HashMap<>(); | ||
|
|
||
| /** | ||
| * Fetches the description for the given URL by scraping all text content | ||
| * inside {@code <section class="information">}. Results are cached. | ||
| * | ||
| * <p>Strategy:</p> | ||
| * <ol> | ||
| * <li>Tries to get all <p> tags (skipping the first one) and concatenates them</li> | ||
| * <li>If no paragraphs found, gets all text content from the section</li> | ||
| * <li>Returns null if section not found or is empty</li> | ||
| * </ol> | ||
| * | ||
| * @param pageUrl the URL for the organization's page; may be null or blank | ||
| * @return the description text, or null if not found or pageUrl is invalid | ||
| */ | ||
| public String fetchDescription(String pageUrl) { | ||
| if (pageUrl == null || pageUrl.isBlank()) { | ||
| return null; | ||
| } | ||
|
|
||
| if (descriptionCache.containsKey(pageUrl)) { | ||
| return descriptionCache.get(pageUrl); | ||
| } | ||
|
|
||
| try { | ||
| Document doc = Jsoup.connect(pageUrl) | ||
| .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") | ||
| .timeout(5000).get(); | ||
|
|
||
| Element section = doc.selectFirst("section.information"); | ||
| if (section != null) { | ||
| section.select("div.extra-info").remove(); | ||
| section.select("a.read-more").remove(); | ||
|
|
||
| // Extract all <p> tags and <div> elements as separate paragraphs | ||
| String description = section.select("p, div").stream() | ||
| .filter(el -> el.tagName().equals("p") || el.select("p").isEmpty()) | ||
| .filter(el -> !el.hasClass("extra-info") && !el.hasClass("logo")) | ||
| .map(Element::text) | ||
| .map(text -> text.replace("Les mer", "").trim()) | ||
| .filter(text -> !text.isBlank()) | ||
| .collect(Collectors.joining("\n\n")); | ||
|
|
||
| // Fallback: if no paragraphs found, get all text from section | ||
| if (description.isBlank()) { | ||
| description = section.text().trim(); | ||
| } | ||
| description = description.replace("Les mer", "").trim(); | ||
|
|
||
| // Only cache and return if we found something meaningful | ||
| if (!description.isBlank()) { | ||
| descriptionCache.put(pageUrl, description); | ||
| return description; | ||
| } | ||
| } | ||
| } catch (Exception e) { | ||
| System.out.println("Could not get description for: " + pageUrl); | ||
| } | ||
| return null; | ||
| } | ||
|
|
||
| /** | ||
| * Fetches the logo URL for the given page by scraping the {@code div.logo img} | ||
| * element. Results are cached so each URL is only fetched once. | ||
| * | ||
| * @param pageUrl the URL for the organization's page; may be null or blank | ||
| * @return the absolute logo URL, or null if not found or pageUrl is invalid | ||
| */ | ||
| public String fetchLogoUrl(String pageUrl) { | ||
| if (pageUrl == null || pageUrl.isBlank()) { | ||
| return null; | ||
| } | ||
|
|
||
| if (logoCache.containsKey(pageUrl)) { | ||
| return logoCache.get(pageUrl); | ||
| } | ||
|
|
||
| try { | ||
| Document doc = Jsoup.connect(pageUrl) | ||
| .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") | ||
| .timeout(5000).get(); | ||
| Element img = doc.selectFirst("div.logo img"); | ||
|
|
||
| if (img != null) { | ||
| String logoUrl = img.absUrl("src"); | ||
| logoCache.put(pageUrl, logoUrl); | ||
| return logoUrl; | ||
| } | ||
| } catch (Exception e) { | ||
| System.out.println("Could not get logo for: " + pageUrl); | ||
| } | ||
| return null; | ||
| } | ||
| } |
Oops, something went wrong.