From 83db88fab912fdc93463271a7327a25adca0c924 Mon Sep 17 00:00:00 2001 From: toravest Date: Mon, 7 Apr 2025 11:29:12 +0200 Subject: [PATCH 1/5] add universal functions, to import to notebook, ex. ensure_rain/snow --- src/my_package/util.py | 74 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/src/my_package/util.py b/src/my_package/util.py index 2eb13c3..0c01171 100644 --- a/src/my_package/util.py +++ b/src/my_package/util.py @@ -1,3 +1,7 @@ +import numpy as np +import pandas as pd + +# Function to replace the nordic 'æøå' def replace_nordic(city_name): for letter in city_name: if letter in 'æøå': @@ -6,7 +10,73 @@ def replace_nordic(city_name): city_name = city_name.replace('å', 'aa') return city_name - +# Function to convert from kelvin to celsius temp def kelvin_to_celsius(temp_in_kelvin): temp_in_celsius = temp_in_kelvin - 273.15 - return temp_in_celsius \ No newline at end of file + return temp_in_celsius + +# Function to check if there is a ['rain.1h'] column in the dataset, if not make one +def ensure_rain_column(df): + try: + _ = df['rain.1h'] + + # If no rain, make the rain column and fill it with NaN + except KeyError: + print("'Rain' is not present in the JSON file.") + df['rain.1h'] = np.nan + + return df + +# Function to check if there is a ['snow.1h'] column in the dataset, if not make one +def ensure_snow_column(df): + try: + _ = df['snow.1h'] + + # If no snow, make the snow column and fill it with NaN + except KeyError: + print("'Snow' is not present in the JSON file.") + df['snow.1h'] = np.nan + + return df + +# Function to fill NaN values in ['rain.1h'] columns with 0 +def fill_rain_column(df): + try: + # Replace the NaN with 0, using pandas function + df['rain.1h'] = df['rain.1h'].fillna(0) + + except KeyError: + print(["'rain.1h', not in df"]) + + return df + +# Function to fill NaN values in ['snow.1h'] columns with 0 +def fill_snow_column(df): + try: + # Replace the NaN with 0, using 
pandas function + df['snow.1h'] = df['snow.1h'].fillna(0) + + except KeyError: + print(["'snow.1h', not in df"]) + + return df + +# Function to 'normalize' the dataset, with index-changing and dropping meta-data +def extract_city_df(weather_data): + if 'list' in weather_data: + # Normalize the json for better readability + df = pd.json_normalize(weather_data['list']) + + # Delete duplicates based on the dt column, all the other values can appear more than once, but the date should only appear once + df = df.drop_duplicates(subset=['dt']) + + # The weather column doesn't have any related information, therefore we delete it + df = df.drop(columns="weather") + + # Convert 'dt' column from Unix timestamp to datetime and set it as the index + df['dt'] = pd.to_datetime(df['dt'], unit='s') + df.set_index('dt', inplace=True) + return df + + else: + return None \ No newline at end of file From 5094b7e790370bc1661db3012cbb0803b0c0fb32 Mon Sep 17 00:00:00 2001 From: toravest Date: Mon, 7 Apr 2025 12:23:41 +0200 Subject: [PATCH 2/5] add compare-one-day-notebook. --- notebooks/notebook_compare_one_day_data.ipynb | 598 ++++++++++++++++++ 1 file changed, 598 insertions(+) create mode 100644 notebooks/notebook_compare_one_day_data.ipynb diff --git a/notebooks/notebook_compare_one_day_data.ipynb b/notebooks/notebook_compare_one_day_data.ipynb new file mode 100644 index 0000000..d63e089 --- /dev/null +++ b/notebooks/notebook_compare_one_day_data.ipynb @@ -0,0 +1,598 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Notebook - Compare one day data\n", + "\n", + "Denne notebooken henter data fra to ønskede steder i Norge, og sammenligner dataen ved hjelp av visualisering i matplotlib av temperatur og nedbør." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Velg dag for vær-sammenligning\n", + "For å kunne hente data og gjøre en analyse trenger programmet å vite hvilken dag du vil hente ut for, også skrives alle timene fra den dagen ut. Programmet kan ikke hente ut data fra nåværende, eller senere datoer, altså må man velge datoer fra tidligere tidspunkt.\n", + "\n", + "Dataen skrives inn slik: (yyyy, mm, dd)\n", + "Her følger et eksempel: \n", + "|Hva|Hvordan|Eksempel|\n", + "|:---|:---:|:---:|\n", + "|år|yyyy|2025|\n", + "|måned|mm|03| \n", + "|dato|dd|01| \n", + "\n", + "Denne dataen skrives da inn på følgende hvis: (2025, 03, 01)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import datetime\n", + "import time\n", + "\n", + "# Makes a function so the start and end date is the same date, with all hours of that date\n", + "def get_unix_timestamps_for_day():\n", + " date_input = input(\"Choose a date (yyyy, mm, dd): \")\n", + " date_components = date_input.split(\",\")\n", + " year = int(date_components[0])\n", + " month = int(date_components[1])\n", + " day = int(date_components[2])\n", + "\n", + " # Goes through all hours of the day, use %Y-%m-%d etc. 
from pythons strftime to convert datetime into a readable string \n", + " timestamps = []\n", + " for hour in range(24):\n", + " dt = datetime.datetime(year, month, day, hour, 0)\n", + " unix_timestamp = int(time.mktime(dt.timetuple()))\n", + " timestamps.append((unix_timestamp, dt.strftime('%Y-%m-%d %H:%M:%S'))) \n", + " \n", + " # Prevents from getting data for the current day, or the future\n", + " if dt >= datetime.datetime.now():\n", + " print(\"Failed, cant use future dates\")\n", + "\n", + " # If \n", + " raise ValueError\n", + "\n", + " # Prints the date chosen\n", + " print(f\"Selected date: {year}-{month:02d}-{day:02d}\")\n", + "\n", + " # Prints the timestamp and the date an hour of the day after\n", + " for ts, readable in timestamps:\n", + " print(f\"Unix Timestamp: {ts} -> {readable}\")\n", + " \n", + " return date_input, [ts[0] for ts in timestamps]\n", + "\n", + "date, timestamps = get_unix_timestamps_for_day()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Velg første sted til sammenligningen\n", + "For å kunne sammenligne data fra to steder, trenger vi først to steder å sammenligne. Stedene er foreløpig begrenset til Norge.\n", + "\n", + "Her kunne vi brukt en løkke, spesielt en while løkke for å sjekke at stedet fungerer, og for å kjøre begge stedene på samme celle. Men vi har valgt å kjøre en og en for lettere å oppfatte feil ved sted, og at dersom sted 1 blir feil, må du ikke velge sted 2 på nytt." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "\n", + "# Gets the absolute path to the src folder\n", + "sys.path.append(os.path.abspath(\"../src\"))\n", + "\n", + "# Now we can import the fucntion from the module\n", + "from my_package.fetch_data import fetch_data\n", + "\n", + "# Import function to replace nordic (æøå)\n", + "from my_package.util import replace_nordic\n", + "\n", + "# User choose a city they want the weather data from\n", + "city_1 = input(\"Enter first city name: \")\n", + "\n", + "city_1 = replace_nordic(city_1)\n", + "\n", + "# Start_date is the first timestamp, end_date is the last\n", + "start_date, end_date = timestamps[0], timestamps[-1]\n", + "\n", + "# Stores the values in the variables\n", + "data_city_1, folder = fetch_data(start_date, end_date, city_1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Velg andre sted til sammenligningen\n", + "Velg det andre stedet som skal brukes i sammenligningen, fortsatt begrenset til Norge." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "\n", + "# Gets the absolute path to the src folder\n", + "sys.path.append(os.path.abspath(\"../src\"))\n", + "\n", + "# Now we can import the fucntion from the module\n", + "from my_package.fetch_data import fetch_data\n", + "\n", + "# Import function to replace nordic (æøå)\n", + "from my_package.util import replace_nordic\n", + "\n", + "# User choose a city they want the weather data from\n", + "city_2 = input(\"Enter first city name: \")\n", + "\n", + "city_2 = replace_nordic(city_2)\n", + "\n", + "# Start_date is the first timestamp, end_date is the last\n", + "start_date, end_date = timestamps[0], timestamps[-1]\n", + "\n", + "# Stores the values in the variables\n", + "data_city_2, folder = fetch_data(start_date, end_date, city_2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Lagre data i en json-fil\n", + "\n", + "Vi samler dataen for begge stedene og skriver de til en json fil.\n", + "\n", + "Skriv inn navn for til filen du vil lagre med dataen.\n", + "\n", + "Eks. 
test\n", + "Da vil filen lagres som data_**test**.json, i mappen \"../data/output_sammenligning_dag/data_{filnavn}.json\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "\n", + "# Create a combined dict, and 'separate' them with 'city_1' and 'city_2'\n", + "city_weather = {\n", + " \"city_1\": data_city_1,\n", + " \"city_2\": data_city_2\n", + "}\n", + "\n", + "# Gets the absolute path to the src folder\n", + "sys.path.append(os.path.abspath(\"../src\"))\n", + "\n", + "from my_package.write_data import write_data\n", + "\n", + "# Overwrites the folder stored inside the function\n", + "folder = \"../data/output_sammenligning_dag\"\n", + "\n", + "filename = input(\"Write filename: \")\n", + "\n", + "# Writes the data, with the chosen name\n", + "write_data(city_weather, folder, filename)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Lese fra fil\n", + "\n", + "Henter opp data lagret i filen, lagd over, og skriver ut lesbart ved hjelp av pandas\n", + "\n", + "Her har vi laget en funksjon som henter ut dataene for ønsket sted, og gjør endringer vi ønsker skal bli gjort for dataen for begge steder som:\n", + "- fjerner 'weather' kolonnen, som inneholder metadata\n", + "- setter tiden som index\n", + "- normaliserer, slik at det er enklere å lese all dataen\n", + " \n", + "Vi sjekker også at vi har data for stedene, altså at funksjonen funker, før den eventuelt viser dataen for stedene." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import json\n", + "\n", + "file_path = f'../data/output_sammenligning_dag/data_{filename}.json'\n", + "\n", + "from my_package.util import extract_city_df\n", + "\n", + "# Load the whole JSON file\n", + "with open(file_path, 'r') as f:\n", + " all_city_data = json.load(f)\n", + "\n", + "# Separate variables for each city\n", + "city_1_df = extract_city_df(all_city_data.get('city_1'))\n", + "city_2_df = extract_city_df(all_city_data.get('city_2'))\n", + "\n", + "# Checks if the data is not empty, aka there are values\n", + "if city_1_df is not None:\n", + " # Prints the city name\n", + " print(f\"{city_1} data:\")\n", + " # Display the dataframe readable\n", + " display(city_1_df)\n", + "\n", + "# Checks if the data is not empty, aka there are values\n", + "if city_2_df is not None:\n", + " # Prints the city name\n", + " print(f\"{city_2} data:\")\n", + " # Display the dataframe readable\n", + " display(city_2_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Viser temperaturen\n", + "Regner ut gjennomsnitts-temperatur ved hjelp av innebygde funksjoner. Finner også høyeste og laveste målte temperatur for begge steder.\n", + "\n", + "Legger dataen inn i en dataframe for å vise de i en ryddigere og mer lettlest tabell." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Stores the temperature values of both cities\n", + "temp_city_1 = city_1_df['main.temp']\n", + "temp_city_2 = city_2_df['main.temp']\n", + "\n", + "# Find the mean temperature in both citites\n", + "temp_mean_city_1 = temp_city_1.mean().round(2)\n", + "temp_mean_city_2 = temp_city_2.mean().round(2)\n", + "\n", + "# Find the highest temperature in both cities\n", + "max_temp_city_1 = city_1_df['main.temp'].max().round(2)\n", + "max_temp_city_2 = city_2_df['main.temp'].max().round(2)\n", + "\n", + "# Find the lowest tempeartues in both cities\n", + "min_temp_city_1 = city_1_df['main.temp'].min().round(2)\n", + "min_temp_city_2 = city_2_df['main.temp'].min().round(2)\n", + "\n", + "# Stores the values of both city in a list\n", + "city_names = [city_1, city_2]\n", + "mean_temp = [temp_mean_city_1, temp_mean_city_2]\n", + "max_temp = [max_temp_city_1, max_temp_city_2]\n", + "min_temp = [min_temp_city_1, min_temp_city_2]\n", + "\n", + "# Creates dataframe of the tempvalues, to display it more readable\n", + "df_temp_cities = pd.DataFrame({\n", + " \"City\": city_names,\n", + " \"Mean Temperature (°C)\": mean_temp,\n", + " \"Highest Temperature (°C)\": max_temp,\n", + " \"Lowest Temperature (°C)\": min_temp\n", + "})\n", + "\n", + "# Display the dataframe with the values of both cities\n", + "display(df_temp_cities)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sjekker nedbørs-kolonnene\n", + "\n", + "En ting vi har oppfattet når vi har blitt kjent med dataen er at regn og snø kolonne mangler om det ikke har regnet eller snødd. Vi ønsker senere å plotte oversikt over nedbør, og det er lettere å plotte 0 nedbør enn manglende verdier. \n", + "\n", + "Derfor sjekker vi her om regn/snø kolonne eksisterer i dataen, om den ikke gjør så lager vi en og fyller den med NaN." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "\n", + "# Gets the absolute path to the src folder\n", + "sys.path.append(os.path.abspath(\"../src\"))\n", + "\n", + "# Now we can import the functions from the module\n", + "from my_package.util import ensure_rain_column\n", + "from my_package.util import ensure_snow_column\n", + "\n", + "# Checks if there is a rain/snow column in city_1_df, if not the function creates one with NaN values\n", + "city_1_df = ensure_rain_column(city_1_df)\n", + "city_1_df = ensure_snow_column(city_1_df)\n", + "# Displays the dataframe readable\n", + "display(city_1_df)\n", + "\n", + "# Checks if there is a rain/snow column in city_2_df, if not the function creates one with NaN values\n", + "city_2_df = ensure_rain_column(city_2_df)\n", + "city_2_df = ensure_snow_column(city_2_df)\n", + "# Displays the dataframe readable\n", + "display(city_2_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sjekk for manglende verdier\n", + "Missingno sjekker og visualiserer manglende verdier, slik at det blir lettere å se hvilke kolonner feilen ligger i. \n", + "\n", + "Hvis det blir \"hull\" i en søyle, tyder det på manglende verdier.\n", + "\n", + "Det er ikke vanlig å legge til tittel, men i dette tilfellet siden vi har to serier vi sjekker data for, har vi brukt matplotlib for å lettere se hvilket sted som har manglende data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import missingno as msno\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Plot missing data matrix for city_1\n", + "msno.matrix(city_1_df)\n", + "\n", + "# Add title using matplotlib\n", + "plt.title(f'Missing Data for {city_1}')\n", + "\n", + "# Plot missing data matrix for city_2\n", + "msno.matrix(city_2_df)\n", + "\n", + "# Add title using matplotlib\n", + "plt.title(f'Missing Data for {city_2}')\n", + "\n", + "# Show the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Endre manglende verdier\n", + "\n", + "Nå skal vi ha regn/snø kolonner, og kanskje inneholder de NaN.\n", + "\n", + "Så sjekker vi om alle verdiene i en kolonne er 'NaN', i så fall så fjerner vi hele kolonnen. Grunnen til at dette ikke inkluderer snø og regn, er fordi vi senere plotter disse verdiene, og da får vi ikke feil om verdien er 0, men vil få om hele kolonnen mangler.\n", + "\n", + "Vi sjekker også temperatur-verdiene for begge stedene. Dersom det skulle mangle en verdi bruker vi ffill (forward-fill), altså at den bruker verdien som var før.\n", + "\n", + "Det kan være mangler i andre verdier, men vi endrer ikke disse nå, da vi kun vil bruke data om nedbør og temperatur." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "\n", + "# Gets the absolute path to the src folder\n", + "sys.path.append(os.path.abspath(\"../src\"))\n", + "\n", + "# Now we can import the fucntion from the module\n", + "from my_package.util import fill_rain_column\n", + "from my_package.util import fill_snow_column\n", + "\n", + "# Fill NaN values with 0, if there are any, for both rain/snow column\n", + "city_1_df = fill_rain_column(city_1_df)\n", + "city_1_df = fill_snow_column(city_1_df)\n", + "\n", + "# Fill NaN values with 0, if there are any, for both rain/snow column\n", + "city_2_df = fill_rain_column(city_2_df)\n", + "city_2_df = fill_snow_column(city_2_df)\n", + "\n", + "# Drops all the columns, if it has 'NaN' value.\n", + "city_1_df = city_1_df.dropna(axis='columns', how='all')\n", + "city_2_df = city_2_df.dropna(axis='columns', how='all')\n", + "\n", + "# If temperature is missing, take the same as the one before\n", + "city_1_df['main.temp'] = city_1_df['main.temp'].fillna('obj.ffill()')\n", + "\n", + "# Forward fill missing values in the lowest temperature \n", + "city_1_df['main.temp_min'] = city_1_df['main.temp_min'].fillna('obj.ffill()')\n", + "\n", + "# Forward fill missing values in the highest temperature \n", + "city_1_df['main.temp_max'] = city_1_df['main.temp_max'].fillna('obj.ffill()')\n", + "\n", + "\n", + "# If temperature is missing, take the same as the one before\n", + "city_2_df['main.temp'] = city_2_df['main.temp'].fillna('obj.ffill()')\n", + "\n", + "# Forward fill missing values in the lowest temperature \n", + "city_2_df['main.temp_min'] = city_2_df['main.temp_min'].fillna('obj.ffill()')\n", + "\n", + "# Forward fill missing values in the highest temperature \n", + "city_2_df['main.temp_max'] = city_2_df['main.temp_max'].fillna('obj.ffill()')\n", + "\n", + "# Display both cities readable\n", + "display(city_1_df)\n", + 
"display(city_2_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Visualisere endring av data\n", + "Har lagt inn en ny missigno visualisering, for å se at de manglende dataene \"forsvinner\" når vi kjører cellen over. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import missingno as msno\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Plot missing data matrix fro city_1\n", + "msno.matrix(city_1_df)\n", + "\n", + "# Add title using matplotlib\n", + "plt.title(f'Missing Data for {city_1}')\n", + "\n", + "# Plot missing data matrix fro city_2\n", + "msno.matrix(city_2_df)\n", + "\n", + "# Add title using matplotlib\n", + "plt.title(f'Missing Data for {city_2}')\n", + "\n", + "# Show the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Visualisere data i en graf\n", + "Ved hjelp av Matplotlib har vi visualiert ønsket data, og ved hjelp av subplot, en modul i matplotlib, kan vi plotte flere verdier i samme graf, og få \"to y-akse\" på samme x-akse. \n", + "\n", + "Temperatur for begge stedene finner vi i den øverste grafen, hvor vi også har lagt inn gjennomsnittstemperaturen.\n", + "\n", + "I grafen under ser vi oversikt over nedbør for begge stedene sammenlignet 'side om side'. 
Vi skiller også mellom snø og regn, og dersom det skulle snø og regne i samme time, vil de bare 'stables' oppå hverandre.\n", + "\n", + "Grafen lagres i " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import matplotlib.dates as mdates\n", + "import os\n", + "\n", + "# Where the figure should be saved when exported\n", + "output_folder = \"../data/output_fig_sammenligning\"\n", + "\n", + "# Creates the folder if it does not exist\n", + "os.makedirs(output_folder, exist_ok=True)\n", + "\n", + "# Extract rain values for both cities\n", + "city_1_rain = city_1_df['rain.1h']\n", + "city_2_rain = city_2_df['rain.1h']\n", + "\n", + "# Extract snow values for both cities\n", + "city_1_snow = city_1_df['snow.1h']\n", + "city_2_snow = city_2_df['snow.1h']\n", + "\n", + "# x_axis set to the index, which mean the datetime\n", + "x_axis = city_1_df.index\n", + "\n", + "# Makes the datetime^ to numeric using mdates-functions\n", + "x_axis_numeric = mdates.date2num(x_axis)\n", + "\n", + "# Two vertically stacked axis, (2 rows, 1 column), width and height of the figure, and the axis share the same x_axis\n", + "fig, (ax1, ax3) = plt.subplots(2, 1,figsize=(15, 8), sharex=True)\n", + "\n", + "# Set the title for the whole plot, above the upper plot\n", + "ax1.set_title(f'Weather compare for {city_1} and {city_2} ({date}) ')\n", + "\n", + "# Plots the temperature for city_1, with mean temperature\n", + "ax1.plot(x_axis, temp_city_1, color='#2E8B57', label=f'Temperature {city_1}')\n", + "ax1.axhline(temp_mean_city_1, color='#2E8B57', linestyle='dashed', alpha=0.7, label=f'Mean Temperature {city_1}')\n", + "\n", + "# Plots the temperature for city_2s, with mean temperature\n", + "ax1.plot(x_axis, temp_city_2, color='#FFD700', label=f'Temperature {city_2}')\n", + "ax1.axhline(temp_mean_city_2, color='#FFD700', linestyle='dashed', alpha=0.7, label=f'Mean Temperature {city_2}')\n", 
+ "\n", + "# Design the y-axis for Temperature\n", + "ax1.set_ylabel(\"Temperature (°C)\")\n", + "\n", + "# Add grid only vertically\n", + "ax1.grid(axis='x')\n", + "\n", + "# Adjust the width of bars for better side-by-side comparison, for the precipitation\n", + "width = 0.01 \n", + "\n", + "# Plot rain/snow bars for both cities, sidy-by-side for better comparrison\n", + "# Place the snow and rain bars for city_1, with offset for better comparrisson with city_2\n", + "ax3.bar(x_axis_numeric - width/2, city_1_rain, width=width, alpha=0.7, color='#2E8B57', label=f'Rain {city_1}', edgecolor='#FFD700', bottom=city_1_snow)\n", + "ax3.bar(x_axis_numeric - width/2, city_1_snow, width=width, alpha=0.5, color='#2E8B57', label=f'Snow {city_1}', hatch='////', edgecolor='#FFD700')\n", + "\n", + "# Place the snow and rain bars for city_2, with offset for better comparrisson with city_1\n", + "ax3.bar(x_axis_numeric + width/2, city_2_rain, width=width, alpha=0.7, color='#FFD700', label=f'Rain {city_2}', edgecolor=\"#2E8B57\", bottom=city_2_snow)\n", + "ax3.bar(x_axis_numeric + width/2, city_2_snow, width=width, alpha=0.5, color='#FFD700', label=f'Snow {city_2}', hatch='////', edgecolor=\"#2E8B57\")\n", + "\n", + "\n", + "# Design the y-axis for precipiation\n", + "ax3.set_ylabel(\"Precipitation (mm)\")\n", + "\n", + "# Add grid only vertically\n", + "ax3.grid(axis='x')\n", + "\n", + "# Format the x-axis to show all hours, in the format \"HH:MM\"\n", + "ax1.xaxis.set_major_locator(mdates.HourLocator()) \n", + "ax1.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))\n", + "\n", + "# Add label description\n", + "ax1.legend(loc='upper left')\n", + "ax3.legend(loc='upper left')\n", + "\n", + "# Add label for x-axis\n", + "ax3.set_xlabel('Datetime')\n", + "\n", + "# Save the plot to the data/output_fig folder\n", + "plot_path = os.path.join(output_folder, f\"weather_compare_plot_{city_1}_{city_2}.png\")\n", + "# Save the plot as a PNG file\n", + "plt.savefig(plot_path)\n", + 
"\n", + "# Show the plot\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 9df86aee72cda39c571107cc5490e2f21ebba386 Mon Sep 17 00:00:00 2001 From: toravest Date: Mon, 7 Apr 2025 12:34:01 +0200 Subject: [PATCH 3/5] changes function, standardvalue for hour and minute, if no input --- src/my_package/date_to_unix.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/src/my_package/date_to_unix.py b/src/my_package/date_to_unix.py index 92ea3ce..89da30b 100644 --- a/src/my_package/date_to_unix.py +++ b/src/my_package/date_to_unix.py @@ -11,16 +11,32 @@ def get_unix_timestamp(): start_year = int(start_date_components[0]) start_month = int(start_date_components[1]) start_date = int(start_date_components[2]) - start_hour = int(start_date_components[3]) - start_minute = int(start_date_components[4]) + + # Checks if hour and minute is inputed, if not value = 0 + if len(start_date_components) > 3: + start_hour = int(start_date_components[3]) + else: + start_hour = 00 + if len(start_date_components) > 4: + start_minute = int(start_date_components[4]) + else: + start_minute = 00 # Variables that splits and assign the end date values end_date_components = end_date_input.split(",") end_year = int(end_date_components[0]) end_month = int(end_date_components[1]) end_date = int(end_date_components[2]) - end_hour = int(end_date_components[3]) - end_minute = int(end_date_components[4]) + + # Checks if hour and minute is inputed, if not value = 0 + if len(end_date_components) > 3: + end_hour = int(end_date_components[3]) + else: + end_hour = 00 + if 
len(end_date_components) > 4: + end_minute = int(end_date_components[4]) + else: + end_minute = 00 # Converts dates to timestamp used to convert to unix, and print, with the user input start_date_timestamp = datetime.datetime(start_year, start_month, start_date, start_hour, start_minute) From 0e3d1ab99bb7b834b9ba666e25096f5d15431d92 Mon Sep 17 00:00:00 2001 From: toravest Date: Mon, 7 Apr 2025 15:34:17 +0200 Subject: [PATCH 4/5] add unittest, both positiv and negative 'rain.1h' column --- tests/unit/test_rain.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 tests/unit/test_rain.py diff --git a/tests/unit/test_rain.py b/tests/unit/test_rain.py new file mode 100644 index 0000000..bf067b4 --- /dev/null +++ b/tests/unit/test_rain.py @@ -0,0 +1,37 @@ +import unittest +import sys +import os +import pandas as pd +import numpy as np + +# Add the src folder to the Python path +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../src'))) + +from my_package.util import ensure_rain_column +from my_package.util import fill_rain_column + + +class TestRain(unittest.TestCase): + def test_ensure_rain(self): + # Creates a df with the column 'something' and the value NaN + self.df = pd.DataFrame([[np.nan]], columns=['something']) + + # Runs the function that should create a column 'rain.1h' if its not already existing + self.df = ensure_rain_column(self.df) + + # Checks if the column 'rain.1h' is in the columns of the dataframe + self.assertIn('rain.1h', self.df.columns) + + def test_fill_rain(self): + # Creates a dataframe with the column 'rain.1h' and value NaN + self.df = pd.DataFrame([[np.nan]], columns=['rain.1h']) + + # Runs the function, that should replace NaN with 0 + self.df = fill_rain_column(self.df) + + # Checks if the first index in the 'rain.1h' column is not equal to NaN, aka the value has changed + self.assertNotEqual(self.df['rain.1h'][0], np.nan) + +if __name__ == "__main__": + 
unittest.main() + From 180f5d697162869cc4321019c43481a0217f7a95 Mon Sep 17 00:00:00 2001 From: toravest Date: Tue, 8 Apr 2025 12:50:15 +0200 Subject: [PATCH 5/5] minor changes --- notebooks/notebook_compare_one_day_data.ipynb | 5 +- .../notebook_compare_one_week_data.ipynb | 610 ++++++++++++++++++ .../notebook_compare_statistic_data.ipynb | 436 +++++++++++++ 3 files changed, 1050 insertions(+), 1 deletion(-) create mode 100644 notebooks/notebook_compare_one_week_data.ipynb create mode 100644 notebooks/notebook_compare_statistic_data.ipynb diff --git a/notebooks/notebook_compare_one_day_data.ipynb b/notebooks/notebook_compare_one_day_data.ipynb index d63e089..e0e5fd4 100644 --- a/notebooks/notebook_compare_one_day_data.ipynb +++ b/notebooks/notebook_compare_one_day_data.ipynb @@ -481,7 +481,7 @@ "\n", "I grafen under ser vi oversikt over nedbør for begge stedene sammenlignet 'side om side'. Vi skiller også mellom snø og regn, og dersom det skulle snø og regne i samme time, vil de bare 'stables' oppå hverandre.\n", "\n", - "Grafen lagres i " + "Grafen lagres i mappen \"../data/output_fig_sammenligning\"" ] }, { @@ -534,6 +534,9 @@ "# Add grid only vertically\n", "ax1.grid(axis='x')\n", "\n", + "# Add marker at 0 temperature\n", + "ax1.axhline(y=0, color='black', linewidth=1.5)\n", + "\n", "# Adjust the width of bars for better side-by-side comparison, for the precipitation\n", "width = 0.01 \n", "\n", diff --git a/notebooks/notebook_compare_one_week_data.ipynb b/notebooks/notebook_compare_one_week_data.ipynb new file mode 100644 index 0000000..186f61a --- /dev/null +++ b/notebooks/notebook_compare_one_week_data.ipynb @@ -0,0 +1,610 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "06c938cd", + "metadata": {}, + "source": [ + "# info" + ] + }, + { + "cell_type": "markdown", + "id": "f6f95dba", + "metadata": {}, + "source": [ + "### Velg start dato og sluttdato\n", + "\n", + "For å kunne hente data og gjøre en analyse trenger programmet å vite hvilken periode 
du vil hente ut for.\n", + "\n", + "Dataen skrives inn slik: (yyyy, mm, dd, hh, mm)\n", + "Her følger et eksempel: \n", + "|Hva|Hvordan|Eksempel|\n", + "|:---|:---:|:---:|\n", + "|år|yyyy|2025|\n", + "|måned|mm|03| \n", + "|dato|dd|01| \n", + "|time|hh|12| \n", + "|minutt|mm|00| \n", + "\n", + "Denne dataen skrives da inn på følgende hvis: (2025, 03, 01, 12, 00)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8474286b", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "\n", + "# Gets the absolute path to the src folder\n", + "sys.path.append(os.path.abspath(\"../src\"))\n", + "\n", + "# Now we can import the fucntion from the module\n", + "from my_package.date_to_unix import get_unix_timestamp\n", + "from my_package.date_to_unix import from_unix_timestamp\n", + "\n", + "# Runs the function and store the data\n", + "unix_start_date, unix_end_date = get_unix_timestamp()\n", + "\n", + "# Prints the unix_timestamp\n", + "print(\"Start date => unix timestamp:\", unix_start_date)\n", + "print(\"End date => unix timestamp:\", unix_end_date)\n", + "\n", + "# Run the function to convert from unix_timestamp to date, and store the variables\n", + "start_date, end_date = from_unix_timestamp(unix_start_date, unix_end_date)\n", + "\n", + "# Prints the date\n", + "print(\"Unix timestamp => start date:\", start_date)\n", + "print(\"Unix timestamp => end date:\", end_date)\n" + ] + }, + { + "cell_type": "markdown", + "id": "0b9d1a95", + "metadata": {}, + "source": [ + "### Velg første sted til sammenligning\n", + "\n", + "Skriv inn et av stedene du ønsker å sammenligne, foreløpig begrenset til Norge." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ebb8c16", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "\n", + "# Gets the absolute path to the src folder\n", + "sys.path.append(os.path.abspath(\"../src\"))\n", + "\n", + "# Now we can import the fucntion from the module\n", + "from my_package.fetch_data import fetch_data\n", + "\n", + "# Import function to replace nordic (æøå)\n", + "from my_package.util import replace_nordic\n", + "\n", + "# User input the city, for the weather\n", + "city_1 = input(\"Enter a city in Norway: \")\n", + "\n", + "city_1 = replace_nordic(city_1)\n", + "\n", + "# Stores the values in the variables\n", + "data_city_1, folder = fetch_data(unix_start_date, unix_end_date, city_1)" + ] + }, + { + "cell_type": "markdown", + "id": "2d0e2068", + "metadata": {}, + "source": [ + "### Velg andre sted til sammenligningen\n", + "Velg det andre stedet som skal brukes i sammenligningen, fortsatt begrenset til Norge." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61eec678", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "\n", + "# Gets the absolute path to the src folder\n", + "sys.path.append(os.path.abspath(\"../src\"))\n", + "\n", + "# Now we can import the fucntion from the module\n", + "from my_package.fetch_data import fetch_data\n", + "\n", + "# Import function to replace nordic (æøå)\n", + "from my_package.util import replace_nordic\n", + "\n", + "# User input the city, for the weather\n", + "city_2 = input(\"Enter a city in Norway: \")\n", + "\n", + "city_2 = replace_nordic(city_2)\n", + "\n", + "# Stores the values in the variables\n", + "data_city_2, folder = fetch_data(unix_start_date, unix_end_date, city_2)" + ] + }, + { + "cell_type": "markdown", + "id": "b04ecf78", + "metadata": {}, + "source": [ + "### Lagre data i en json-fil\n", + "\n", + "Vi samler dataen for begge stedene og skriver de til en json fil.\n", + "\n", + "Skriv inn navn for til filen du vil lagre med dataen.\n", + "\n", + "Eks. 
test\n", + "Da vil filen lagres som data_**test**.json, i mappen \"../data/output_sammenligning_uke/data_{filnavn}.json\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bcf37e96", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "\n", + "# Create a combined dict, and 'separete' them with 'city_1' and 'city_2'\n", + "city_weather = {\n", + " \"city_1\": data_city_1,\n", + " \"city_2\": data_city_2\n", + "}\n", + "\n", + "# Gets the absolute path to the src folder\n", + "sys.path.append(os.path.abspath(\"../src\"))\n", + "\n", + "from my_package.write_data import write_data\n", + "\n", + "# Overwrites the folder stored inside the function\n", + "folder = \"../data/output_sammenligning_uke\"\n", + "\n", + "filename = input(\"Write filename: \")\n", + "\n", + "# Writes the data, with the chosen name\n", + "write_data(city_weather, folder, filename)" + ] + }, + { + "cell_type": "markdown", + "id": "304312bc", + "metadata": {}, + "source": [ + "### Lese fra fil\n", + "\n", + "Henter opp data lagret i filen, lagd over, og skriver ut lesbart ved hjelp av pandas.\n", + "\n", + "Her har vi laget en funksjon som henter ut dataene for ønsket sted, og gjør endringer vi ønsker skal bli gjort for dataen for begge steder som:\n", + "- fjerner 'weather' kolonnen, som inneholder metadata\n", + "- setter tiden som index\n", + "- normaliserer, slik at det er enklere å lese all dataen\n", + " \n", + "Vi sjekker også at vi har data for stedene, altså at funksjonen funker, før den eventuelt viser dataen for stedene."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79e9dda6", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import json\n", + "\n", + "file_path = f'../data/output_sammenligning_uke/data_{filename}.json'\n", + "\n", + "from my_package.util import extract_city_df\n", + "\n", + "# Load the whole JSON file\n", + "with open(file_path, 'r') as f:\n", + " all_city_data = json.load(f)\n", + "\n", + "# Separate variables for each city\n", + "city_1_df = extract_city_df(all_city_data.get('city_1'))\n", + "city_2_df = extract_city_df(all_city_data.get('city_2'))\n", + "\n", + "# Checks if the data is not empty, aka there are values\n", + "if city_1_df is not None:\n", + " # Prints the city name\n", + " print(f\"{city_1} data:\")\n", + " # Display the dataframe readable\n", + " display(city_1_df)\n", + "\n", + "# Checks if the data is not empty, aka there are values\n", + "if city_2_df is not None:\n", + " # Prints the city name\n", + " print(f\"{city_2} data:\")\n", + " # Display the dataframe readable\n", + " display(city_2_df)" + ] + }, + { + "cell_type": "markdown", + "id": "57169489", + "metadata": {}, + "source": [ + "### Viser temperaturen\n", + "Regner ut gjennomsnitts-temperatur ved hjelp av innebygde funksjoner. Finner også høyeste og laveste målte temperatur for begge steder.\n", + "\n", + "Legger dataen inn i en dataframe for å vise de i en ryddigere og mer lettlest tabell." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "712b2a1e", + "metadata": {}, + "outputs": [], + "source": [ + "# Stores the temperature values of both cities\n", + "temp_city_1 = city_1_df['main.temp']\n", + "temp_city_2 = city_2_df['main.temp']\n", + "\n", + "# Find the mean temperature in both citites\n", + "temp_mean_city_1 = temp_city_1.mean().round(2)\n", + "temp_mean_city_2 = temp_city_2.mean().round(2)\n", + "\n", + "# Find the highest temperature in both cities\n", + "max_temp_city_1 = city_1_df['main.temp'].max().round(2)\n", + "max_temp_city_2 = city_2_df['main.temp'].max().round(2)\n", + "\n", + "# Find the lowest tempeartues in both cities\n", + "min_temp_city_1 = city_1_df['main.temp'].min().round(2)\n", + "min_temp_city_2 = city_2_df['main.temp'].min().round(2)\n", + "\n", + "# Stores the values of both city in a list\n", + "city_names = [city_1, city_2]\n", + "mean_temp = [temp_mean_city_1, temp_mean_city_2]\n", + "max_temp = [max_temp_city_1, max_temp_city_2]\n", + "min_temp = [min_temp_city_1, min_temp_city_2]\n", + "\n", + "# Creates dataframe of the tempvalues, to display it more readable\n", + "df_temp_cities = pd.DataFrame({\n", + " \"City\": city_names,\n", + " \"Mean Temperature (°C)\": mean_temp,\n", + " \"Highest Temperature (°C)\": max_temp,\n", + " \"Lowest Temperature (°C)\": min_temp\n", + "})\n", + "\n", + "# Display the dataframe with the values of both cities\n", + "display(df_temp_cities)\n" + ] + }, + { + "cell_type": "markdown", + "id": "d0248c3b", + "metadata": {}, + "source": [ + "### Sjekker nedbørs-kolonnene\n", + "\n", + "En ting vi har oppfattet når vi har blitt kjent med dataen er at regn og snø kolonne mangler om det ikke har regnet eller snødd. Vi ønsker senere å plotte oversikt over nedbør, og det er lettere å plotte 0 nedbør enn manglende verdier. \n", + "\n", + "Derfor sjekker vi her om regn/snø kolonne eksisterer i dataen, om den ikke gjør så lager vi en og fyller den med NaN." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff553681", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "\n", + "# Gets the absolute path to the src folder\n", + "sys.path.append(os.path.abspath(\"../src\"))\n", + "\n", + "# Now we can import the fucntion from the module\n", + "from my_package.util import ensure_rain_column\n", + "from my_package.util import ensure_snow_column\n", + "\n", + "# Chekcs if there are a rain/snow column in city_1_df, if not the function creates one with NaN values\n", + "city_1_df = ensure_rain_column(city_1_df)\n", + "city_1_df = ensure_snow_column(city_1_df)\n", + "# Displays the dataframe readable\n", + "display(city_1_df)\n", + "\n", + "# Chekcs if there are a rain/snow column in city_2_df, if not the function creates one with NaN values\n", + "city_2_df = ensure_rain_column(city_2_df)\n", + "city_2_df = ensure_snow_column(city_2_df)\n", + "# Displays the dataframe readable\n", + "display(city_2_df)" + ] + }, + { + "cell_type": "markdown", + "id": "e271e6c1", + "metadata": {}, + "source": [ + "### Sjekk for manglende verdier\n", + "Missingno sjekker og visualiserer manglende verdier, slik at det blir lettere å se hvilke kolonner feilen ligger i. \n", + "\n", + "Hvis det blir \"hull\" i en søyle, tyder det på manglende verdier.\n", + "\n", + "Det er ikke vanlig å legge til tittel, men i dette tilfellet siden vi har to serier vi sjekker data for, har vi brukt matplotlib for å lettere se hvilket sted som har manglende data."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9285c447", + "metadata": {}, + "outputs": [], + "source": [ + "import missingno as msno\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Plot missing data matrix fro city_1\n", + "msno.matrix(city_1_df)\n", + "\n", + "# Add title using matplotlib\n", + "plt.title(f'Missing Data for {city_1}')\n", + "\n", + "# Plot missing data matrix fro city_2\n", + "msno.matrix(city_2_df)\n", + "\n", + "# Add title using matplotlib\n", + "plt.title(f'Missing Data for {city_2}')\n", + "\n", + "# Show the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "3d8faa50", + "metadata": {}, + "source": [ + "### Endre manglende verdier\n", + "\n", + "Nå skal vi ha regn/snø kolonner, og kanskje inneholder de NaN. Disse NaN-verdiene erstatter vi med 0.\n", + "\n", + "Så sjekker vi om alle verdiene i en kolonne er 'NaN', i så fall fjerner vi hele kolonnen. Grunnen til at dette ikke inkluderer snø og regn, er fordi vi senere plotter disse verdiene, og da får vi ikke feil om verdien er 0, men vil få om hele kolonnen mangler.\n", + "\n", + "Vi sjekker også temperatur-verdiene for begge stedene. Dersom det skulle mangle en verdi bruker vi ffill (forward-fill), altså at den bruker verdien som var før.\n", + "\n", + "Det kan være mangler i andre verdier, men vi endrer ikke disse nå, da vi kun vil bruke data om nedbør og temperatur."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a101ae22", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "\n", + "# Gets the absolute path to the src folder\n", + "sys.path.append(os.path.abspath(\"../src\"))\n", + "\n", + "# Now we can import the functions from the module\n", + "from my_package.util import fill_rain_column\n", + "from my_package.util import fill_snow_column\n", + "\n", + "# Fill NaN values with 0, if there are any, for both rain/snow column\n", + "city_1_df = fill_rain_column(city_1_df)\n", + "city_1_df = fill_snow_column(city_1_df)\n", + "\n", + "# Fill NaN values with 0, if there are any, for both rain/snow column\n", + "city_2_df = fill_rain_column(city_2_df)\n", + "city_2_df = fill_snow_column(city_2_df)\n", + "\n", + "# Drop every column in which all values are NaN\n", + "city_1_df = city_1_df.dropna(axis='columns', how='all')\n", + "city_2_df = city_2_df.dropna(axis='columns', how='all')\n", + "\n", + "# If temperature is missing, forward fill it with the previous value\n", + "city_1_df['main.temp'] = city_1_df['main.temp'].ffill()\n", + "\n", + "# Forward fill missing values in the lowest temperature \n", + "city_1_df['main.temp_min'] = city_1_df['main.temp_min'].ffill()\n", + "\n", + "# Forward fill missing values in the highest temperature \n", + "city_1_df['main.temp_max'] = city_1_df['main.temp_max'].ffill()\n", + "\n", + "\n", + "# If temperature is missing, forward fill it with the previous value\n", + "city_2_df['main.temp'] = city_2_df['main.temp'].ffill()\n", + "\n", + "# Forward fill missing values in the lowest temperature \n", + "city_2_df['main.temp_min'] = city_2_df['main.temp_min'].ffill()\n", + "\n", + "# Forward fill missing values in the highest temperature \n", + "city_2_df['main.temp_max'] = city_2_df['main.temp_max'].ffill()\n", + "\n", + "# Display both cities readable\n", +
"display(city_1_df)\n", + "display(city_2_df)" + ] + }, + { + "cell_type": "markdown", + "id": "aca067b3", + "metadata": {}, + "source": [ + "### Visualisere endring av data\n", + "Har lagt inn en ny missigno visualisering, for å se at de manglende dataene \"forsvinner\" når vi kjører cellen over. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4cdf7b0", + "metadata": {}, + "outputs": [], + "source": [ + "import missingno as msno\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Plot missing data matrix fro city_1\n", + "msno.matrix(city_1_df)\n", + "\n", + "# Add title using matplotlib\n", + "plt.title(f'Missing Data for {city_1}')\n", + "\n", + "# Plot missing data matrix fro city_2\n", + "msno.matrix(city_2_df)\n", + "\n", + "# Add title using matplotlib\n", + "plt.title(f'Missing Data for {city_2}')\n", + "\n", + "# Show the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "61713f43", + "metadata": {}, + "source": [ + "### Visualisere data i en graf\n", + "Ved hjelp av Matplotlib har vi visualiert ønsket data, og ved hjelp av subplot, en modul i matplotlib, kan vi plotte flere verdier i samme graf, og få \"to y-akse\" på samme x-akse. \n", + "\n", + "Temperatur for begge stedene finner vi i den øverste grafen, hvor vi også har lagt inn gjennomsnittstemperaturen.\n", + "\n", + "I grafen under ser vi oversikt over nedbør for begge stedene sammenlignet 'side om side'. 
Vi skiller også mellom snø og regn, og dersom det skulle snø og regne i samme time, vil de bare 'stables' oppå hverandre.\n", + "\n", + "Grafen lagres i mappen \"../data/output_fig_sammenligning\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "668513b2", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import matplotlib.dates as mdates\n", + "import os\n", + "\n", + "# Where the figure should be saved when exported\n", + "output_folder = \"../data/output_fig_sammenligning\"\n", + "\n", + "# Creates the folder if it does not exist\n", + "os.makedirs(output_folder, exist_ok=True)\n", + "\n", + "# Extract rain values for both cities\n", + "city_1_rain = city_1_df['rain.1h']\n", + "city_2_rain = city_2_df['rain.1h']\n", + "\n", + "# Extract snow values for both cities\n", + "city_1_snow = city_1_df['snow.1h']\n", + "city_2_snow = city_2_df['snow.1h']\n", + "\n", + "# x_axis set to the index, which mean the datetime\n", + "x_axis = city_1_df.index\n", + "\n", + "# Makes the datetime^ to numeric using mdates-functions\n", + "x_axis_numeric = mdates.date2num(x_axis)\n", + "\n", + "# Two vertically stacked axis, (2 rows, 1 column), width and height of the figure, and the axis share the same x_axis\n", + "fig, (ax1, ax3) = plt.subplots(2, 1,figsize=(15, 8), sharex=True)\n", + "\n", + "# Set the title for the whole plot, above the upper plot\n", + "ax1.set_title(f'Weather compare for {city_1} and {city_2} ({start_date} - {end_date}) ')\n", + "\n", + "# Plots the temperature for city_1, with mean temperature\n", + "ax1.plot(x_axis, temp_city_1, color='#2E8B57', label=f'Temperature {city_1}')\n", + "ax1.axhline(temp_mean_city_1, color='#2E8B57', linestyle='dashed', alpha=0.7, label=f'Mean Temperature {city_1}')\n", + "\n", + "# Plots the temperature for city_2s, with mean temperature\n", + "ax1.plot(x_axis, temp_city_2, color='#FFD700', label=f'Temperature {city_2}')\n", + "ax1.axhline(temp_mean_city_2, 
color='#FFD700', linestyle='dashed', alpha=0.7, label=f'Mean Temperature {city_2}')\n", + "\n", + "# Design the y-axis for Temperature\n", + "ax1.set_ylabel(\"Temperature (°C)\")\n", + "\n", + "# Add grid only vertically\n", + "ax1.grid(axis='x')\n", + "\n", + "# Add marker at 0 temperature\n", + "ax1.axhline(y=0, color='black', linewidth=1.5)\n", + "\n", + "width = 0.02\n", + "\n", + "# Plot rain/snow bars for both cities, sidy-by-side for better comparrison\n", + "# # Place the snow and rain bars for city_1, with offset for better comparrisson with city_2\n", + "\n", + "# ax3.bar(x_axis_numeric - width/2, city_1_rain, width=width, alpha=0.7, color='#2E8B57', label=f'Rain {city_1}', edgecolor='#FFD700', bottom=city_1_snow)\n", + "# ax3.bar(x_axis_numeric - width/2, city_1_snow, width=width, alpha=0.5, color='#2E8B57', label=f'Snow {city_1}', hatch='////', edgecolor='#FFD700')\n", + "\n", + "# # # Place the snow and rain bars for city_2, with offset for better comparrisson with city_1\n", + "# ax3.bar(x_axis_numeric + width/2, city_2_rain, width=width, alpha=0.7, color='#FFD700', label=f'Rain {city_2}', edgecolor=\"#2E8B57\", bottom=city_2_snow)\n", + "# ax3.bar(x_axis_numeric + width/2, city_2_snow, width=width, alpha=0.5, color='#FFD700', label=f'Snow {city_2}', hatch='////', edgecolor=\"#2E8B57\")\n", + "\n", + "# Concat the snow and rain\n", + "ax3.fill_between(x_axis, city_1_rain + city_1_snow, color='green', alpha=0.3, label=f'{city_1} Total')\n", + "ax3.fill_between(x_axis, city_2_rain + city_2_snow, color='gold', alpha=0.3, label=f'{city_2} Total')\n", + "\n", + "\n", + "# Design the y-axis for precipiation\n", + "ax3.set_ylabel(\"Precipitation (mm)\")\n", + "\n", + "# Add grid only vertically\n", + "ax3.grid(axis='x')\n", + "\n", + "# Customize the x-axis to show ticks for each hour\n", + "ax1.xaxis.set_major_locator(mdates.HourLocator(interval=12)) # Tick marks for every hour\n", + "ax1.xaxis.set_major_formatter(mdates.DateFormatter('%d %b %H')) # Format 
as \"Day Month Hour:Minute\"\n", + "\n", + "# Add label description\n", + "ax1.legend(loc='upper left')\n", + "ax3.legend(loc='upper left')\n", + "\n", + "# Add label for x-axis\n", + "ax3.set_xlabel('Datetime')\n", + "\n", + "# Save the plot to the data/output_fig folder\n", + "plot_path = os.path.join(output_folder, f\"weather_compare_plot_{city_1}_{city_2}.png\")\n", + "# Save the plot as a PNG file\n", + "plt.savefig(plot_path)\n", + "\n", + "# Show the plot\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/notebook_compare_statistic_data.ipynb b/notebooks/notebook_compare_statistic_data.ipynb new file mode 100644 index 0000000..5985dfc --- /dev/null +++ b/notebooks/notebook_compare_statistic_data.ipynb @@ -0,0 +1,436 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e79f465a", + "metadata": {}, + "source": [ + "# info" + ] + }, + { + "cell_type": "markdown", + "id": "a2bfa3ca", + "metadata": {}, + "source": [ + "### Velg det første stedet til sammenligningen\n", + "\n", + "For å kunne sammenligne statistisk data for to steder, trenger vi de to stedene.\n", + "\n", + "Denne API-en henter statistisk historisk data, herunder, statistisk data basert på de historiske dataene, ikke reele statistisk historisk. \n", + "\n", + "Statistikken er basert på de historiske datane total sett, ikke for hvert år.\n", + "\n", + "API-en bruker mellom 30 og 50s på å hente dataen, så vær tålmodig!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f21635ac", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "\n", + "# Gets the absolute path to the src folder\n", + "sys.path.append(os.path.abspath(\"../src\"))\n", + "\n", + "# Now we can import the fucntion from the module\n", + "from my_package.year_data import fetch_data\n", + "\n", + "# Import function to replace nordic (æøå)\n", + "from my_package.util import replace_nordic\n", + "\n", + "# User input the city, for the weather\n", + "city_1 = input(\"Enter a city in Norway: \")\n", + "\n", + "city_1 = replace_nordic(city_1)\n", + "\n", + "data_city_1, folder = fetch_data(city_1)" + ] + }, + { + "cell_type": "markdown", + "id": "41293f67", + "metadata": {}, + "source": [ + "### Velg det andre stedet til sammenligningen" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fffd8f42", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "\n", + "# Gets the absolute path to the src folder\n", + "sys.path.append(os.path.abspath(\"../src\"))\n", + "\n", + "# Now we can import the fucntion from the module\n", + "from my_package.year_data import fetch_data\n", + "\n", + "# Import function to replace nordic (æøå)\n", + "from my_package.util import replace_nordic\n", + "\n", + "# User input the city, for the weather\n", + "city_2 = input(\"Enter a city in Norway: \")\n", + "\n", + "city_2 = replace_nordic(city_2)\n", + "\n", + "data_city_2, folder = fetch_data(city_2)" + ] + }, + { + "cell_type": "markdown", + "id": "6f5733f3", + "metadata": {}, + "source": [ + "### Lagre data i json-fil\n", + "\n", + "Skriv inn navn for til filen du vil lagre med dataen.\n", + "\n", + "Eks. 
test\n", + "Da vil filen lagres som data_**test**.json, i mappen \"../data/output_sammenligning_statistikk/data_{filnavn}.json\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32602874", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "\n", + "# Create a combined dict, and 'separete' them with 'city_1' and 'city_2'\n", + "city_weather = {\n", + " \"city_1\": data_city_1,\n", + " \"city_2\": data_city_2\n", + "}\n", + "\n", + "# Gets the absolute path to the src folder\n", + "sys.path.append(os.path.abspath(\"../src\"))\n", + "\n", + "from my_package.write_data import write_data\n", + "\n", + "# Overwrites the folder stored inside the function\n", + "folder = \"../data/output_sammenligning_statistikk\"\n", + "\n", + "filename = input(\"Write filename: \")\n", + "\n", + "# Writes the data, with the chosen name\n", + "write_data(city_weather, folder, filename)" + ] + }, + { + "cell_type": "markdown", + "id": "1ccc995b", + "metadata": {}, + "source": [ + "### Lese fra fil\n", + "Sørger for at dataen lagret over blir mer lesbar.\n", + "\n", + "Og lagrer verdiene for hver av stedene i egne dataframes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "485c474d", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import json\n", + "\n", + "file_path = f'../data/output_sammenligning_statistikk/data_{filename}.json'\n", + "\n", + "\n", + "# Load the whole JSON file\n", + "with open(file_path, 'r') as f:\n", + " all_city_data = json.load(f)\n", + "\n", + "def extract_city_data(data):\n", + " # Checks if the 'result' column is in the data\n", + " if 'result' in data:\n", + " # Normalize the json and store it as a dataframe for better readability\n", + " df = pd.json_normalize(data['result'])\n", + "\n", + " # Display the dataframe\n", + " return df\n", + " else:\n", + " print(\"'result' not in data\")\n", + " return None\n", + "\n", + "# Separate variables for each city\n", + "city_1_df = extract_city_data(all_city_data.get('city_1'))\n", + "city_2_df = extract_city_data(all_city_data.get('city_2'))\n", + "\n", + "# Checks if the data is not empty, aka there are values\n", + "if city_1_df is not None:\n", + " # Prints the city name\n", + " print(f\"{city_1} data:\")\n", + " # Display the dataframe readable\n", + " display(city_1_df)\n", + "else:\n", + " print('\"city_1_df\" is empty')\n", + "\n", + "# Checks if the data is not empty, aka there are values\n", + "if city_2_df is not None:\n", + " # Prints the city name\n", + " print(f\"{city_2} data:\")\n", + " # Display the dataframe readable\n", + " display(city_2_df)\n", + "else:\n", + " print('\"city_2_df\" is empty')\n" + ] + }, + { + "cell_type": "markdown", + "id": "1c782190", + "metadata": {}, + "source": [ + "### Rydder i data\n", + "Fjerner alle kolonner vi ikke trenger, som standardavvik for alle kategorier for alle dager, vi kan regne ut en felles ved å bruke statistisc modulen. \n", + "\n", + "Ettersom alle kateogirene har lik data, ogg vi vil fjerne noen av verdiene fra alle kategoriene. 
Kan vi bruke filter-funksjonen til å filtrere ut dataene som inneholder f.eks. '.st_dev'. Dette gjør at alle kategoriene fjernes på likt, og vi slipper å skrive alle flere ganger." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e0e6fea", + "metadata": {}, + "outputs": [], + "source": [ + "def clean_df(df):\n", + " # Drop all columns that end with '...' using the filter function\n", + " df = df.drop(columns=df.filter(like='.p25').columns)\n", + " df = df.drop(columns=df.filter(like='.p75').columns)\n", + " df = df.drop(columns=df.filter(like='.st_dev').columns)\n", + " df = df.drop(columns=df.filter(like='.num').columns)\n", + "\n", + " return df\n", + "\n", + "# Cleans data for unessecarily columns\n", + "city_1_df = clean_df(city_1_df)\n", + "city_2_df = clean_df(city_2_df)\n", + "\n", + "display(city_1_df)\n", + "display(city_2_df)" + ] + }, + { + "cell_type": "markdown", + "id": "97847344", + "metadata": {}, + "source": [ + "### Plotter temperatur\n", + "Denne koden plotter og sammenligner data basert på gjennomsnittstemperatur gjennom året.
For å sikre lagring av de ulike kjøringene, vil grafen bli lagret i mappen \"../data/output_fig_sammenligning/mean_temp_plot_{city_1}_(city_2).json\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "851e62c8", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import matplotlib.dates as mdates\n", + "import os\n", + "import sys\n", + "\n", + "# Gets the absolute path to the src folder\n", + "sys.path.append(os.path.abspath(\"../src\"))\n", + "\n", + "# Import the kelvin to celsius function\n", + "from my_package.util import kelvin_to_celsius\n", + "\n", + "output_folder = \"../data/output_fig\"\n", + "os.makedirs(output_folder, exist_ok=True) # Create the folder if it doesn't exist\n", + "\n", + "# Converts to and make a new column with celsius temp, and not kelvin\n", + "city_1_df['temp.mean_celsius'] = kelvin_to_celsius(city_1_df['temp.mean'])\n", + "city_2_df['temp.mean_celsius'] = kelvin_to_celsius(city_2_df['temp.mean'])\n", + "\n", + "temp_city_1 = city_1_df['temp.mean_celsius']\n", + "temp_city_2 = city_2_df['temp.mean_celsius']\n", + "\n", + "\n", + "temp_mean_city_1 = temp_city_1.mean().round(2)\n", + "temp_mean_city_2 = temp_city_2.mean().round(2)\n", + "\n", + "# Convert from day and month, to datetime\n", + "# df['date'] = pd.to_datetime(df[['month', 'day']].assign(year=2024))\n", + "\n", + "# Create a new column that concatenates month and day (e.g., \"03-01\" for March 1)\n", + "city_1_df['month_day'] = city_1_df[['month', 'day']].apply(lambda x: f\"{x['month']:02d}-{x['day']:02d}\",axis=1)\n", + "city_2_df['month_day'] = city_2_df[['month', 'day']].apply(lambda x: f\"{x['month']:02d}-{x['day']:02d}\",axis=1)\n", + "\n", + "# Plot the graph of the mean temperature\n", + "plt.figure(figsize=(12, 6))\n", + "plt.plot(city_1_df['month_day'], temp_city_1, color='#2E8B57', label=f'temp {city_1}')\n", + "plt.plot(city_2_df['month_day'], temp_city_2, color='#FFD700', label=f'temp {city_2}')\n", 
+ "\n", + "plt.axhline(temp_mean_city_1, color='#2E8B57', linestyle='dashed', alpha=0.7, label=f'Mean Temperature {city_1}')\n", + "plt.axhline(temp_mean_city_2, color='#FFD700', linestyle='dashed', alpha=0.7, label=f'Mean Temperature {city_2}')\n", + "\n", + "# Label for easier reading and understanding of the plot\n", + "# plt.title(f\"Mean temp - statistic historical {city_name}\")\n", + "plt.xlabel(\"Date\")\n", + "plt.ylabel(\"Temperature (°C)\")\n", + "\n", + "# Add marker at 0 temperature\n", + "plt.axhline(y=0, color='black', linewidth=1.5)\n", + "\n", + "# Customize the x-axis to show ticks and labels only at the start of each month\n", + "plt.gca().xaxis.set_major_locator(mdates.MonthLocator()) \n", + "# Format ticks to show abbreviated month names (e.g., Jan, Feb)\n", + "plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b')) \n", + "\n", + "plt.xticks(rotation=45)\n", + "plt.yticks(range(-20, 30, 2))\n", + "plt.tight_layout()\n", + "plt.grid()\n", + "\n", + "plt.legend()\n", + "\n", + "# Save the plot to the data/output_fig folder\n", + "# plot_path = os.path.join(output_folder, f\"mean_temp_plot_{city_name}.png\")\n", + "# plt.savefig(plot_path) # Save the plot as a PNG file\n", + "\n", + "# Show the plot\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "id": "2884cf46", + "metadata": {}, + "source": [ + "### Plotter data\n", + "Her plottes temperatur og regn på samme akse, med vind i en egen graf under, men de deler samme x-akse, som er month_date." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5baab98a", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import matplotlib.dates as mdates\n", + "import os\n", + "import sys\n", + "\n", + "# Gets the absolute path to the src folder\n", + "sys.path.append(os.path.abspath(\"../src\"))\n", + "\n", + "# Import the kelvin to celsius function\n", + "from my_package.util import kelvin_to_celsius\n", + "\n", + "# Defines the output folder for the figure, and makes it if is does not exsist\n", + "output_folder = \"../data/output_fig\"\n", + "os.makedirs(output_folder, exist_ok=True) \n", + "\n", + "# Converts to and make a new column with celsius temp, and not kelvin\n", + "city_1_df['temp.mean_celsius'] = kelvin_to_celsius(city_1_df['temp.mean'])\n", + "temp_city_1 = city_1_df['temp.mean_celsius']\n", + "precipitation_city_1 = city_1_df['precipitation.mean']\n", + "wind_city_1 = city_1_df['wind.mean']\n", + "\n", + "# Converts to and make a new column with celsius temp, and not kelvin\n", + "city_2_df['temp.mean_celsius'] = kelvin_to_celsius(city_2_df['temp.mean'])\n", + "temp_city_2 = city_2_df['temp.mean_celsius']\n", + "precipitation_city_2 = city_2_df['precipitation.mean']\n", + "wind_city_2 = city_2_df['wind.mean']\n", + "\n", + "# Create a new column that concatenates month and day (e.g., \"03-01\" for March 1)\n", + "city_1_df['month_day'] = city_1_df[['month', 'day']].apply(lambda x: f\"{x['month']:02d}-{x['day']:02d}\",axis=1)\n", + "city_2_df['month_day'] = city_2_df[['month', 'day']].apply(lambda x: f\"{x['month']:02d}-{x['day']:02d}\",axis=1)\n", + "\n", + "x_axis = city_1_df['month_day']\n", + "\n", + "fig, (ax1, ax3) = plt.subplots(2, 1, figsize = (15, 8), sharex=True)\n", + "\n", + "# Plot temperature on the primary y-axis\n", + "ax1.plot(x_axis, temp_city_1, color='#2E8B57', label=f'Temperature {city_1}')\n", + "ax1.plot(x_axis, temp_city_2, color='#FFD700', label=f'Temperature 
{city_2}')\n", + "# ax1.set_xlabel('Datetime')\n", + "ax1.set_ylabel('Temperature (°C)', color='tab:red')\n", + "ax1.tick_params(axis='y', labelcolor='tab:red')\n", + "\n", + "# Add marker at 0 temperature\n", + "ax1.axhline(y=0, color='black', linewidth=1.5)\n", + "\n", + "# Plot precipitation as bars on the secondary y-axis\n", + "ax2 = ax1.twinx()\n", + "\n", + "# ax2.bar(x_axis, precipitation_city_1, color='#2E8B57', alpha=0.5, width=1, label=f'Precipitation {city_1}')\n", + "# ax2.bar(x_axis, precipitation_city_2, color='#FFD700', alpha=0.5, width=1, label=f'Precipitation {city_2}')\n", + "\n", + "ax2.fill_between(x_axis, precipitation_city_1, color='green', alpha=0.3, label=f'{city_1} Total')\n", + "ax2.fill_between(x_axis, precipitation_city_2, color='gold', alpha=0.3, label=f'{city_2} Total')\n", + "\n", + "ax2.set_ylabel(\"Precipitation (mm)\", color='tab:blue')\n", + "ax2.tick_params(axis='y', labelcolor='tab:blue')\n", + "\n", + "ax1.grid(axis = 'x')\n", + "ax1.legend(loc='upper left')\n", + "ax2.legend(loc='upper right')\n", + "\n", + "ax3.plot(x_axis, wind_city_1, color='#2E8B57', label=f'Wind {city_1}')\n", + "ax3.plot(x_axis, wind_city_2, color='#FFD700', label=f'Wind {city_2}')\n", + "# ax3.plot(x_axis, wind_speed, color='tab:purple', linestyle='dashed', label='Wind_speed')\n", + "ax3.set_ylabel('Wind (m/s)')\n", + "ax3.set_xlabel('Datetime')\n", + "ax3.legend(loc='upper right')\n", + "\n", + "ax3.grid(axis = 'x')\n", + "\n", + "\n", + "# Customize the x-axis to show ticks and labels only at the start of each month\n", + "plt.gca().xaxis.set_major_locator(mdates.MonthLocator()) \n", + "# Format ticks to show abbreviated month names (e.g., Jan, Feb)\n", + "plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b')) \n", + "\n", + "plt.tight_layout()\n", + "\n", + "# Show the plot\n", + "plt.show()\n", + "\n", + "# print(df['precipitation.max'].max())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", 
+ "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}