import datetime
import time


def get_unix_timestamps_for_day(date_input=None):
    """Build one Unix timestamp per hour for a single past calendar day.

    Args:
        date_input: Optional date string written as "yyyy, mm, dd"
            (for example "2025, 03, 01"). When omitted, the user is
            prompted for it with ``input()``, as before.

    Returns:
        A tuple ``(date_input, timestamps)``: the raw date string that was
        parsed, and a list of 24 integer Unix timestamps (hour 0 to hour 23
        of that day, converted from local time with ``time.mktime``).

    Raises:
        ValueError: If the text is not three comma-separated integers, is
            not a valid calendar date, or is today's date or later.
    """
    if date_input is None:
        date_input = input("Choose a date (yyyy, mm, dd): ")

    date_components = date_input.split(",")
    if len(date_components) != 3:
        raise ValueError(
            f"Expected a date written as 'yyyy, mm, dd', got {date_input!r}"
        )

    # int() tolerates surrounding whitespace, so "2025, 03, 01" parses fine;
    # anything non-numeric raises ValueError here.
    year, month, day = (int(component) for component in date_components)

    # Validate the whole day once, before doing any work. Comparing
    # calendar dates (rather than each hour against "now") also rejects
    # today's date late in the evening, when every hour start has already
    # passed. datetime.date() itself raises ValueError for impossible dates.
    chosen_day = datetime.date(year, month, day)
    if chosen_day >= datetime.date.today():
        raise ValueError(
            f"Failed, cant use future dates: {chosen_day.isoformat()} "
            "is today or later, choose an earlier date"
        )

    # One (unix timestamp, readable string) pair per hour of the day.
    timestamps = []
    for hour in range(24):
        dt = datetime.datetime(year, month, day, hour, 0)
        unix_timestamp = int(time.mktime(dt.timetuple()))
        timestamps.append((unix_timestamp, dt.strftime('%Y-%m-%d %H:%M:%S')))

    # Echo the chosen date and every hourly timestamp for the reader.
    print(f"Selected date: {year}-{month:02d}-{day:02d}")
    for ts, readable in timestamps:
        print(f"Unix Timestamp: {ts} -> {readable}")

    return date_input, [ts for ts, _ in timestamps]


date, timestamps = get_unix_timestamps_for_day()
import os
import sys

# Make the project's src folder importable from the notebook directory.
sys.path.append(os.path.abspath("../src"))

from my_package.fetch_data import fetch_data
from my_package.util import replace_nordic

# Ask for the first location of the comparison and pass it through
# replace_nordic (per the original comment it replaces the nordic
# letters æ, ø and å).
city_1 = replace_nordic(input("Enter first city name: "))

# The requested period runs from the first to the last hourly timestamp
# produced by the date-selection cell.
start_date = timestamps[0]
end_date = timestamps[-1]

# fetch_data hands back two values: the weather data for the city and a
# folder value, which a later cell overwrites before saving.
data_city_1, folder = fetch_data(start_date, end_date, city_1)
import os
import sys

# Make the project's src folder importable from the notebook directory.
sys.path.append(os.path.abspath("../src"))

from my_package.fetch_data import fetch_data
from my_package.util import replace_nordic

# Ask for the SECOND location of the comparison. The prompt used to say
# "first city", a copy-paste slip from the previous cell that made it
# unclear which of the two locations was being requested.
city_2 = input("Enter second city name: ")

# Per the original comment, replace_nordic replaces the nordic letters (æøå).
city_2 = replace_nordic(city_2)

# The requested period runs from the first to the last hourly timestamp
# produced by the date-selection cell.
start_date, end_date = timestamps[0], timestamps[-1]

# fetch_data hands back two values: the weather data for the city and a
# folder value, which a later cell overwrites before saving.
data_city_2, folder = fetch_data(start_date, end_date, city_2)
import os
import sys

# Make the project's src folder importable from the notebook directory.
sys.path.append(os.path.abspath("../src"))

from my_package.write_data import write_data

# Bundle both locations in one dict, keyed 'city_1' and 'city_2', so the
# two data sets stay separable after being written to a single file.
city_weather = dict(city_1=data_city_1, city_2=data_city_2)

# Replace the folder value returned by fetch_data with the output folder
# used for the one-day comparison.
folder = "../data/output_sammenligning_dag"

# The user picks the file name; the reading cell later opens
# data_{filename}.json inside the folder above.
filename = input("Write filename: ")

write_data(city_weather, folder, filename)
import json

import pandas as pd

from my_package.util import extract_city_df

# Path of the JSON file written by the previous cell.
file_path = f'../data/output_sammenligning_dag/data_{filename}.json'

# Read the whole file back into memory.
with open(file_path, 'r') as json_file:
    all_city_data = json.load(json_file)

# One frame per location. According to the accompanying markdown,
# extract_city_df drops the 'weather' column, uses the time as index and
# normalises the nested data. NOTE(review): confirm against my_package.util.
city_1_df = extract_city_df(all_city_data.get('city_1'))
city_2_df = extract_city_df(all_city_data.get('city_2'))

# Show each location that actually produced a frame, first city first.
for city_label, city_frame in ((city_1, city_1_df), (city_2, city_2_df)):
    if city_frame is None:
        continue
    print(f"{city_label} data:")
    display(city_frame)
def _temperature_summary(temperatures):
    """Return (mean, highest, lowest) of a temperature series, each rounded to 2 decimals."""
    return (
        temperatures.mean().round(2),
        temperatures.max().round(2),
        temperatures.min().round(2),
    )


# Temperature series for both locations; the plotting cell reuses these.
temp_city_1 = city_1_df['main.temp']
temp_city_2 = city_2_df['main.temp']

# Mean, highest and lowest measured temperature per location.
temp_mean_city_1, max_temp_city_1, min_temp_city_1 = _temperature_summary(temp_city_1)
temp_mean_city_2, max_temp_city_2, min_temp_city_2 = _temperature_summary(temp_city_2)

# Collect the figures in a small table so they are easier to compare.
df_temp_cities = pd.DataFrame({
    "City": [city_1, city_2],
    "Mean Temperature (°C)": [temp_mean_city_1, temp_mean_city_2],
    "Highest Temperature (°C)": [max_temp_city_1, max_temp_city_2],
    "Lowest Temperature (°C)": [min_temp_city_1, min_temp_city_2],
})

display(df_temp_cities)
import os
import sys

# Make the project's src folder importable from the notebook directory.
sys.path.append(os.path.abspath("../src"))

from my_package.util import ensure_rain_column, ensure_snow_column

# The rain/snow columns are absent when there was no precipitation. Per the
# original comments, each helper adds its column filled with NaN when it is
# missing, so later cells can always rely on both columns being present.
city_1_df = ensure_snow_column(ensure_rain_column(city_1_df))
display(city_1_df)

city_2_df = ensure_snow_column(ensure_rain_column(city_2_df))
display(city_2_df)
import matplotlib.pyplot as plt
import missingno as msno

# Draw one missing-value matrix per location. The title is added through
# pyplot because two data sets are inspected and the figures would
# otherwise be indistinguishable.
for city_frame, city_label in ((city_1_df, city_1), (city_2_df, city_2)):
    msno.matrix(city_frame)
    plt.title(f'Missing Data for {city_label}')

# Render the figures.
plt.show()
import os
import sys

# Make the project's src folder importable from the notebook directory.
sys.path.append(os.path.abspath("../src"))

from my_package.util import fill_rain_column
from my_package.util import fill_snow_column

# Temperature columns whose gaps are repaired by forward-filling.
TEMPERATURE_COLUMNS = ('main.temp', 'main.temp_min', 'main.temp_max')

# Per the original comments, these helpers replace NaN in the rain/snow
# columns with 0, so "no precipitation" plots as zero instead of as a gap.
city_1_df = fill_rain_column(city_1_df)
city_1_df = fill_snow_column(city_1_df)

city_2_df = fill_rain_column(city_2_df)
city_2_df = fill_snow_column(city_2_df)

# Drop only the columns in which EVERY value is NaN. This runs after the
# rain/snow fill above, so those two columns are kept for plotting
# (assuming the helpers filled them as described).
city_1_df = city_1_df.dropna(axis='columns', how='all')
city_2_df = city_2_df.dropna(axis='columns', how='all')

# Forward-fill missing temperatures: a gap takes the previous reading.
# The earlier code called fillna('obj.ffill()'), which wrote that literal
# string into the gaps instead of forward-filling them.
# A temperature column may have been dropped above if it was entirely NaN,
# hence the membership check. A gap in the very first row has no previous
# value and therefore stays NaN.
for city_frame in (city_1_df, city_2_df):
    for column in TEMPERATURE_COLUMNS:
        if column in city_frame.columns:
            city_frame[column] = city_frame[column].ffill()

# Show both cleaned frames.
display(city_1_df)
display(city_2_df)


# --- Next notebook cell: visualise the data after the clean-up ---

import missingno as msno
import matplotlib.pyplot as plt

# Second missing-value matrix per location, to confirm that the gaps
# handled above are gone.
msno.matrix(city_1_df)
plt.title(f'Missing Data for {city_1}')

msno.matrix(city_2_df)
plt.title(f'Missing Data for {city_2}')

# Render the figures.
plt.show()
import os

import matplotlib.dates as mdates
import matplotlib.pyplot as plt

# Folder the exported figure is written to; created on first use.
output_folder = "../data/output_fig_sammenligning"
os.makedirs(output_folder, exist_ok=True)

# Hourly rain and snow amounts for both locations.
city_1_rain, city_1_snow = city_1_df['rain.1h'], city_1_df['snow.1h']
city_2_rain, city_2_snow = city_2_df['rain.1h'], city_2_df['snow.1h']

# The first location's index supplies the x-axis for everything drawn here.
x_axis = city_1_df.index
# Numeric form of that index, needed to shift the bars sideways.
x_axis_numeric = mdates.date2num(x_axis)

# Two stacked panels sharing one x-axis: temperature above, precipitation below.
fig, (ax1, ax3) = plt.subplots(2, 1, figsize=(15, 8), sharex=True)

# Title above the upper panel.
ax1.set_title(f'Weather compare for {city_1} and {city_2} ({date}) ')

# Bar width in matplotlib date units; each location's bars are shifted half
# a width to either side so they sit next to each other.
width = 0.01

# One row per location:
# (label, temperatures, mean temperature, rain, snow, fill colour, edge colour, bar offset)
city_layers = (
    (city_1, temp_city_1, temp_mean_city_1, city_1_rain, city_1_snow, '#2E8B57', '#FFD700', -width / 2),
    (city_2, temp_city_2, temp_mean_city_2, city_2_rain, city_2_snow, '#FFD700', '#2E8B57', width / 2),
)

# Upper panel: temperature curve plus a dashed line at the mean, per location.
for city_label, temperatures, mean_value, _rain, _snow, fill, _edge, _offset in city_layers:
    ax1.plot(x_axis, temperatures, color=fill, label=f'Temperature {city_label}')
    ax1.axhline(mean_value, color=fill, linestyle='dashed', alpha=0.7, label=f'Mean Temperature {city_label}')

ax1.set_ylabel("Temperature (°C)")
# Vertical grid lines only.
ax1.grid(axis='x')

# Lower panel: rain is stacked on top of snow (bottom=snow) and snow is
# hatched, so both kinds of precipitation stay distinguishable within an hour.
for city_label, _temperatures, _mean_value, rain, snow, fill, edge, offset in city_layers:
    bar_positions = x_axis_numeric + offset
    ax3.bar(bar_positions, rain, width=width, alpha=0.7, color=fill, label=f'Rain {city_label}', edgecolor=edge, bottom=snow)
    ax3.bar(bar_positions, snow, width=width, alpha=0.5, color=fill, label=f'Snow {city_label}', hatch='////', edgecolor=edge)

ax3.set_ylabel("Precipitation (mm)")
# Vertical grid lines only.
ax3.grid(axis='x')

# One tick per hour, shown as "HH:MM".
ax1.xaxis.set_major_locator(mdates.HourLocator())
ax1.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))

# Legends for both panels and a label for the shared x-axis.
ax1.legend(loc='upper left')
ax3.legend(loc='upper left')
ax3.set_xlabel('Datetime')

# Export the figure as a PNG named after both locations, then display it.
plot_path = os.path.join(output_folder, f"weather_compare_plot_{city_1}_{city_2}.png")
plt.savefig(plot_path)

plt.show()