Skip to content

Commit

Permalink
notebook one_week_data - add: util-universal-functions, scatter-temp,…
Browse files Browse the repository at this point in the history
… delete: outliers
  • Loading branch information
torave committed Apr 15, 2025
1 parent 98d3177 commit 339a67c
Showing 1 changed file with 83 additions and 211 deletions.
294 changes: 83 additions & 211 deletions notebooks/notebook_one_week_data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@
"source": [
"### Lese fra fil\n",
"\n",
"Henter opp data lagret i filen, lagd over, og skriver ut lesbart ved hjelp av pandas"
"Ved hjelp av funksjonen `extract_city_df` fjernes unødvendige kolonner, og dataene blir normalisert for lettere lesbarhet."
]
},
{
Expand All @@ -144,11 +144,13 @@
"source": [
"import pandas as pd\n",
"\n",
"# Read json-file using pandas\n",
"data = pd.read_json(f'../data/output_stedsnavn/data_{filename}.json')\n",
"# Reads from file using pandas\n",
"weather_data = pd.read_json(f'../data/output_stedsnavn/data_{filename}.json')\n",
"\n",
"# Display the data\n",
"display(data)"
"from my_package.util import extract_city_df\n",
"\n",
"df = extract_city_df(weather_data)\n",
"display(df)"
]
},
{
Expand All @@ -169,30 +171,15 @@
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"from my_package.util import extract_city_df\n",
"from my_package.util import ensure_rain_column\n",
"from my_package.util import ensure_snow_column\n",
"from my_package.util import ensure_column\n",
"\n",
"df = extract_city_df(data)\n",
"# Columns that must be present in the dataframe\n",
"columns_to_ensure = ['rain.1h', 'snow.1h']\n",
"\n",
"# Checks if the rain is a value, it will not be if it is no rain and then cause a KeyError\n",
"try:\n",
" rain = df['rain.1h']\n",
"\n",
"# If no rain, make the rain column and fill it with NaN\n",
"except KeyError:\n",
" df = ensure_rain_column(df)\n",
"\n",
"# Checks if the snow is a value, it will not be if it is no snow and then cause a KeyError\n",
"try:\n",
" snow = df['snow.1h']\n",
"\n",
"# If no snow, make the snow column and fill it with NaN\n",
"except KeyError:\n",
" df = ensure_snow_column(df)\n",
"# Runs the function with the dataframe and the columns to ensure\n",
"df = ensure_column(df, columns_to_ensure)\n",
"\n",
"# Display the dataframe, with potentially new columns\n",
"display(df) "
]
},
Expand All @@ -201,7 +188,9 @@
"metadata": {},
"source": [
"### Viser temperaturen\n",
"Regner ut gjennomsnittst-temperatur ved hjelp av innebygde funksjoner. Finner også høyeste og laveste målte temperatur."
"Regner ut gjennomsnittstemperatur ved hjelp av innebygde funksjoner. Finner også høyeste og laveste målte temperatur.\n",
"\n",
"Vi plotter også temperaturen i et scatter-diagram, med gjennomsnittet som en stiplet linje. "
]
},
{
Expand All @@ -210,16 +199,64 @@
"metadata": {},
"outputs": [],
"source": [
"# Extract main values\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib.dates as mdates\n",
"\n",
"# Stores the temperature values\n",
"temp = df['main.temp']\n",
"\n",
"temp_mean = temp.mean().round(2)\n",
"temp_max = temp.max().round(2)\n",
"temp_min = temp.min().round(2)\n",
"\n",
"# Print the average temperature\n",
"print(f'Mean temperatur: {temp_mean}')\n",
"print(f'Highest temperatur: {temp_max}')\n",
"print(f'Lowest temperatur: {temp_min}')"
"\n",
"# Find the highest and lowest temperatures\n",
"max_temp = df['main.temp'].max().round(2)\n",
"min_temp = df['main.temp'].min().round(2)\n",
"\n",
"print(\"Highest temperature:\", max_temp)\n",
"print(\"Lowest temperature:\", min_temp)\n",
"\n",
"\n",
"# Set the x_axis to the index, which represents the time\n",
"x_axis = df.index\n",
"\n",
"# Choose the width and height of the plot\n",
"plt.figure(figsize=(12, 6))\n",
"\n",
"# Scatter plot for each temperature reading\n",
"plt.scatter(x_axis, temp, color='tab:red', label='Temperaturmålinger', alpha=0.7)\n",
"\n",
"# Add a horizontal line for the mean temperature\n",
"plt.axhline(y=temp_mean, color='tab:red', linestyle=\"dashed\", label=f'Gj.snitt {temp_mean}°C')\n",
"\n",
"# Get the current axis and store it as ax\n",
"ax = plt.gca()\n",
"\n",
"# Customize the x-axis to show ticks for each day\n",
"ax.xaxis.set_major_locator(mdates.DayLocator(interval=1)) # Tick marks for each day\n",
"ax.xaxis.set_major_formatter(mdates.DateFormatter('%b-%d %H:%M')) # Format as \"Month-day Hour:Minute\"\n",
"\n",
"# Rotate x-axis labels for better readability\n",
"plt.xticks(rotation=45)\n",
"\n",
"# Adjust layout\n",
"plt.tight_layout()\n",
"\n",
"# Add title for the plot\n",
"plt.title(f'Temperatur {city_name}, ({start_date} to {end_date})')\n",
"\n",
"# Add marker at 0 temperature\n",
"plt.axhline(y=0, color='black', linewidth=1.5)\n",
"\n",
"# Show grid\n",
"plt.grid()\n",
"\n",
"# Show legend\n",
"plt.legend(loc='upper right')\n",
"\n",
"# Show the plot\n",
"plt.show()\n"
]
},
{
Expand Down Expand Up @@ -251,7 +288,11 @@
"### Endre manglende verdier\n",
"I de fleste tilfeller ser dataene ut til å være tilnærmet \"perfekte\", men de inkluderer bare snø og regn dersom det er snø eller regn. Derfor vil vi få NaN-verdier i de målingene der det ikke har regnet/snødd. \n",
"\n",
"Under sjekker vi først om regn eller snø er i målingen, og hvis den er, bytter vi ut NaN med 0."
"Under sjekker vi først om regn eller snø er i målingen, og hvis de er, bytter vi ut NaN med 0. \n",
"\n",
"Så sjekker vi om alle verdiene i en kolonne er 'NaN'; i så fall fjerner vi hele kolonnen. Grunnen til at dette ikke inkluderer snø og regn, er at vi senere plotter disse verdiene. Da får vi ikke feil om verdien er 0, men vi vil få feil om hele kolonnen mangler.\n",
"\n",
"Deretter bruker vi `interpolate` dersom det fortsatt skulle være NaN-verdier. Det er en funksjon som tar utgangspunkt i verdien før og verdien etter for å 'gjette' verdien som mangler. Vi har lagt til `limit_direction`, som gjør at den gjetter selv om man bare har en verdi på den ene siden, som f.eks. ved første og siste verdi. "
]
},
{
Expand All @@ -260,50 +301,19 @@
"metadata": {},
"outputs": [],
"source": [
"from my_package.util import fill_rain_column\n",
"from my_package.util import fill_snow_column\n",
"\n",
"df = fill_rain_column(df)\n",
"df = fill_snow_column(df)\n",
"\n",
"# If wind_gust is stored, fill the NaN with 0\n",
"try: \n",
" df['wind.gust'] = df['wind.gust'].fillna(0)\n",
"except KeyError:\n",
" print(\"['wind.gust'], not in df\")\n",
"\n",
"# If wind_deg is stored, fill the NaN with 0\n",
"try: \n",
" df['wind.deg'] = df['wind.deg'].fillna(0)\n",
"except KeyError:\n",
" print(\"['wind.deg'], not in df\")\n",
"\n",
"# If wind_speed is stored, fill the NaN with 0\n",
"try: \n",
" df['wind.speed'] = df['wind.speed'].fillna(0)\n",
"except KeyError:\n",
" print(\"['wind.speed'], not in df\")\n",
"from my_package.util import fill_column_0\n",
"\n",
"# If temperature is missing, take the same as the one before\n",
"df['main.temp'] = df['main.temp'].fillna('obj.ffill()')\n",
"# Columns we want to fill 'NaN' with 0\n",
"columns_to_fill = ['rain.1h', 'snow.1h']\n",
"\n",
"# Forward fill missing values in what the temperature feels like\n",
"df['main.feels_like'] = df['main.feels_like'].fillna('obj.ffill()')\n",
"# Runs the function with the wanted columns\n",
"df = fill_column_0(df, columns_to_fill)\n",
"\n",
"# Forward fill missing values in the pressure\n",
"df['main.pressure'] = df['main.pressure'].fillna('obj.ffill()')\n",
"# Drops every column in which all values are 'NaN'.\n",
"df = df.dropna(axis='columns', how='all')\n",
"\n",
"# Forward fill missing values in the humidity\n",
"df['main.humidity'] = df['main.humidity'].fillna('obj.ffill()')\n",
"\n",
"# Forward fill missing values in the lowest temperature \n",
"df['main.temp_min'] = df['main.temp_min'].fillna('obj.ffill()')\n",
"\n",
"# Forward fill missing values in the highest temperature \n",
"df['main.temp_max'] = df['main.temp_max'].fillna('obj.ffill()')\n",
"\n",
"# Forward fill missing values of clouds\n",
"df['clouds.all'] = df['clouds.all'].fillna('obj.ffill()')\n",
"# Interpolate other missing 'NaN'-values\n",
"df = df.interpolate(method='linear', limit_direction='both')\n",
"\n",
"# Display the df, now without NaN\n",
"display(df)"
Expand Down Expand Up @@ -441,144 +451,6 @@
"# Show the plot\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import statistics\n",
"\n",
"# Extract temperature columns\n",
"temp_mean = df['main.temp']\n",
"\n",
"# Calculate means\n",
"temp_mean_mean = temp_mean.mean()\n",
"\n",
"\n",
"# Calculate standard deviations\n",
"temp_mean_stdev = statistics.stdev(temp_mean)\n",
"\n",
"\n",
"# Calculate 3 standard deviation limits\n",
"mean_lower_limit = temp_mean_mean - (temp_mean_stdev * 3)\n",
"mean_upper_limit = temp_mean_mean + (temp_mean_stdev * 3)\n",
"\n",
"# Identify outliers\n",
"mean_outliers = df.loc[(df['main.temp'] > mean_upper_limit) | (df['main.temp'] < mean_lower_limit), 'main.temp']\n",
"\n",
"# Print the outliers\n",
"print(\"Outliers in main.temp:\")\n",
"print(mean_outliers)\n",
"\n",
"# Replace outliers with NaN\n",
"df.loc[(df['main.temp'] > mean_upper_limit) | (df['main.temp'] < mean_lower_limit), 'main.temp'] = np.nan\n",
"\n",
"# Interpolate to replace NaN values with linear interpolation\n",
"df['main.temp'] = df['main.temp'].interpolate(method='linear')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import matplotlib.dates as mdates\n",
"import os\n",
"\n",
"# Where the figure should be saved when exported\n",
"output_folder = \"../data/output_fig\"\n",
"\n",
"# Creates the folder if it does not exist\n",
"os.makedirs(output_folder, exist_ok=True)\n",
"\n",
"# x_axis set to the index, which mean the datetime\n",
"x_axis = df.index\n",
"\n",
"# Gets the values\n",
"rain = df['rain.1h']\n",
"temp = df['main.temp']\n",
"snow = df['snow.1h']\n",
"wind_gust = df['wind.gust']\n",
"wind_speed = df['wind.speed']\n",
"temp_mean = temp.mean().round(2)\n",
"\n",
"# Two vertically stacked axis, (2 rows, 1 column), width and height of the figure, and the axis share the same x_axis\n",
"fig, (ax1, ax3) = plt.subplots(2, 1,figsize=(15, 8), sharex=True)\n",
"\n",
"\n",
"# Set the title for the diagram, above the first axis, with city_name and input_date\n",
"ax1.set_title(f'Weather data for {city_name} ({start_date}) to ({end_date}) ')\n",
"\n",
"# Plot temperature on the primary y-axis\n",
"ax1.plot(x_axis, temp, color='tab:red', label='Temperature (°C)')\n",
"ax1.axhline(y=temp_mean, color='tab:red', linestyle='dashed', label='Mean temperature (°C)')\n",
"ax1.axhline(y=0, color='black', linewidth=1.5)\n",
"\n",
"# Design the y-axis for temperatur\n",
"ax1.set_ylabel('Temperature (°C)', color='tab:red')\n",
"ax1.tick_params(axis='y', labelcolor='tab:red')\n",
"\n",
"# Plot Precipitation as bars on the secondary y-axis\n",
"ax2 = ax1.twinx()\n",
"\n",
"# Add rain\n",
"# ax2.bar(x_axis, rain, color='tab:blue', alpha=0.5, width=0.02, label='Rain (mm)')\n",
"ax2.hist(x_axis, bins=len(x_axis), weights=rain, color='tab:blue', alpha=0.5, label= 'Rain (mm)', bottom=snow)\n",
"\n",
"# Add snow\n",
"# ax2.bar(x_axis, snow, color='tab:grey', alpha=0.5, width=0.02, label='Snow (mm)')\n",
"ax2.hist(x_axis, bins=len(x_axis), weights=snow, color='tab:gray', alpha=0.5, label= 'Snow (mm)')\n",
"\n",
"# Design the y-axis for precipiation\n",
"ax2.set_ylabel(\"Precipitation (mm)\", color='tab:blue')\n",
"ax2.tick_params(axis='y', labelcolor='tab:blue')\n",
"\n",
"\n",
"# Customize the x-axis to show ticks for each hour\n",
"ax1.xaxis.set_major_locator(mdates.HourLocator(interval=12)) # Tick marks for every hour\n",
"ax1.xaxis.set_major_formatter(mdates.DateFormatter('%d %b %H')) # Format as \"Day Month Hour:Minute\"\n",
"\n",
"# Add label-description for both axis\n",
"ax1.legend(loc='upper left')\n",
"ax2.legend(loc='upper right')\n",
"\n",
"# Add grid, but only vertically\n",
"ax1.grid(axis = 'x')\n",
"\n",
"\n",
"# Plot the wind at the second x-axis (the axis below)\n",
"ax3.plot(x_axis, wind_gust, color='tab:purple', linestyle='dashed', label='Wind_gust')\n",
"ax3.plot(x_axis, wind_speed, color='tab:purple', label='Wind_speed')\n",
"ax3.set_ylabel('Wind (m/s)')\n",
"\n",
"# Add x_label visible for both x-axis\n",
"ax3.set_xlabel('Datetime')\n",
"\n",
"# Add label-description\n",
"ax3.legend(loc='upper right')\n",
"\n",
"# Customize the x-axis to show ticks for each hour\n",
"ax3.xaxis.set_major_locator(mdates.HourLocator(interval=12)) # Tick marks for every hour\n",
"ax3.xaxis.set_major_formatter(mdates.DateFormatter('%d %b %H')) # Format as \"Day Month Hour:Minute\"\n",
"\n",
"# Add grid, but only vertically\n",
"ax3.grid(axis = 'x')\n",
"\n",
"# Adjust layout\n",
"plt.tight_layout()\n",
"\n",
"# Save the plot to the data/output_fig folder\n",
"plot_path = os.path.join(output_folder, f\"weather_data_plot{city_name}.png\")\n",
"plt.savefig(plot_path) # Save the plot as a PNG file\n",
"\n",
"# Show the plot\n",
"plt.show()"
]
}
],
"metadata": {
Expand Down

0 comments on commit 339a67c

Please sign in to comment.