diff --git a/notebooks/notebook_statistic_data.ipynb b/notebooks/notebook_statistic_data.ipynb index d3527e5..d6cc41f 100644 --- a/notebooks/notebook_statistic_data.ipynb +++ b/notebooks/notebook_statistic_data.ipynb @@ -209,7 +209,8 @@ "import matplotlib.dates as mdates\n", "\n", "output_folder = \"../data/figures/output_fig_statistic\"\n", - "os.makedirs(output_folder, exist_ok=True) # Create the folder if it doesn't exist\n", + "# Create the folder if it doesn't exist\n", + "os.makedirs(output_folder, exist_ok=True)\n", "\n", "\n", "temp = df['temp.mean_celsius']\n", @@ -380,37 +381,61 @@ " \"winter\": winter\n", " }\n", "\n", - "def check_outliers(seasons, temp):\n", - " temp_mean = df[temp]\n", + "def check_outliers(df, seasons, temp_column):\n", + " '''\n", + " This function takes in the parameters df, seasons and temp_column. Seasons is a dictionary mapping each season name to a mask of its dates.\n", + " The temp_column names the column holding either the mean temperature, the max measured temperature or the lowest measured temperature for each day.\n", + " For each season, the function calculates both the mean and the standard deviation of that column, before calculating the upper and\n", + " lower limits 3 standard deviations away from the mean. It then uses these limits to find the outliers, which are the values\n", + " more than 3 standard deviations above or below the mean. This is based on the seasons, so for each season, it finds the outliers.\n", + " The outliers are then given the NaN value, before using interpolate to 'guess' the missing values. Then the outliers are dealt with. 
\n", + " '''\n", "\n", + " # Finds the temperature column in the dataframe according to the given column name\n", + " temp = df[temp_column]\n", + "\n", + " # Goes through each season, calculates the mean and st.dev, and searches for outliers\n", " for season_name, season_mask in seasons.items():\n", - " temp_mean_season = temp_mean[season_mask]\n", - " season_mean = temp_mean_season.mean()\n", - " season_stdev = temp_mean_season.std()\n", "\n", + " # Stores the temperature based on the season mask dates\n", + " temp_season = temp[season_mask]\n", + "\n", + " # Calculates the mean for the season\n", + " season_mean = temp_season.mean()\n", + "\n", + " # Calculates the standard deviation for the season\n", + " season_stdev = temp_season.std()\n", + "\n", + " # Calculates the limits for 3 standard deviations from the mean\n", " upper_limit = season_mean + (season_stdev * 3)\n", " lower_limit = season_mean - (season_stdev * 3)\n", "\n", - " outliers_mask = (temp_mean_season > upper_limit) | (temp_mean_season < lower_limit)\n", + " # Creates a mask for the temperatures above or below the upper/lower limit\n", + " outliers_mask = (temp_season > upper_limit) | (temp_season < lower_limit)\n", "\n", - " outlier_index = temp_mean_season.index[outliers_mask]\n", + " # Gets the index for the outliers\n", + " outlier_index = temp_season.index[outliers_mask].tolist()\n", "\n", + " # Checks if the 'month_day' column is in the dataframe, and if so includes it in the output for the outliers\n", " if 'month_day' in df.columns:\n", - " outliers = df.loc[outlier_index, ['month_day', temp]]\n", + " outliers = df.loc[outlier_index, ['month_day', temp_column]]\n", " else:\n", - " outliers = df.loc[outlier_index, temp]\n", + " outliers = df.loc[outlier_index, temp_column]\n", "\n", + " # Prints the season name, before the outliers\n", " print(season_name)\n", " print(outliers)\n", "\n", " # Replace with NaN\n", - " df.loc[outlier_index, temp] = np.nan\n", + " df.loc[outlier_index, temp_column] = np.nan\n", 
"\n", - " df[temp] = df[temp].interpolate(method='linear', limit_direction='both')\n", + " # Interpolate other missing 'NaN'-values\n", + " df[temp_column] = df[temp_column].interpolate(method='linear', limit_direction='both')\n", "\n", - "check_outliers(seasons, 'temp.mean_celsius')\n", - "check_outliers(seasons, 'temp.record_min_celsius')\n", - "check_outliers(seasons, 'temp.max_celsius')" + "# Runs the function for each of the wanted temperature columns in the df\n", + "check_outliers(df, seasons, 'temp.mean_celsius')\n", + "check_outliers(df, seasons, 'temp.record_min_celsius')\n", + "check_outliers(df, seasons, 'temp.max_celsius')" ] }, { @@ -448,8 +473,9 @@ "plt.plot(temp_record_min.index, temp_record_min, color='tab:blue', label='Min Temperature')\n", "\n", "# Customize the x-axis to show ticks and labels only at the start of each month\n", - "plt.gca().xaxis.set_major_locator(mdates.MonthLocator()) \n", - "plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b')) # Format ticks to show abbreviated month names (e.g., Jan, Feb)\n", + "plt.gca().xaxis.set_major_locator(mdates.MonthLocator())\n", + "# Format ticks to show abbreviated month names (e.g., Jan, Feb)\n", + "plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b')) \n", "\n", "plt.axhline(y=0, color='black', linewidth=1.5)\n", "\n", @@ -544,7 +570,6 @@ "# Normalize the data for better readability\n", "df_records = pd.json_normalize(data)\n", "\n", - "\n", "# Displays the dataframe\n", "display(df_records)" ]