From 7ffe6587e73f2b3bf11297a4fc3358a79090146c Mon Sep 17 00:00:00 2001 From: toravest Date: Fri, 23 May 2025 17:18:01 +0200 Subject: [PATCH] add seasons for outliers statistic data --- notebooks/notebook_statistic_data.ipynb | 85 ++++++++++++------------- 1 file changed, 41 insertions(+), 44 deletions(-) diff --git a/notebooks/notebook_statistic_data.ipynb b/notebooks/notebook_statistic_data.ipynb index 5ba4d36..d3527e5 100644 --- a/notebooks/notebook_statistic_data.ipynb +++ b/notebooks/notebook_statistic_data.ipynb @@ -357,63 +357,60 @@ "outputs": [], "source": [ "import numpy as np\n", - "import statistics\n", + "import pandas as pd\n", "\n", - "# Ensure 'month_day' is set as the index\n", - "if 'month_day' in df.columns:\n", - " df.set_index('month_day', inplace=True)\n", - "else:\n", - " print('month_day not in')\n", + "df = df.reset_index()\n", "\n", - "# Extract temperature columns\n", - "temp_mean = df['temp.mean_celsius']\n", - "temp_record_min = df['temp.record_min_celsius']\n", - "temp_record_max = df['temp.record_max_celsius']\n", + "# Makes a mask for spring days\n", + "spring = (df['month_day'] >= '03-01') & (df['month_day'] <= '05-31')\n", + "\n", + "# Makes a mask for summer days\n", + "summer = (df['month_day'] >= '06-01') & (df['month_day'] <= '08-31') \n", + "\n", + "# Makes a mask for autumn days\n", + "autumn = (df['month_day'] >= '09-01') & (df['month_day'] <= '11-20')\n", + "\n", + "# Makes a mask for winter days, we need the or statement to get both sides of the calendar year\n", + "winter = ((df['month_day'] >= '12-01') & (df['month_day'] <= '12-31')) | ((df['month_day'] >= '01-01') & (df['month_day'] <= '02-29'))\n", "\n", - "# Calculate means\n", - "temp_mean_mean = temp_mean.mean()\n", - "temp_record_min_mean = temp_record_min.mean()\n", - "temp_record_max_mean = temp_record_max.mean()\n", + "seasons = {\n", + " \"spring\": spring,\n", + " \"summer\": summer,\n", + " \"autumn\": autumn,\n", + " \"winter\": winter\n", + " }\n", "\n", - "# 
Calculate standard deviations\n", - "temp_mean_stdev = statistics.stdev(temp_mean)\n", - "temp_record_min_stdev = statistics.stdev(temp_record_min)\n", - "temp_record_max_stdev = statistics.stdev(temp_record_max)\n", + "def check_outliers(seasons, temp):\n", + " temp_mean = df[temp]\n", "\n", - "# Calculate 3 standard deviation limits\n", - "mean_lower_limit = temp_mean_mean - (temp_mean_stdev * 3)\n", - "mean_upper_limit = temp_mean_mean + (temp_mean_stdev * 3)\n", + " for season_name, season_mask in seasons.items():\n", + " temp_mean_season = temp_mean[season_mask]\n", + " season_mean = temp_mean_season.mean()\n", + " season_stdev = temp_mean_season.std()\n", "\n", - "min_lower_limit = temp_record_min_mean - (temp_record_min_stdev * 3)\n", - "min_upper_limit = temp_record_min_mean + (temp_record_min_stdev * 3)\n", + " upper_limit = season_mean + (season_stdev * 3)\n", + " lower_limit = season_mean - (season_stdev * 3)\n", "\n", - "max_lower_limit = temp_record_max_mean - (temp_record_max_stdev * 3)\n", - "max_upper_limit = temp_record_max_mean + (temp_record_max_stdev * 3)\n", + " outliers_mask = (temp_mean_season > upper_limit) | (temp_mean_season < lower_limit)\n", "\n", - "# Identify outliers\n", - "mean_outliers = df.loc[(df['temp.mean_celsius'] > mean_upper_limit) | (df['temp.mean_celsius'] < mean_lower_limit), 'temp.mean_celsius']\n", - "min_outliers = df.loc[(df['temp.record_min_celsius'] > min_upper_limit) | (df['temp.record_min_celsius'] < min_lower_limit), 'temp.record_min_celsius']\n", - "max_outliers = df.loc[(df['temp.record_max_celsius'] > max_upper_limit) | (df['temp.record_max_celsius'] < max_lower_limit), 'temp.record_max_celsius']\n", + " outlier_index = temp_mean_season.index[outliers_mask]\n", "\n", - "# Print the outliers\n", - "print(\"Outliers in temp.mean_celsius:\")\n", - "print(mean_outliers)\n", + " if 'month_day' in df.columns:\n", + " outliers = df.loc[outlier_index, ['month_day', temp]]\n", + " else:\n", + " outliers = 
df.loc[outlier_index, temp]\n", "\n", - "print(\"Outliers in temp.record_min_celsius:\")\n", - "print(min_outliers)\n", + " print(season_name)\n", + " print(outliers)\n", "\n", - "print(\"Outliers in temp.record_max_celsius:\")\n", - "print(max_outliers)\n", + " # Replace with NaN\n", + " df.loc[outlier_index, temp] = np.nan\n", "\n", - "# Replace outliers with NaN\n", - "df.loc[(df['temp.mean_celsius'] > mean_upper_limit) | (df['temp.mean_celsius'] < mean_lower_limit), 'temp.mean_celsius'] = np.nan\n", - "df.loc[(df['temp.record_min_celsius'] > min_upper_limit) | (df['temp.record_min_celsius'] < min_lower_limit), 'temp.record_min_celsius'] = np.nan\n", - "df.loc[(df['temp.record_max_celsius'] > max_upper_limit) | (df['temp.record_max_celsius'] < max_lower_limit), 'temp.record_max_celsius'] = np.nan\n", + " df[temp] = df[temp].interpolate(method='linear', limit_direction='both')\n", "\n", - "# Interpolate to replace NaN values with linear interpolation\n", - "df['temp.mean_celsius'] = df['temp.mean_celsius'].interpolate(method='linear', limit_direction='both')\n", - "df['temp.record_min_celsius'] = df['temp.record_min_celsius'].interpolate(method='linear', limit_direction='both')\n", - "df['temp.record_max_celsius'] = df['temp.record_max_celsius'].interpolate(method='linear', limit_direction='both')" + "check_outliers(seasons, 'temp.mean_celsius')\n", + "check_outliers(seasons, 'temp.record_min_celsius')\n", + "check_outliers(seasons, 'temp.max_celsius')" ] }, {