Skip to content

Commit

Permalink
add seasons for outliers statistic data
Browse files Browse the repository at this point in the history
  • Loading branch information
toravest committed May 23, 2025
1 parent 9a25ccf commit 7ffe658
Showing 1 changed file with 41 additions and 44 deletions.
85 changes: 41 additions & 44 deletions notebooks/notebook_statistic_data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -357,63 +357,60 @@
"outputs": [],
"source": [
"import numpy as np\n",
"import statistics\n",
"import pandas as pd\n",
"\n",
"# Ensure 'month_day' is set as the index\n",
"if 'month_day' in df.columns:\n",
" df.set_index('month_day', inplace=True)\n",
"else:\n",
" print('month_day not in')\n",
"df = df.reset_index()\n",
"\n",
"# Extract temperature columns\n",
"temp_mean = df['temp.mean_celsius']\n",
"temp_record_min = df['temp.record_min_celsius']\n",
"temp_record_max = df['temp.record_max_celsius']\n",
"# Makes a mask for spring days\n",
"spring = (df['month_day'] >= '03-01') & (df['month_day'] <= '05-31')\n",
"\n",
"# Makes a mask for summer days\n",
"summer = (df['month_day'] >= '06-01') & (df['month_day'] <= '08-31') \n",
"\n",
"# Makes a mask for autumn days\n",
"autumn = (df['month_day'] >= '09-01') & (df['month_day'] <= '11-20')\n",
"\n",
"# Makes a mask for winter days; we need the or statement to get both sides of the calendar year\n",
"winter = ((df['month_day'] >= '12-01') & (df['month_day'] <= '12-31')) | ((df['month_day'] >= '01-01') & (df['month_day'] <= '02-29'))\n",
"\n",
"# Calculate means\n",
"temp_mean_mean = temp_mean.mean()\n",
"temp_record_min_mean = temp_record_min.mean()\n",
"temp_record_max_mean = temp_record_max.mean()\n",
"seasons = {\n",
" \"spring\": spring,\n",
" \"summer\": summer,\n",
" \"autumn\": autumn,\n",
" \"winter\": winter\n",
" }\n",
"\n",
"# Calculate standard deviations\n",
"temp_mean_stdev = statistics.stdev(temp_mean)\n",
"temp_record_min_stdev = statistics.stdev(temp_record_min)\n",
"temp_record_max_stdev = statistics.stdev(temp_record_max)\n",
"def check_outliers(seasons, temp):\n",
" temp_mean = df[temp]\n",
"\n",
"# Calculate 3 standard deviation limits\n",
"mean_lower_limit = temp_mean_mean - (temp_mean_stdev * 3)\n",
"mean_upper_limit = temp_mean_mean + (temp_mean_stdev * 3)\n",
" for season_name, season_mask in seasons.items():\n",
" temp_mean_season = temp_mean[season_mask]\n",
" season_mean = temp_mean_season.mean()\n",
" season_stdev = temp_mean_season.std()\n",
"\n",
"min_lower_limit = temp_record_min_mean - (temp_record_min_stdev * 3)\n",
"min_upper_limit = temp_record_min_mean + (temp_record_min_stdev * 3)\n",
" upper_limit = season_mean + (season_stdev * 3)\n",
" lower_limit = season_mean - (season_stdev * 3)\n",
"\n",
"max_lower_limit = temp_record_max_mean - (temp_record_max_stdev * 3)\n",
"max_upper_limit = temp_record_max_mean + (temp_record_max_stdev * 3)\n",
" outliers_mask = (temp_mean_season > upper_limit) | (temp_mean_season < lower_limit)\n",
"\n",
"# Identify outliers\n",
"mean_outliers = df.loc[(df['temp.mean_celsius'] > mean_upper_limit) | (df['temp.mean_celsius'] < mean_lower_limit), 'temp.mean_celsius']\n",
"min_outliers = df.loc[(df['temp.record_min_celsius'] > min_upper_limit) | (df['temp.record_min_celsius'] < min_lower_limit), 'temp.record_min_celsius']\n",
"max_outliers = df.loc[(df['temp.record_max_celsius'] > max_upper_limit) | (df['temp.record_max_celsius'] < max_lower_limit), 'temp.record_max_celsius']\n",
" outlier_index = temp_mean_season.index[outliers_mask]\n",
"\n",
"# Print the outliers\n",
"print(\"Outliers in temp.mean_celsius:\")\n",
"print(mean_outliers)\n",
" if 'month_day' in df.columns:\n",
" outliers = df.loc[outlier_index, ['month_day', temp]]\n",
" else:\n",
" outliers = df.loc[outlier_index, temp]\n",
"\n",
"print(\"Outliers in temp.record_min_celsius:\")\n",
"print(min_outliers)\n",
" print(season_name)\n",
" print(outliers)\n",
"\n",
"print(\"Outliers in temp.record_max_celsius:\")\n",
"print(max_outliers)\n",
" # Replace with NaN\n",
" df.loc[outlier_index, temp] = np.nan\n",
"\n",
"# Replace outliers with NaN\n",
"df.loc[(df['temp.mean_celsius'] > mean_upper_limit) | (df['temp.mean_celsius'] < mean_lower_limit), 'temp.mean_celsius'] = np.nan\n",
"df.loc[(df['temp.record_min_celsius'] > min_upper_limit) | (df['temp.record_min_celsius'] < min_lower_limit), 'temp.record_min_celsius'] = np.nan\n",
"df.loc[(df['temp.record_max_celsius'] > max_upper_limit) | (df['temp.record_max_celsius'] < max_lower_limit), 'temp.record_max_celsius'] = np.nan\n",
" df[temp] = df[temp].interpolate(method='linear', limit_direction='both')\n",
"\n",
"# Interpolate to replace NaN values with linear interpolation\n",
"df['temp.mean_celsius'] = df['temp.mean_celsius'].interpolate(method='linear', limit_direction='both')\n",
"df['temp.record_min_celsius'] = df['temp.record_min_celsius'].interpolate(method='linear', limit_direction='both')\n",
"df['temp.record_max_celsius'] = df['temp.record_max_celsius'].interpolate(method='linear', limit_direction='both')"
"check_outliers(seasons, 'temp.mean_celsius')\n",
"check_outliers(seasons, 'temp.record_min_celsius')\n",
"check_outliers(seasons, 'temp.max_celsius')"
]
},
{
Expand Down

0 comments on commit 7ffe658

Please sign in to comment.