Skip to content

Commit

Permalink
add comments and function documentation notebook_statistic
Browse files Browse the repository at this point in the history
  • Loading branch information
toravest committed May 24, 2025
1 parent 7ffe658 commit 22394b6
Showing 1 changed file with 43 additions and 18 deletions.
61 changes: 43 additions & 18 deletions notebooks/notebook_statistic_data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,8 @@
"import matplotlib.dates as mdates\n",
"\n",
"output_folder = \"../data/figures/output_fig_statistic\"\n",
"os.makedirs(output_folder, exist_ok=True) # Create the folder if it doesn't exist\n",
"# Create the folder if it doesn't exist\n",
"os.makedirs(output_folder, exist_ok=True)\n",
"\n",
"\n",
"temp = df['temp.mean_celsius']\n",
Expand Down Expand Up @@ -380,37 +381,61 @@
" \"winter\": winter\n",
" }\n",
"\n",
"def check_outliers(seasons, temp):\n",
" temp_mean = df[temp]\n",
"def check_outliers(df, seasons, temp_column):\n",
" '''\n",
" This function takes in the parameters df, seasons and temp_column. Seasons is a dictionary that maps each season name to a mask for its dates.\n",
" The temp_column is the name of the column to check: either the mean temperature, the max measured temperature or the lowest measured temperature for each day.\n",
" For each season, the function calculates the mean and standard deviation of that column, and then the upper and\n",
" lower limits at 3 standard deviations away from the mean. It uses these limits to find the outliers, which are the values\n",
" more than 3 standard deviations above or below the mean for that season. The outliers are printed for each season.\n",
" The outliers are then given the NaN value, before using linear interpolation to 'guess' the missing values.\n",
" '''\n",
"\n",
" # Selects the temperature column from the dataframe according to the given column name\n",
" temp = df[temp_column]\n",
"\n",
" # Goes through each season, calculating the mean and standard deviation and searching for outliers\n",
" for season_name, season_mask in seasons.items():\n",
" temp_mean_season = temp_mean[season_mask]\n",
" season_mean = temp_mean_season.mean()\n",
" season_stdev = temp_mean_season.std()\n",
"\n",
" # Stores the temperature based on the season mask dates\n",
" temp_season = temp[season_mask]\n",
"\n",
" # Calculates the mean for the season\n",
" season_mean = temp_season.mean()\n",
"\n",
" # Calculates the standard deviation for the season\n",
" season_stdev = temp_season.std()\n",
"\n",
" # Calculates the limits for 3 standard deviation from the mean\n",
" upper_limit = season_mean + (season_stdev * 3)\n",
" lower_limit = season_mean - (season_stdev * 3)\n",
"\n",
" outliers_mask = (temp_mean_season > upper_limit) | (temp_mean_season < lower_limit)\n",
" # Creates a mask for the temperatures above or below the upper/lower limit\n",
" outliers_mask = (temp_season > upper_limit) | (temp_season < lower_limit)\n",
"\n",
" outlier_index = temp_mean_season.index[outliers_mask]\n",
" # Gets the index for the outliers\n",
" outlier_index = temp_season.index[outliers_mask].tolist()\n",
"\n",
" # Checks if the 'month_day' column is in the dataframe, and if so includes it in the output for the outliers\n",
" if 'month_day' in df.columns:\n",
" outliers = df.loc[outlier_index, ['month_day', temp]]\n",
" outliers = df.loc[outlier_index, ['month_day', temp_column]]\n",
" else:\n",
" outliers = df.loc[outlier_index, temp]\n",
" outliers = df.loc[outlier_index, temp_column]\n",
"\n",
" # Prints the season name, before the outliers\n",
" print(season_name)\n",
" print(outliers)\n",
"\n",
" # Replace with NaN\n",
" df.loc[outlier_index, temp] = np.nan\n",
" df.loc[outlier_index, temp_column] = np.nan\n",
"\n",
" df[temp] = df[temp].interpolate(method='linear', limit_direction='both')\n",
" # Interpolate other missing 'NaN'-values\n",
" df[temp_column] = df[temp_column].interpolate(method='linear', limit_direction='both')\n",
"\n",
"check_outliers(seasons, 'temp.mean_celsius')\n",
"check_outliers(seasons, 'temp.record_min_celsius')\n",
"check_outliers(seasons, 'temp.max_celsius')"
"# Runs the function for each of the wanted temperature columns in the df\n",
"check_outliers(df, seasons, 'temp.mean_celsius')\n",
"check_outliers(df, seasons, 'temp.record_min_celsius')\n",
"check_outliers(df, seasons, 'temp.max_celsius')"
]
},
{
Expand Down Expand Up @@ -448,8 +473,9 @@
"plt.plot(temp_record_min.index, temp_record_min, color='tab:blue', label='Min Temperature')\n",
"\n",
"# Customize the x-axis to show ticks and labels only at the start of each month\n",
"plt.gca().xaxis.set_major_locator(mdates.MonthLocator()) \n",
"plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b')) # Format ticks to show abbreviated month names (e.g., Jan, Feb)\n",
"plt.gca().xaxis.set_major_locator(mdates.MonthLocator())\n",
"# Format ticks to show abbreviated month names (e.g., Jan, Feb)\n",
"plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b')) \n",
"\n",
"plt.axhline(y=0, color='black', linewidth=1.5)\n",
"\n",
Expand Down Expand Up @@ -544,7 +570,6 @@
"# Normalize the data for better readability\n",
"df_records = pd.json_normalize(data)\n",
"\n",
"\n",
"# Displays the dataframe\n",
"display(df_records)"
]
Expand Down

0 comments on commit 22394b6

Please sign in to comment.