diff --git a/notebooks/notebook_compare_one_day_data.ipynb b/notebooks/notebook_compare_one_day_data.ipynb index e0e5fd4..e9f1bec 100644 --- a/notebooks/notebook_compare_one_day_data.ipynb +++ b/notebooks/notebook_compare_one_day_data.ipynb @@ -93,21 +93,19 @@ "sys.path.append(os.path.abspath(\"../src\"))\n", "\n", "# Now we can import the fucntion from the module\n", - "from my_package.fetch_data import fetch_data\n", + "from my_package.data import fetch_time_data\n", "\n", - "# Import function to replace nordic (æøå)\n", - "from my_package.util import replace_nordic\n", + "# Import function to for input_place, replace æøå\n", + "from my_package.util import input_place\n", "\n", - "# User choose a city they want the weather data from\n", - "city_1 = input(\"Enter first city name: \")\n", - "\n", - "city_1 = replace_nordic(city_1)\n", + "# User input the city, for the weather\n", + "city_1 = input_place()\n", "\n", "# Start_date is the first timestamp, end_date is the last\n", "start_date, end_date = timestamps[0], timestamps[-1]\n", "\n", "# Stores the values in the variables\n", - "data_city_1, folder = fetch_data(start_date, end_date, city_1)" + "data_city_1, folder = fetch_time_data(start_date, end_date, city_1)" ] }, { @@ -131,21 +129,19 @@ "sys.path.append(os.path.abspath(\"../src\"))\n", "\n", "# Now we can import the fucntion from the module\n", - "from my_package.fetch_data import fetch_data\n", - "\n", - "# Import function to replace nordic (æøå)\n", - "from my_package.util import replace_nordic\n", + "from my_package.data import fetch_time_data\n", "\n", - "# User choose a city they want the weather data from\n", - "city_2 = input(\"Enter first city name: \")\n", + "# Import function to for input_place, replace æøå\n", + "from my_package.util import input_place\n", "\n", - "city_2 = replace_nordic(city_2)\n", + "# User input the city, for the weather\n", + "city_2 = input_place()\n", "\n", "# Start_date is the first timestamp, end_date is the last\n", 
"start_date, end_date = timestamps[0], timestamps[-1]\n", "\n", "# Stores the values in the variables\n", - "data_city_2, folder = fetch_data(start_date, end_date, city_2)" + "data_city_2, folder = fetch_time_data(start_date, end_date, city_2)" ] }, { @@ -180,7 +176,7 @@ "# Gets the absolute path to the src folder\n", "sys.path.append(os.path.abspath(\"../src\"))\n", "\n", - "from my_package.write_data import write_data\n", + "from my_package.data import write_data\n", "\n", "# Overwrites the folder stored inside the function\n", "folder = \"../data/output_sammenligning_dag\"\n", @@ -199,12 +195,9 @@ "\n", "Henter opp data lagret i filen, lagd over, og skriver ut lesbart ved hjelp av pandas\n", "\n", - "Har har vi laget en funksjon som henter ut dataene for ønsket sted, og gjør endringer vi ønsker skal bli gjort for dataen for begge steder som:\n", - "- fjerner 'weather' kolonnen, som inneholder metadata\n", - "- setter tiden som index\n", - "- normaliserer, slik at det er enklere å lese all dataen\n", + "Vi importerer funksjonen `extract_city_df` som fjerner uønskede kolonner, og returnerer dataen mer lesbart.\n", " \n", - "Vi sjekker også at vi har data for stedene, altså at funskjonen funker, før den eventuelt viser dataen for stedene." + "Vi lagrer dataen for begge byene, men legger til kolonne 'city' for å lagre id og 'city_name' for å lagre stedsnavn. Før vi slår sammen begge dataene til en dataframe, for å lettere bruke Pandas SQL for å hente ut ønsket data for begge stedene." 
] }, { @@ -215,32 +208,35 @@ "source": [ "import pandas as pd\n", "import json\n", + "from pandasql import sqldf\n", "\n", "file_path = f'../data/output_sammenligning_dag/data_{filename}.json'\n", "\n", - "from my_package.util import extract_city_df\n", + "from my_package.data import extract_city_df\n", "\n", "# Load the whole JSON file\n", "with open(file_path, 'r') as f:\n", " all_city_data = json.load(f)\n", "\n", - "# Separate variables for each city\n", + "# Extract and label each city, with id and name\n", "city_1_df = extract_city_df(all_city_data.get('city_1'))\n", + "city_1_df['city'] = 'city_1'\n", + "city_1_df['city_name'] = city_1\n", + "\n", "city_2_df = extract_city_df(all_city_data.get('city_2'))\n", + "city_2_df['city'] = 'city_2'\n", + "city_2_df['city_name'] = city_2\n", + "\n", + "# Concat both city df into one, to be able to use pandas sql\n", + "both_cities_df = pd.concat([city_1_df, city_2_df])\n", + "\n", + "# Pandas SQL adds more values to dt, example \"yyyy-mm-dd hh:mm:ss:xxxxx\", so to delete the extra x´s we reset the index\n", + "both_cities_df = both_cities_df.reset_index()\n", + "# And add dt with the wanted format\n", + "both_cities_df['dt'] = both_cities_df['dt'].dt.strftime('%Y-%m-%d %H:%M:%S')\n", "\n", - "# Checks if the data is not empty, aka there are values\n", - "if city_1_df is not None:\n", - " # Prints the city name\n", - " print(f\"{city_1} data:\")\n", - " # Display the dataframe readable\n", - " display(city_1_df)\n", - "\n", - "# Checks if the data is not empty, aka there are values\n", - "if city_2_df is not None:\n", - " # Prints the city name\n", - " print(f\"{city_2} data:\")\n", - " # Display the dataframe readable\n", - " display(city_2_df)" + "# Display all the values from both cities, with city 1 first, then city 2\n", + "display(sqldf('''SELECT * FROM both_cities_df'''))" ] }, { @@ -248,9 +244,9 @@ "metadata": {}, "source": [ "### Viser temperaturen\n", - "Regner ut gjennomsnitts-temperatur ved hjelp av 
innebygde funksjoner. Finner også høyeste og laveste målte temperatur for begge steder.\n", + "Vi bruker pandas SQL for å hente ut temperaturen for begge stedene, og lagrer de i en tabell. Ved hjelp av 'pivot', en innebygd funksjon for å rotere tabeller, setter vi den opp for bedre lesbarhet, ved å sette begge byene ved siden av hverandre og 'dt' som index. Dette gjør det lettere å sammenligne temperaturen for begge stedene til samme tid.\n", "\n", - "Legger dataen inn i en dataframe for å vise de i en ryddigere og mer lettlest tabell." + "Ved hjelp av en pandas SQL setning kan vi hente og lagre gjennomsnitt, maksimalt og minste målte temperatur for begge stedene. Senere kan vi bare skrive en SELECT setning til denne variabelen, for å eksempelvis hente ut gjennomsnittsdata for sted 1." ] }, { @@ -259,49 +255,40 @@ "metadata": {}, "outputs": [], "source": [ - "# Stores the temperature values of both cities\n", - "temp_city_1 = city_1_df['main.temp']\n", - "temp_city_2 = city_2_df['main.temp']\n", - "\n", - "# Find the mean temperature in both citites\n", - "temp_mean_city_1 = temp_city_1.mean().round(2)\n", - "temp_mean_city_2 = temp_city_2.mean().round(2)\n", - "\n", - "# Find the highest temperature in both cities\n", - "max_temp_city_1 = city_1_df['main.temp'].max().round(2)\n", - "max_temp_city_2 = city_2_df['main.temp'].max().round(2)\n", - "\n", - "# Find the lowest tempeartues in both cities\n", - "min_temp_city_1 = city_1_df['main.temp'].min().round(2)\n", - "min_temp_city_2 = city_2_df['main.temp'].min().round(2)\n", - "\n", - "# Stores the values of both city in a list\n", - "city_names = [city_1, city_2]\n", - "mean_temp = [temp_mean_city_1, temp_mean_city_2]\n", - "max_temp = [max_temp_city_1, max_temp_city_2]\n", - "min_temp = [min_temp_city_1, min_temp_city_2]\n", - "\n", - "# Creates dataframe of the tempvalues, to display it more readable\n", - "df_temp_cities = pd.DataFrame({\n", - " \"City\": city_names,\n", - " \"Mean Temperature (°C)\": 
mean_temp,\n", - " \"Highest Temperature (°C)\": max_temp,\n", - " \"Lowest Temperature (°C)\": min_temp\n", - "})\n", - "\n", - "# Display the dataframe with the values of both cities\n", - "display(df_temp_cities)\n" + "# Stores the `main.temp` from both cities using pandas sql\n", + "temp_data = sqldf('''\n", + " SELECT dt, city_name, `main.temp` as temp\n", + " FROM both_cities_df\n", + "''')\n", + "\n", + "# Set the dt to index, city for \"header\" and the temp as values for each city each hour\n", + "pivoted_temp = temp_data.pivot(index='dt', columns='city_name', values='temp')\n", + "\n", + "# Rename the columns in the pivoted_temp\n", + "pivoted_temp.columns = [f\"{col}_main_temp\" for col in pivoted_temp.columns]\n", + "\n", + "# Display the final result\n", + "print(f\"Main temperature for {city_1} and {city_2}\")\n", + "display(pivoted_temp)\n", + "\n", + "# Extract and stores temperatur data for each city using pandas sql\n", + "stat_temp_per_city = sqldf('''\n", + " SELECT city, city_name, AVG(`main.temp`) AS avg_temp, MAX(`main.temp`) AS max_temp, MIN(`main.temp`) AS min_temp\n", + " FROM both_cities_df\n", + " GROUP BY city_name\n", + " ''')\n", + "\n", + "# Display the temperatur stats for both cities\n", + "display(stat_temp_per_city)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Sjekker nedbørs-kolonnene\n", + "### Sjekker kolonner\n", "\n", - "En ting vi har oppfattet når vi har blitt kjent med dataen er at regn og snø kolonne mangler om det ikke har regnet eller snødd. Vi ønsker senere å plotte oversikt over nedbør, og det er lettere å plotte 0 nedbør enn manglende verdier. \n", - "\n", - "Derfor sjekker vi her om regn/snø kolonne eksisterer i dataen, om den ikke gjør så lager vi en og fyller den med NaN." + "Vi har oppfattet at kolonnen som i størst grad kan mangler er 'rain.1h' og 'snow.1h', vi skal også bruke disse verdiene senere i plottingen av en graf. 
Derfor har vi lagd funksjonen `ensure_column` som tar imot dataframen og kolonnene vi ønsker å sjekke. Om kolonnene ikke eksisterer blir de laget, og fylt med 'NaN'." ] }, { @@ -317,20 +304,12 @@ "sys.path.append(os.path.abspath(\"../src\"))\n", "\n", "# Now we can import the fucntion from the module\n", - "from my_package.util import ensure_rain_column\n", - "from my_package.util import ensure_snow_column\n", - "\n", - "# Chekcs if there are a rain/snow column in city_1_df, if not the function creates one with NaN values\n", - "city_1_df = ensure_rain_column(city_1_df)\n", - "city_1_df = ensure_snow_column(city_1_df)\n", - "# Displays the dataframe readable\n", - "display(city_1_df)\n", - "\n", - "# Chekcs if there are a rain/snow column in city_2_df, if not the function creates one with NaN values\n", - "city_2_df = ensure_rain_column(city_2_df)\n", - "city_2_df = ensure_snow_column(city_2_df)\n", - "# Displays the dataframe readable\n", - "display(city_2_df)" + "from my_package.util import ensure_column\n", + "\n", + "columns_to_ensure = ['rain.1h', 'snow.1h']\n", + "\n", + "both_cities_df = ensure_column(both_cities_df, columns_to_ensure)\n", + "display(both_cities_df)" ] }, { @@ -340,9 +319,7 @@ "### Sjekk for manglende verdier\n", "Missigno sjekker og visualiserer manglende verdier, slik at det blir lettere å se hvilke kolonner feilen ligger i. \n", "\n", - "Vis the blir \"hull\" i en søyle, tyder the på manglende verdier.\n", - "\n", - "Det er ikke vanlig å legge til tittel, men i dette tilfellet siden vi har to serier vi sjekker data for, har vi brukt matplotlib for å lettere se hvilket sted som har manglende data." + "Hvis det blir \"hull\" i en søyle, tyder det på manglende verdier." 
] }, { @@ -352,22 +329,9 @@ "outputs": [], "source": [ "import missingno as msno\n", - "import matplotlib.pyplot as plt\n", - "\n", - "# Plot missing data matrix fro city_1\n", - "msno.matrix(city_1_df)\n", - "\n", - "# Add title using matplotlib\n", - "plt.title(f'Missing Data for {city_1}')\n", - "\n", - "# Plot missing data matrix fro city_2\n", - "msno.matrix(city_2_df)\n", "\n", - "# Add title using matplotlib\n", - "plt.title(f'Missing Data for {city_2}')\n", - "\n", - "# Show the plot\n", - "plt.show()" + "# Plot missing data matrix for both cities\n", + "msno.matrix(both_cities_df)" ] }, { @@ -376,13 +340,11 @@ "source": [ "### Endre manglende verdier\n", "\n", - "Nå skal vi ha regn/snø kolonner, og kanksje inneholder de NaN.\n", - "\n", - "Så sjekker vi om alle verdiene i en kolonne er 'NaN', isåfall så fjerner vi hele kolonnen. Grunne til at dette ikke inkluderer snø og regn, er fordi vi senere plotter disse verdiene, og da får vi ikke feil om verdien er 0, men vil få om hele kolonnen mangler.\n", + "Om vi har manglende verdier kan det tyde på to ting, 1. feil i målingen eller 2. det er ingenting å måle. Nr. 2 ser vi spesielt på kolonnene 'rain.1h' og 'snow.1h', om det ikke er snødd/regnet vil verdien bli lagret som 'NaN'. 'NaN' er ikke mulig å plotte, så vi ønsker å lagre verdien som 0 isteden. Vi importerer derfor funksjonen `fill_column_0` som tar inn dataframe og kolonner vi ønsker at 'NaN' verdien skal byttes ut med 0. \n", "\n", - "Vi sjekker også temperatur-verdiene for begge stedene. Dersom det skulle mangle en verdi bruker vi ffill (forward-fill), altså at den bruker verdien som var før.\n", + "Om det er kolonner hvor alle verdiene er 'NaN' fjerner vi hele kolonnen, dette fordi verdien 0 ikke vil gi oss mer informasjon uansett.\n", "\n", - "Det kan være mangler i andre verdier, men vi endrer ikke disse nå, da vi kun vil bruke data om nedbør og temperatur." 
+ "Andre kolonner som har en 'NaN'-verdi velger vi å bruke interpolate, som ser på verdien før og verdien etter og 'gjetter' på den manglende verdien. Vi har lagt til 'limit-direction' for at den skal kunne gjette på første og siste, eventuelt andre verdier som bare har verdi på en side." ] }, { @@ -398,43 +360,20 @@ "sys.path.append(os.path.abspath(\"../src\"))\n", "\n", "# Now we can import the fucntion from the module\n", - "from my_package.util import fill_rain_column\n", - "from my_package.util import fill_snow_column\n", + "from my_package.util import fill_column_0\n", "\n", - "# Fill NaN values with 0, if there are any, for both rain/snow column\n", - "city_1_df = fill_rain_column(city_1_df)\n", - "city_1_df = fill_snow_column(city_1_df)\n", + "columns_to_0 = ['rain.1h', 'snow.1h']\n", "\n", "# Fill NaN values with 0, if there are any, for both rain/snow column\n", - "city_2_df = fill_rain_column(city_2_df)\n", - "city_2_df = fill_snow_column(city_2_df)\n", - "\n", - "# Drops all the columns, if it has 'NaN' value.\n", - "city_1_df = city_1_df.dropna(axis='columns', how='all')\n", - "city_2_df = city_2_df.dropna(axis='columns', how='all')\n", - "\n", - "# If temperature is missing, take the same as the one before\n", - "city_1_df['main.temp'] = city_1_df['main.temp'].fillna('obj.ffill()')\n", - "\n", - "# Forward fill missing values in the lowest temperature \n", - "city_1_df['main.temp_min'] = city_1_df['main.temp_min'].fillna('obj.ffill()')\n", + "both_cities_df = fill_column_0(both_cities_df, columns_to_0)\n", "\n", - "# Forward fill missing values in the highest temperature \n", - "city_1_df['main.temp_max'] = city_1_df['main.temp_max'].fillna('obj.ffill()')\n", + "# Drops the whole column, if it only has 'NaN' value.\n", + "both_cities_df = both_cities_df.dropna(axis='columns', how='all')\n", "\n", - "\n", - "# If temperature is missing, take the same as the one before\n", - "city_2_df['main.temp'] = city_2_df['main.temp'].fillna('obj.ffill()')\n", - 
"\n", - "# Forward fill missing values in the lowest temperature \n", - "city_2_df['main.temp_min'] = city_2_df['main.temp_min'].fillna('obj.ffill()')\n", - "\n", - "# Forward fill missing values in the highest temperature \n", - "city_2_df['main.temp_max'] = city_2_df['main.temp_max'].fillna('obj.ffill()')\n", + "both_cities_df = both_cities_df.interpolate(method='linear', limit_direction='both')\n", "\n", "# Display both cities readable\n", - "display(city_1_df)\n", - "display(city_2_df)" + "display(both_cities_df)" ] }, { @@ -452,22 +391,9 @@ "outputs": [], "source": [ "import missingno as msno\n", - "import matplotlib.pyplot as plt\n", "\n", - "# Plot missing data matrix fro city_1\n", - "msno.matrix(city_1_df)\n", - "\n", - "# Add title using matplotlib\n", - "plt.title(f'Missing Data for {city_1}')\n", - "\n", - "# Plot missing data matrix fro city_2\n", - "msno.matrix(city_2_df)\n", - "\n", - "# Add title using matplotlib\n", - "plt.title(f'Missing Data for {city_2}')\n", - "\n", - "# Show the plot\n", - "plt.show()" + "# Plot missing data matrix for both cities\n", + "msno.matrix(both_cities_df)" ] }, { @@ -477,7 +403,7 @@ "### Visualisere data i en graf\n", "Ved hjelp av Matplotlib har vi visualiert ønsket data, og ved hjelp av subplot, en modul i matplotlib, kan vi plotte flere verdier i samme graf, og få \"to y-akse\" på samme x-akse. \n", "\n", - "Temperatur for begge stedene finner vi i den øverste grafen, hvor vi også har lagt inn gjennomsnittstemperaturen.\n", + "Temperatur for begge stedene finner vi i den øverste grafen, hvor vi også har lagt inn gjennomsnittstemperaturen. Her har vi bare skrevet nye SELECT setninger som henter ut gjennomsnitt for begge steder fra 'stat_temp_per_city'.\n", "\n", "I grafen under ser vi oversikt over nedbør for begge stedene sammenlignet 'side om side'. 
Vi skiller også mellom snø og regn, og dersom det skulle snø og regne i samme time, vil de bare 'stables' oppå hverandre.\n", "\n", @@ -500,13 +426,22 @@ "# Creates the folder if it does not exist\n", "os.makedirs(output_folder, exist_ok=True)\n", "\n", + "temp_city_1 = sqldf('''SELECT `main.temp` FROM both_cities_df WHERE city = \"city_1\"''')\n", + "temp_city_2 = sqldf('''SELECT `main.temp` FROM both_cities_df WHERE city = \"city_2\"''')\n", + "\n", + "# Because pandas sql returns the value as a dataframe, we need to get the actual value (first row, first column)\n", + "temp_mean_city_1 = sqldf('''SELECT avg_temp FROM stat_temp_per_city WHERE city = \"city_1\"''').iloc[0, 0]\n", + "temp_mean_city_2 = sqldf('''SELECT avg_temp FROM stat_temp_per_city WHERE city = \"city_2\"''').iloc[0, 0]\n", + "\n", "# Extract rain values for both cities\n", - "city_1_rain = city_1_df['rain.1h']\n", - "city_2_rain = city_2_df['rain.1h']\n", + "# Because pandas sql returns the value as a dataframe, we need to get the actual value (all rows, first column)\n", + "city_1_rain = sqldf('''SELECT `rain.1h` FROM both_cities_df WHERE city = \"city_1\"''').iloc[:,0]\n", + "city_2_rain = sqldf('''SELECT `rain.1h` FROM both_cities_df WHERE city = \"city_2\"''').iloc[:,0]\n", "\n", "# Extract snow values for both cities\n", - "city_1_snow = city_1_df['snow.1h']\n", - "city_2_snow = city_2_df['snow.1h']\n", + "# Because pandas sql returns the value as a dataframe, we need to get the actual value (all rows, first column)\n", + "city_1_snow = sqldf('''SELECT `snow.1h` FROM both_cities_df WHERE city = \"city_1\"''').iloc[:,0]\n", + "city_2_snow = sqldf('''SELECT `snow.1h` FROM both_cities_df WHERE city = \"city_2\"''').iloc[:,0]\n", "\n", "# x_axis set to the index, which mean the datetime\n", "x_axis = city_1_df.index\n", @@ -521,12 +456,12 @@ "ax1.set_title(f'Weather compare for {city_1} and {city_2} ({date}) ')\n", "\n", "# Plots the temperature for city_1, with mean temperature\n", - 
"ax1.plot(x_axis, temp_city_1, color='#2E8B57', label=f'Temperature {city_1}')\n", - "ax1.axhline(temp_mean_city_1, color='#2E8B57', linestyle='dashed', alpha=0.7, label=f'Mean Temperature {city_1}')\n", + "ax1.plot(x_axis, temp_city_1, color='#008080', label=f'Temperature {city_1}')\n", + "ax1.axhline(temp_mean_city_1, color='#008080', linestyle='dashed', alpha=0.7, label=f'Mean Temperature {city_1}')\n", "\n", "# Plots the temperature for city_2s, with mean temperature\n", - "ax1.plot(x_axis, temp_city_2, color='#FFD700', label=f'Temperature {city_2}')\n", - "ax1.axhline(temp_mean_city_2, color='#FFD700', linestyle='dashed', alpha=0.7, label=f'Mean Temperature {city_2}')\n", + "ax1.plot(x_axis, temp_city_2, color='#FFA500', label=f'Temperature {city_2}')\n", + "ax1.axhline(temp_mean_city_2, color='#FFA500', linestyle='dashed', alpha=0.7, label=f'Mean Temperature {city_2}')\n", "\n", "# Design the y-axis for Temperature\n", "ax1.set_ylabel(\"Temperature (°C)\")\n", @@ -542,12 +477,12 @@ "\n", "# Plot rain/snow bars for both cities, sidy-by-side for better comparrison\n", "# Place the snow and rain bars for city_1, with offset for better comparrisson with city_2\n", - "ax3.bar(x_axis_numeric - width/2, city_1_rain, width=width, alpha=0.7, color='#2E8B57', label=f'Rain {city_1}', edgecolor='#FFD700', bottom=city_1_snow)\n", - "ax3.bar(x_axis_numeric - width/2, city_1_snow, width=width, alpha=0.5, color='#2E8B57', label=f'Snow {city_1}', hatch='////', edgecolor='#FFD700')\n", + "ax3.bar(x_axis_numeric - width/2, city_1_rain, width=width, alpha=0.7, color='#008080', label=f'Rain {city_1}', bottom=city_1_snow, edgecolor='#008080')\n", + "ax3.bar(x_axis_numeric - width/2, city_1_snow, width=width, alpha=0.7, color='#B2D9D9', label=f'Snow {city_1}', hatch='///', edgecolor='#008080')\n", "\n", "# Place the snow and rain bars for city_2, with offset for better comparrisson with city_1\n", - "ax3.bar(x_axis_numeric + width/2, city_2_rain, width=width, alpha=0.7, 
color='#FFD700', label=f'Rain {city_2}', edgecolor=\"#2E8B57\", bottom=city_2_snow)\n", - "ax3.bar(x_axis_numeric + width/2, city_2_snow, width=width, alpha=0.5, color='#FFD700', label=f'Snow {city_2}', hatch='////', edgecolor=\"#2E8B57\")\n", + "ax3.bar(x_axis_numeric + width/2, city_2_rain, width=width, alpha=0.7, color='#FFA500', label=f'Rain {city_2}', bottom=city_2_snow, edgecolor='#FFA500')\n", + "ax3.bar(x_axis_numeric + width/2, city_2_snow, width=width, alpha=0.7, color='#FFE5B3', label=f'Snow {city_2}', hatch='///', edgecolor='#FFA500')\n", "\n", "\n", "# Design the y-axis for precipiation\n", diff --git a/notebooks/notebook_compare_one_week_data.ipynb b/notebooks/notebook_compare_one_week_data.ipynb index 186f61a..aa1c3ee 100644 --- a/notebooks/notebook_compare_one_week_data.ipynb +++ b/notebooks/notebook_compare_one_week_data.ipynb @@ -86,18 +86,16 @@ "sys.path.append(os.path.abspath(\"../src\"))\n", "\n", "# Now we can import the fucntion from the module\n", - "from my_package.fetch_data import fetch_data\n", + "from my_package.data import fetch_time_data\n", "\n", - "# Import function to replace nordic (æøå)\n", - "from my_package.util import replace_nordic\n", + "# Import function to for input_place, replace æøå\n", + "from my_package.util import input_place\n", "\n", "# User input the city, for the weather\n", - "city_1 = input(\"Enter a city in Norway: \")\n", - "\n", - "city_1 = replace_nordic(city_1)\n", + "city_1 = input_place()\n", "\n", "# Stores the values in the variables\n", - "data_city_1, folder = fetch_data(unix_start_date, unix_end_date, city_1)" + "data_city_1, folder = fetch_time_data(unix_start_date, unix_end_date, city_1)" ] }, { @@ -123,18 +121,16 @@ "sys.path.append(os.path.abspath(\"../src\"))\n", "\n", "# Now we can import the fucntion from the module\n", - "from my_package.fetch_data import fetch_data\n", + "from my_package.data import fetch_time_data\n", "\n", - "# Import function to replace nordic (æøå)\n", - "from 
my_package.util import replace_nordic\n", + "# Import function to for input_place, replace æøå\n", + "from my_package.util import input_place\n", "\n", "# User input the city, for the weather\n", - "city_2 = input(\"Enter a city in Norway: \")\n", - "\n", - "city_2 = replace_nordic(city_2)\n", + "city_2 = input_place()\n", "\n", "# Stores the values in the variables\n", - "data_city_2, folder = fetch_data(unix_start_date, unix_end_date, city_2)" + "data_city_2, folder = fetch_time_data(unix_start_date, unix_end_date, city_2)" ] }, { @@ -171,7 +167,7 @@ "# Gets the absolute path to the src folder\n", "sys.path.append(os.path.abspath(\"../src\"))\n", "\n", - "from my_package.write_data import write_data\n", + "from my_package.data import write_data\n", "\n", "# Overwrites the folder stored inside the function\n", "folder = \"../data/output_sammenligning_uke\"\n", @@ -191,12 +187,9 @@ "\n", "Henter opp data lagret i filen, lagd over, og skriver ut lesbart ved hjelp av pandas\n", "\n", - "Har har vi laget en funksjon som henter ut dataene for ønsket sted, og gjør endringer vi ønsker skal bli gjort for dataen for begge steder som:\n", - "- fjerner 'weather' kolonnen, som inneholder metadata\n", - "- setter tiden som index\n", - "- normaliserer, slik at det er enklere å lese all dataen\n", + "Vi importerer funksjonen `extract_city_df` som fjerner uønskede kolonner, og returnerer dataen mer lesbart.\n", " \n", - "Vi sjekker også at vi har data for stedene, altså at funskjonen funker, før den eventuelt viser dataen for stedene." + "Vi lagrer dataen for begge byene, men legger til kolonne 'city' for å lagre id og 'city_name' for å lagre stedsnavn. Før vi slår sammen begge dataene til en dataframe, for å lettere bruke Pandas SQL for å hente ut ønsket data for begge stedene." 
] }, { @@ -208,32 +201,35 @@ "source": [ "import pandas as pd\n", "import json\n", + "from pandasql import sqldf\n", "\n", "file_path = f'../data/output_sammenligning_uke/data_{filename}.json'\n", "\n", - "from my_package.util import extract_city_df\n", + "from my_package.data import extract_city_df\n", "\n", "# Load the whole JSON file\n", "with open(file_path, 'r') as f:\n", " all_city_data = json.load(f)\n", "\n", - "# Separate variables for each city\n", + "# Extract and label each city, with id and name\n", "city_1_df = extract_city_df(all_city_data.get('city_1'))\n", + "city_1_df['city'] = 'city_1'\n", + "city_1_df['city_name'] = city_1\n", + "\n", "city_2_df = extract_city_df(all_city_data.get('city_2'))\n", + "city_2_df['city'] = 'city_2'\n", + "city_2_df['city_name'] = city_2\n", + "\n", + "# Concat both city df into one, to be able to use pandas sql\n", + "both_cities_df = pd.concat([city_1_df, city_2_df])\n", + "\n", + "# Pandas SQL adds more values to dt, example \"yyyy-mm-dd hh:mm:ss:xxxxx\", so to delete the extra x´s we reset the index\n", + "both_cities_df = both_cities_df.reset_index()\n", + "# And add dt with the wanted format\n", + "both_cities_df['dt'] = both_cities_df['dt'].dt.strftime('%Y-%m-%d %H:%M:%S')\n", "\n", - "# Checks if the data is not empty, aka there are values\n", - "if city_1_df is not None:\n", - " # Prints the city name\n", - " print(f\"{city_1} data:\")\n", - " # Display the dataframe readable\n", - " display(city_1_df)\n", - "\n", - "# Checks if the data is not empty, aka there are values\n", - "if city_2_df is not None:\n", - " # Prints the city name\n", - " print(f\"{city_2} data:\")\n", - " # Display the dataframe readable\n", - " display(city_2_df)" + "# Display all the values from both cities, with city 1 first, then city 2\n", + "display(sqldf('''SELECT * FROM both_cities_df'''))" ] }, { @@ -242,9 +238,9 @@ "metadata": {}, "source": [ "### Viser temperaturen\n", - "Regner ut gjennomsnitts-temperatur ved hjelp av 
innebygde funksjoner. Finner også høyeste og laveste målte temperatur for begge steder.\n", + "Vi bruker pandas SQL for å hente ut temperaturen for begge stedene, og lagrer de i en tabell. Ved hjelp av 'pivot', en innebygd funksjon for å rotere tabeller, setter vi den opp for bedre lesbarhet, ved å sette begge byene ved siden av hverandre og 'dt' som index. Dette gjør det lettere å sammenligne temperaturen for begge stedene til samme tid.\n", "\n", - "Legger dataen inn i en dataframe for å vise de i en ryddigere og mer lettlest tabell." + "Ved hjelp av en pandas SQL setning kan vi hente og lagre gjennomsnitt, maksimalt og minste målte temperatur for begge stedene. Senere kan vi bare skrive en SELECT setning til denne variabelen, for å eksempelvis hente ut gjennomsnittsdata for sted 1." ] }, { @@ -254,38 +250,31 @@ "metadata": {}, "outputs": [], "source": [ - "# Stores the temperature values of both cities\n", - "temp_city_1 = city_1_df['main.temp']\n", - "temp_city_2 = city_2_df['main.temp']\n", - "\n", - "# Find the mean temperature in both citites\n", - "temp_mean_city_1 = temp_city_1.mean().round(2)\n", - "temp_mean_city_2 = temp_city_2.mean().round(2)\n", - "\n", - "# Find the highest temperature in both cities\n", - "max_temp_city_1 = city_1_df['main.temp'].max().round(2)\n", - "max_temp_city_2 = city_2_df['main.temp'].max().round(2)\n", - "\n", - "# Find the lowest tempeartues in both cities\n", - "min_temp_city_1 = city_1_df['main.temp'].min().round(2)\n", - "min_temp_city_2 = city_2_df['main.temp'].min().round(2)\n", - "\n", - "# Stores the values of both city in a list\n", - "city_names = [city_1, city_2]\n", - "mean_temp = [temp_mean_city_1, temp_mean_city_2]\n", - "max_temp = [max_temp_city_1, max_temp_city_2]\n", - "min_temp = [min_temp_city_1, min_temp_city_2]\n", - "\n", - "# Creates dataframe of the tempvalues, to display it more readable\n", - "df_temp_cities = pd.DataFrame({\n", - " \"City\": city_names,\n", - " \"Mean Temperature (°C)\": 
mean_temp,\n", - " \"Highest Temperature (°C)\": max_temp,\n", - " \"Lowest Temperature (°C)\": min_temp\n", - "})\n", - "\n", - "# Display the dataframe with the values of both cities\n", - "display(df_temp_cities)\n" + "# Stores the `main.temp` from both cities using pandas sql\n", + "temp_data = sqldf('''\n", + " SELECT dt, city_name, `main.temp` as temp\n", + " FROM both_cities_df\n", + "''')\n", + "\n", + "# Set the dt to index, city for \"header\" and the temp as values for each city each hour\n", + "pivoted_temp = temp_data.pivot(index='dt', columns='city_name', values='temp')\n", + "\n", + "# Rename the columns in the pivoted_temp\n", + "pivoted_temp.columns = [f\"{col}_main_temp\" for col in pivoted_temp.columns]\n", + "\n", + "# Display the final result\n", + "print(f\"Main temperature for {city_1} and {city_2}\")\n", + "display(pivoted_temp)\n", + "\n", + "# Extract and stores temperatur data for each city using pandas sql\n", + "stat_temp_per_city = sqldf('''\n", + " SELECT city, city_name, AVG(`main.temp`) AS avg_temp, MAX(`main.temp`) AS max_temp, MIN(`main.temp`) AS min_temp\n", + " FROM both_cities_df\n", + " GROUP BY city_name\n", + " ''')\n", + "\n", + "# Display the temperatur stats for both cities\n", + "display(stat_temp_per_city)" ] }, { @@ -293,11 +282,9 @@ "id": "d0248c3b", "metadata": {}, "source": [ - "### Sjekker nedbørs-kolonnene\n", + "### Sjekker kolonner\n", "\n", - "En ting vi har oppfattet når vi har blitt kjent med dataen er at regn og snø kolonne mangler om det ikke har regnet eller snødd. Vi ønsker senere å plotte oversikt over nedbør, og det er lettere å plotte 0 nedbør enn manglende verdier. \n", - "\n", - "Derfor sjekker vi her om regn/snø kolonne eksisterer i dataen, om den ikke gjør så lager vi en og fyller den med NaN." + "Vi har oppfattet at kolonnen som i størst grad kan mangler er 'rain.1h' og 'snow.1h', vi skal også bruke disse verdiene senere i plottingen av en graf. 
Derfor har vi lagd funksjonen `ensure_column` som tar imot dataframen og kolonnene vi ønsker å sjekke. Om kolonnene ikke eksisterer blir de laget, og fylt med 'NaN'." ] }, { @@ -314,20 +301,12 @@ "sys.path.append(os.path.abspath(\"../src\"))\n", "\n", "# Now we can import the fucntion from the module\n", - "from my_package.util import ensure_rain_column\n", - "from my_package.util import ensure_snow_column\n", - "\n", - "# Chekcs if there are a rain/snow column in city_1_df, if not the function creates one with NaN values\n", - "city_1_df = ensure_rain_column(city_1_df)\n", - "city_1_df = ensure_snow_column(city_1_df)\n", - "# Displays the dataframe readable\n", - "display(city_1_df)\n", - "\n", - "# Chekcs if there are a rain/snow column in city_2_df, if not the function creates one with NaN values\n", - "city_2_df = ensure_rain_column(city_2_df)\n", - "city_2_df = ensure_snow_column(city_2_df)\n", - "# Displays the dataframe readable\n", - "display(city_2_df)" + "from my_package.util import ensure_column\n", + "\n", + "columns_to_ensure = ['rain.1h', 'snow.1h']\n", + "\n", + "both_cities_df = ensure_column(both_cities_df, columns_to_ensure)\n", + "display(both_cities_df)" ] }, { @@ -338,9 +317,7 @@ "### Sjekk for manglende verdier\n", "Missigno sjekker og visualiserer manglende verdier, slik at det blir lettere å se hvilke kolonner feilen ligger i. \n", "\n", - "Vis the blir \"hull\" i en søyle, tyder the på manglende verdier.\n", - "\n", - "Det er ikke vanlig å legge til tittel, men i dette tilfellet siden vi har to serier vi sjekker data for, har vi brukt matplotlib for å lettere se hvilket sted som har manglende data." + "Hvis det blir \"hull\" i en søyle, tyder det på manglende verdier." 
] }, { @@ -351,22 +328,9 @@ "outputs": [], "source": [ "import missingno as msno\n", - "import matplotlib.pyplot as plt\n", - "\n", - "# Plot missing data matrix fro city_1\n", - "msno.matrix(city_1_df)\n", - "\n", - "# Add title using matplotlib\n", - "plt.title(f'Missing Data for {city_1}')\n", - "\n", - "# Plot missing data matrix fro city_2\n", - "msno.matrix(city_2_df)\n", - "\n", - "# Add title using matplotlib\n", - "plt.title(f'Missing Data for {city_2}')\n", "\n", - "# Show the plot\n", - "plt.show()" + "# Plot missing data matrix for both cities\n", + "msno.matrix(both_cities_df)" ] }, { @@ -376,13 +340,11 @@ "source": [ "### Endre manglende verdier\n", "\n", - "Nå skal vi ha regn/snø kolonner, og kanksje inneholder de NaN.\n", + "Om vi har manglende verdier kan det tyde på to ting, 1. feil i målingen eller 2. det er ingenting å måle. Nr. 2 ser vi spesielt på kolonnene 'rain.1h' og 'snow.1h', om det ikke er snødd/regnet vil verdien bli lagret som 'NaN'. 'NaN' er ikke mulig å plotte, så vi ønsker å lagre verdien som 0 isteden. Vi importerer derfor funksjonen `fill_column_0` som tar inn dataframe og kolonner vi ønsker at 'NaN' verdien skal byttes ut med 0. \n", "\n", - "Så sjekker vi om alle verdiene i en kolonne er 'NaN', isåfall så fjerner vi hele kolonnen. Grunne til at dette ikke inkluderer snø og regn, er fordi vi senere plotter disse verdiene, og da får vi ikke feil om verdien er 0, men vil få om hele kolonnen mangler.\n", + "Om det er kolonner hvor alle verdiene er 'NaN' fjerner vi hele kolonnen, dette fordi verdien 0 ikke vil gi oss mer informasjon uansett.\n", "\n", - "Vi sjekker også temperatur-verdiene for begge stedene. Dersom det skulle mangle en verdi bruker vi ffill (forward-fill), altså at den bruker verdien som var før.\n", - "\n", - "Det kan være mangler i andre verdier, men vi endrer ikke disse nå, da vi kun vil bruke data om nedbør og temperatur."
+ "Andre kolonner som har en 'NaN'-verdi velger vi å bruke interpolate, som ser på verdien før og verdien etter og 'gjetter' på den manglende verdien. Vi har lagt til 'limit-direction' for at den skal kunne gjette på første og siste, eventuelt andre verdier som bare har verdi på en side." ] }, { @@ -399,43 +361,20 @@ "sys.path.append(os.path.abspath(\"../src\"))\n", "\n", "# Now we can import the fucntion from the module\n", - "from my_package.util import fill_rain_column\n", - "from my_package.util import fill_snow_column\n", + "from my_package.util import fill_column_0\n", "\n", - "# Fill NaN values with 0, if there are any, for both rain/snow column\n", - "city_1_df = fill_rain_column(city_1_df)\n", - "city_1_df = fill_snow_column(city_1_df)\n", + "columns_to_0 = ['rain.1h', 'snow.1h']\n", "\n", "# Fill NaN values with 0, if there are any, for both rain/snow column\n", - "city_2_df = fill_rain_column(city_2_df)\n", - "city_2_df = fill_snow_column(city_2_df)\n", + "both_cities_df = fill_column_0(both_cities_df, columns_to_0)\n", "\n", - "# Drops all the columns, if it has 'NaN' value.\n", - "city_1_df = city_1_df.dropna(axis='columns', how='all')\n", - "city_2_df = city_2_df.dropna(axis='columns', how='all')\n", + "# Drops the whole column, if it only has 'NaN' value.\n", + "both_cities_df = both_cities_df.dropna(axis='columns', how='all')\n", "\n", - "# If temperature is missing, take the same as the one before\n", - "city_1_df['main.temp'] = city_1_df['main.temp'].fillna('obj.ffill()')\n", - "\n", - "# Forward fill missing values in the lowest temperature \n", - "city_1_df['main.temp_min'] = city_1_df['main.temp_min'].fillna('obj.ffill()')\n", - "\n", - "# Forward fill missing values in the highest temperature \n", - "city_1_df['main.temp_max'] = city_1_df['main.temp_max'].fillna('obj.ffill()')\n", - "\n", - "\n", - "# If temperature is missing, take the same as the one before\n", - "city_2_df['main.temp'] = city_2_df['main.temp'].fillna('obj.ffill()')\n", - 
"\n", - "# Forward fill missing values in the lowest temperature \n", - "city_2_df['main.temp_min'] = city_2_df['main.temp_min'].fillna('obj.ffill()')\n", - "\n", - "# Forward fill missing values in the highest temperature \n", - "city_2_df['main.temp_max'] = city_2_df['main.temp_max'].fillna('obj.ffill()')\n", + "both_cities_df = both_cities_df.interpolate(method='linear', limit_direction='both')\n", "\n", "# Display both cities readable\n", - "display(city_1_df)\n", - "display(city_2_df)" + "display(both_cities_df)" ] }, { @@ -455,22 +394,9 @@ "outputs": [], "source": [ "import missingno as msno\n", - "import matplotlib.pyplot as plt\n", - "\n", - "# Plot missing data matrix fro city_1\n", - "msno.matrix(city_1_df)\n", - "\n", - "# Add title using matplotlib\n", - "plt.title(f'Missing Data for {city_1}')\n", "\n", - "# Plot missing data matrix fro city_2\n", - "msno.matrix(city_2_df)\n", - "\n", - "# Add title using matplotlib\n", - "plt.title(f'Missing Data for {city_2}')\n", - "\n", - "# Show the plot\n", - "plt.show()" + "# Plot missing data matrix for both cities\n", + "msno.matrix(both_cities_df)" ] }, { @@ -481,7 +407,7 @@ "### Visualisere data i en graf\n", "Ved hjelp av Matplotlib har vi visualiert ønsket data, og ved hjelp av subplot, en modul i matplotlib, kan vi plotte flere verdier i samme graf, og få \"to y-akse\" på samme x-akse. \n", "\n", - "Temperatur for begge stedene finner vi i den øverste grafen, hvor vi også har lagt inn gjennomsnittstemperaturen.\n", + "Temperatur for begge stedene finner vi i den øverste grafen, hvor vi også har lagt inn gjennomsnittstemperaturen. Her har vi bare skrevet nye SELECT setninger som henter ut gjennomsnitt for begge steder fra 'stat_temp_per_city'.\n", "\n", "I grafen under ser vi oversikt over nedbør for begge stedene sammenlignet 'side om side'. 
Vi skiller også mellom snø og regn, og dersom det skulle snø og regne i samme time, vil de bare 'stables' oppå hverandre.\n", "\n", @@ -505,13 +431,22 @@ "# Creates the folder if it does not exist\n", "os.makedirs(output_folder, exist_ok=True)\n", "\n", + "temp_city_1 = sqldf('''SELECT `main.temp` FROM both_cities_df WHERE city = \"city_1\"''')\n", + "temp_city_2 = sqldf('''SELECT `main.temp` FROM both_cities_df WHERE city = \"city_2\"''')\n", + "\n", + "# Because pandas sql returnes the value as a dataframe, we need to get the actual value (first row, first column)\n", + "temp_mean_city_1 = sqldf('''SELECT avg_temp FROM stat_temp_per_city WHERE city = \"city_1\"''').iloc[0, 0]\n", + "temp_mean_city_2 = sqldf('''SELECT avg_temp FROM stat_temp_per_city WHERE city = \"city_2\"''').iloc[0, 0]\n", + "\n", "# Extract rain values for both cities\n", - "city_1_rain = city_1_df['rain.1h']\n", - "city_2_rain = city_2_df['rain.1h']\n", + "# Because pandas sql returnes the value as a dataframe, we need to get the actual value (all rows, first column)\n", + "city_1_rain = sqldf('''SELECT `rain.1h` FROM both_cities_df WHERE city = \"city_1\"''').iloc[:,0]\n", + "city_2_rain = sqldf('''SELECT `rain.1h` FROM both_cities_df WHERE city = \"city_2\"''').iloc[:,0]\n", "\n", "# Extract snow values for both cities\n", - "city_1_snow = city_1_df['snow.1h']\n", - "city_2_snow = city_2_df['snow.1h']\n", + "# Because pandas sql returnes the value as a dataframe, we need to get the actual value (all rows, first column)\n", + "city_1_snow = sqldf('''SELECT `snow.1h` FROM both_cities_df WHERE city = \"city_1\"''').iloc[:,0]\n", + "city_2_snow = sqldf('''SELECT `snow.1h` FROM both_cities_df WHERE city = \"city_2\"''').iloc[:,0]\n", "\n", "# x_axis set to the index, which mean the datetime\n", "x_axis = city_1_df.index\n", @@ -523,15 +458,15 @@ "fig, (ax1, ax3) = plt.subplots(2, 1,figsize=(15, 8), sharex=True)\n", "\n", "# Set the title for the whole plot, above the upper plot\n", - 
"ax1.set_title(f'Weather compare for {city_1} and {city_2} ({start_date} - {end_date}) ')\n", + "ax1.set_title(f'Weather compare for {city_1} and {city_2} ({start_date} to {end_date}) ')\n", "\n", "# Plots the temperature for city_1, with mean temperature\n", - "ax1.plot(x_axis, temp_city_1, color='#2E8B57', label=f'Temperature {city_1}')\n", - "ax1.axhline(temp_mean_city_1, color='#2E8B57', linestyle='dashed', alpha=0.7, label=f'Mean Temperature {city_1}')\n", + "ax1.plot(x_axis, temp_city_1, color='#008080', label=f'Temperature {city_1}')\n", + "ax1.axhline(temp_mean_city_1, color='#008080', linestyle='dashed', alpha=0.7, label=f'Mean Temperature {city_1}')\n", "\n", "# Plots the temperature for city_2s, with mean temperature\n", - "ax1.plot(x_axis, temp_city_2, color='#FFD700', label=f'Temperature {city_2}')\n", - "ax1.axhline(temp_mean_city_2, color='#FFD700', linestyle='dashed', alpha=0.7, label=f'Mean Temperature {city_2}')\n", + "ax1.plot(x_axis, temp_city_2, color='#FFA500', label=f'Temperature {city_2}')\n", + "ax1.axhline(temp_mean_city_2, color='#FFA500', linestyle='dashed', alpha=0.7, label=f'Mean Temperature {city_2}')\n", "\n", "# Design the y-axis for Temperature\n", "ax1.set_ylabel(\"Temperature (°C)\")\n", @@ -542,21 +477,17 @@ "# Add marker at 0 temperature\n", "ax1.axhline(y=0, color='black', linewidth=1.5)\n", "\n", - "width = 0.02\n", + "# Adjust the width of bars for better side-by-side comparison, for the precipitation\n", + "width = 0.01 \n", "\n", "# Plot rain/snow bars for both cities, sidy-by-side for better comparrison\n", - "# # Place the snow and rain bars for city_1, with offset for better comparrisson with city_2\n", - "\n", - "# ax3.bar(x_axis_numeric - width/2, city_1_rain, width=width, alpha=0.7, color='#2E8B57', label=f'Rain {city_1}', edgecolor='#FFD700', bottom=city_1_snow)\n", - "# ax3.bar(x_axis_numeric - width/2, city_1_snow, width=width, alpha=0.5, color='#2E8B57', label=f'Snow {city_1}', hatch='////', 
edgecolor='#FFD700')\n", - "\n", - "# # # Place the snow and rain bars for city_2, with offset for better comparrisson with city_1\n", - "# ax3.bar(x_axis_numeric + width/2, city_2_rain, width=width, alpha=0.7, color='#FFD700', label=f'Rain {city_2}', edgecolor=\"#2E8B57\", bottom=city_2_snow)\n", - "# ax3.bar(x_axis_numeric + width/2, city_2_snow, width=width, alpha=0.5, color='#FFD700', label=f'Snow {city_2}', hatch='////', edgecolor=\"#2E8B57\")\n", + "# Place the snow and rain bars for city_1, with offset for better comparrisson with city_2\n", + "ax3.bar(x_axis_numeric - width/2, city_1_rain, width=width, alpha=0.7, color='#008080', label=f'Rain {city_1}', bottom=city_1_snow, edgecolor='#008080')\n", + "ax3.bar(x_axis_numeric - width/2, city_1_snow, width=width, alpha=0.7, color='#B2D9D9', label=f'Snow {city_1}', hatch='///', edgecolor='#008080')\n", "\n", - "# Concat the snow and rain\n", - "ax3.fill_between(x_axis, city_1_rain + city_1_snow, color='green', alpha=0.3, label=f'{city_1} Total')\n", - "ax3.fill_between(x_axis, city_2_rain + city_2_snow, color='gold', alpha=0.3, label=f'{city_2} Total')\n", + "# Place the snow and rain bars for city_2, with offset for better comparrisson with city_1\n", + "ax3.bar(x_axis_numeric + width/2, city_2_rain, width=width, alpha=0.7, color='#FFA500', label=f'Rain {city_2}', bottom=city_2_snow, edgecolor='#FFA500')\n", + "ax3.bar(x_axis_numeric + width/2, city_2_snow, width=width, alpha=0.7, color='#FFE5B3', label=f'Snow {city_2}', hatch='///', edgecolor='#FFA500')\n", "\n", "\n", "# Design the y-axis for precipiation\n", @@ -566,8 +497,8 @@ "ax3.grid(axis='x')\n", "\n", "# Customize the x-axis to show ticks for each hour\n", - "ax1.xaxis.set_major_locator(mdates.HourLocator(interval=12)) # Tick marks for every hour\n", - "ax1.xaxis.set_major_formatter(mdates.DateFormatter('%d %b %H')) # Format as \"Day Month Hour:Minute\"\n", + "ax3.xaxis.set_major_locator(mdates.HourLocator(interval=12)) # Tick marks for every hour\n", 
+ "ax3.xaxis.set_major_formatter(mdates.DateFormatter('%d %b %H')) # Format as \"Day Month Hour:Minute\"\n", "\n", "# Add label description\n", "ax1.legend(loc='upper left')\n", diff --git a/notebooks/notebook_compare_statistic_data.ipynb b/notebooks/notebook_compare_statistic_data.ipynb index 5985dfc..1ceb959 100644 --- a/notebooks/notebook_compare_statistic_data.ipynb +++ b/notebooks/notebook_compare_statistic_data.ipynb @@ -38,17 +38,15 @@ "sys.path.append(os.path.abspath(\"../src\"))\n", "\n", "# Now we can import the fucntion from the module\n", - "from my_package.year_data import fetch_data\n", + "from my_package.data import fetch_stat_data\n", "\n", - "# Import function to replace nordic (æøå)\n", - "from my_package.util import replace_nordic\n", + "# Import function to for input_place, replace æøå\n", + "from my_package.util import input_place\n", "\n", "# User input the city, for the weather\n", - "city_1 = input(\"Enter a city in Norway: \")\n", + "city_1 = input_place()\n", "\n", - "city_1 = replace_nordic(city_1)\n", - "\n", - "data_city_1, folder = fetch_data(city_1)" + "data_city_1, folder = fetch_stat_data(city_1)" ] }, { @@ -73,17 +71,15 @@ "sys.path.append(os.path.abspath(\"../src\"))\n", "\n", "# Now we can import the fucntion from the module\n", - "from my_package.year_data import fetch_data\n", + "from my_package.data import fetch_stat_data\n", "\n", - "# Import function to replace nordic (æøå)\n", - "from my_package.util import replace_nordic\n", + "# Import function to for input_place, replace æøå\n", + "from my_package.util import input_place\n", "\n", "# User input the city, for the weather\n", - "city_2 = input(\"Enter a city in Norway: \")\n", - "\n", - "city_2 = replace_nordic(city_2)\n", + "city_2 = input_place()\n", "\n", - "data_city_2, folder = fetch_data(city_2)" + "data_city_2, folder = fetch_stat_data(city_2)" ] }, { @@ -118,7 +114,7 @@ "# Gets the absolute path to the src folder\n", 
"sys.path.append(os.path.abspath(\"../src\"))\n", "\n", - "from my_package.write_data import write_data\n", + "from my_package.data import write_data\n", "\n", "# Overwrites the folder stored inside the function\n", "folder = \"../data/output_sammenligning_statistikk\"\n", @@ -149,47 +145,30 @@ "source": [ "import pandas as pd\n", "import json\n", + "from pandasql import sqldf\n", "\n", "file_path = f'../data/output_sammenligning_statistikk/data_{filename}.json'\n", "\n", + "from my_package.data import extract_city_data_stat\n", "\n", "# Load the whole JSON file\n", "with open(file_path, 'r') as f:\n", " all_city_data = json.load(f)\n", "\n", - "def extract_city_data(data):\n", - " # Checks if the 'result' column is in the data\n", - " if 'result' in data:\n", - " # Normalize the json and store it as a dataframe for better readability\n", - " df = pd.json_normalize(data['result'])\n", + "# Separate variables for each city\n", + "city_1_df = extract_city_data_stat(all_city_data.get('city_1'))\n", + "city_1_df['city'] = 'city_1'\n", + "city_1_df['city_name'] = city_1\n", + "\n", + "city_2_df = extract_city_data_stat(all_city_data.get('city_2'))\n", + "city_2_df['city'] = 'city_2'\n", + "city_2_df['city_name'] = city_2\n", "\n", - " # Display the dataframe\n", - " return df\n", - " else:\n", - " print(\"'result' not in data\")\n", - " return None\n", + "# Concat both city df into one, to be able to use pandas sql\n", + "both_cities_df = pd.concat([city_1_df, city_2_df])\n", "\n", - "# Separate variables for each city\n", - "city_1_df = extract_city_data(all_city_data.get('city_1'))\n", - "city_2_df = extract_city_data(all_city_data.get('city_2'))\n", - "\n", - "# Checks if the data is not empty, aka there are values\n", - "if city_1_df is not None:\n", - " # Prints the city name\n", - " print(f\"{city_1} data:\")\n", - " # Display the dataframe readable\n", - " display(city_1_df)\n", - "else:\n", - " print('\"city_1_df\" is empty')\n", - "\n", - "# Checks if the data 
is not empty, aka there are values\n", - "if city_2_df is not None:\n", - " # Prints the city name\n", - " print(f\"{city_2} data:\")\n", - " # Display the dataframe readable\n", - " display(city_2_df)\n", - "else:\n", - " print('\"city_2_df\" is empty')\n" + "# Display all the values from both cities, with city 1 first, then city 2\n", + "display(sqldf('''SELECT * FROM both_cities_df'''))\n" ] }, { @@ -210,36 +189,29 @@ "metadata": {}, "outputs": [], "source": [ - "def clean_df(df):\n", - " # Drop all columns that end with '...' using the filter function\n", - " df = df.drop(columns=df.filter(like='.p25').columns)\n", - " df = df.drop(columns=df.filter(like='.p75').columns)\n", - " df = df.drop(columns=df.filter(like='.st_dev').columns)\n", - " df = df.drop(columns=df.filter(like='.num').columns)\n", - "\n", - " return df\n", + "from my_package.util import clean_df\n", "\n", "# Cleans data for unessecarily columns\n", - "city_1_df = clean_df(city_1_df)\n", - "city_2_df = clean_df(city_2_df)\n", + "both_cities_df = clean_df(both_cities_df)\n", "\n", - "display(city_1_df)\n", - "display(city_2_df)" + "display(both_cities_df)" ] }, { "cell_type": "markdown", - "id": "97847344", + "id": "4d493df2", "metadata": {}, "source": [ - "### Plotter temperatur\n", - "Denne koden plotter og sammenlginer data basert på gjennomsnitts temperatur gjennom året. For å sikre lagring av de ulike kjøringene, vil grafen bli lagret i mappen \"../data/output_fig_sammenligning/mean_temp_plot_{city_1}_(city_2).json\"\n" + "### Viser temperaturen\n", + "Vi bruker pandas SQL for å hente ut temperaturen for begge stedene, og lagrer de i en tabell. Ved hjelp av 'pivot' , en innebgyd funksjon for å rotere tabeller, setter vi den opp for bedre lesbarhet, ved å sette begge byene ved siden av hverandre og en sammenslåing av kolonnene måned og dag ('month_day') som index. 
Dette gjør det lettere å sammenligne temperaturen for begge stedene til samme tid.\n", + "\n", + "Ved hjelp av en pandas SQL setning kan vi hente og lagre gjennomsnitt, maksimalt og minste målte temperatur for begge stedene. Senere kan vi bare skrive en SELECT setning til denne variabelen, for å eksepelhvis hente ut gjennomsnitts data for sted 1." ] }, { "cell_type": "code", "execution_count": null, - "id": "851e62c8", + "id": "33d5a1c3", "metadata": {}, "outputs": [], "source": [ @@ -257,58 +229,36 @@ "output_folder = \"../data/output_fig\"\n", "os.makedirs(output_folder, exist_ok=True) # Create the folder if it doesn't exist\n", "\n", - "# Converts to and make a new column with celsius temp, and not kelvin\n", - "city_1_df['temp.mean_celsius'] = kelvin_to_celsius(city_1_df['temp.mean'])\n", - "city_2_df['temp.mean_celsius'] = kelvin_to_celsius(city_2_df['temp.mean'])\n", - "\n", - "temp_city_1 = city_1_df['temp.mean_celsius']\n", - "temp_city_2 = city_2_df['temp.mean_celsius']\n", - "\n", - "\n", - "temp_mean_city_1 = temp_city_1.mean().round(2)\n", - "temp_mean_city_2 = temp_city_2.mean().round(2)\n", - "\n", - "# Convert from day and month, to datetime\n", - "# df['date'] = pd.to_datetime(df[['month', 'day']].assign(year=2024))\n", + "both_cities_df['temp.mean_celsius'] = kelvin_to_celsius(both_cities_df['temp.mean'])\n", + "both_cities_df['temp.max_celsius'] = kelvin_to_celsius(both_cities_df['temp.record_max'])\n", + "both_cities_df['temp.min_celsius'] = kelvin_to_celsius(both_cities_df['temp.record_min'])\n", "\n", "# Create a new column that concatenates month and day (e.g., \"03-01\" for March 1)\n", - "city_1_df['month_day'] = city_1_df[['month', 'day']].apply(lambda x: f\"{x['month']:02d}-{x['day']:02d}\",axis=1)\n", - "city_2_df['month_day'] = city_2_df[['month', 'day']].apply(lambda x: f\"{x['month']:02d}-{x['day']:02d}\",axis=1)\n", - "\n", - "# Plot the graph of the mean temperature\n", - "plt.figure(figsize=(12, 6))\n", - 
"plt.plot(city_1_df['month_day'], temp_city_1, color='#2E8B57', label=f'temp {city_1}')\n", - "plt.plot(city_2_df['month_day'], temp_city_2, color='#FFD700', label=f'temp {city_2}')\n", - "\n", - "plt.axhline(temp_mean_city_1, color='#2E8B57', linestyle='dashed', alpha=0.7, label=f'Mean Temperature {city_1}')\n", - "plt.axhline(temp_mean_city_2, color='#FFD700', linestyle='dashed', alpha=0.7, label=f'Mean Temperature {city_2}')\n", - "\n", - "# Label for easier reading and understanding of the plot\n", - "# plt.title(f\"Mean temp - statistic historical {city_name}\")\n", - "plt.xlabel(\"Date\")\n", - "plt.ylabel(\"Temperature (°C)\")\n", - "\n", - "# Add marker at 0 temperature\n", - "plt.axhline(y=0, color='black', linewidth=1.5)\n", - "\n", - "# Customize the x-axis to show ticks and labels only at the start of each month\n", - "plt.gca().xaxis.set_major_locator(mdates.MonthLocator()) \n", - "# Format ticks to show abbreviated month names (e.g., Jan, Feb)\n", - "plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b')) \n", - "\n", - "plt.xticks(rotation=45)\n", - "plt.yticks(range(-20, 30, 2))\n", - "plt.tight_layout()\n", - "plt.grid()\n", - "\n", - "plt.legend()\n", - "\n", - "# Save the plot to the data/output_fig folder\n", - "# plot_path = os.path.join(output_folder, f\"mean_temp_plot_{city_name}.png\")\n", - "# plt.savefig(plot_path) # Save the plot as a PNG file\n", - "\n", - "# Show the plot\n", - "plt.show()\n" + "both_cities_df['month_day'] = both_cities_df[['month', 'day']].apply(lambda x: f\"{x['month']:02d}-{x['day']:02d}\",axis=1)\n", + "\n", + "temp_data = sqldf('''\n", + " SELECT month_day, city_name, `temp.mean_celsius` as temp\n", + " FROM both_cities_df\n", + "''')\n", + "\n", + "# Set the dt to index, city for \"header\" and the temp as values for each city each hour\n", + "pivoted_temp = temp_data.pivot(index='month_day', columns='city_name', values='temp')\n", + "\n", + "# Rename the columns in the pivoted_temp\n", + 
"pivoted_temp.columns = [f\"{col}_main_temp\" for col in pivoted_temp.columns]\n", + "# Display the final result\n", + "print(f\"Main temperature for {city_1} and {city_2}\")\n", + "display(pivoted_temp)\n", + "\n", + "# Extract and stores temperatur data for each city using pandas sql\n", + "stat_temp_per_city = sqldf('''\n", + " SELECT city, city_name, AVG(`temp.mean_celsius`) AS avg_temp, MAX(`temp.max_celsius`) AS max_temp, MIN(`temp.min_celsius`) AS min_temp\n", + " FROM both_cities_df\n", + " GROUP BY city_name\n", + " ''')\n", + "\n", + "# Display the temperatur stats for both cities\n", + "display(stat_temp_per_city)" ] }, { @@ -323,48 +273,35 @@ { "cell_type": "code", "execution_count": null, - "id": "5baab98a", + "id": "83aed603", "metadata": {}, "outputs": [], "source": [ - "import matplotlib.pyplot as plt\n", - "import matplotlib.dates as mdates\n", - "import os\n", - "import sys\n", - "\n", - "# Gets the absolute path to the src folder\n", - "sys.path.append(os.path.abspath(\"../src\"))\n", - "\n", - "# Import the kelvin to celsius function\n", - "from my_package.util import kelvin_to_celsius\n", - "\n", - "# Defines the output folder for the figure, and makes it if is does not exsist\n", "output_folder = \"../data/output_fig\"\n", - "os.makedirs(output_folder, exist_ok=True) \n", + "os.makedirs(output_folder, exist_ok=True) # Create the folder if it doesn't exist\n", "\n", - "# Converts to and make a new column with celsius temp, and not kelvin\n", - "city_1_df['temp.mean_celsius'] = kelvin_to_celsius(city_1_df['temp.mean'])\n", - "temp_city_1 = city_1_df['temp.mean_celsius']\n", - "precipitation_city_1 = city_1_df['precipitation.mean']\n", - "wind_city_1 = city_1_df['wind.mean']\n", + "temp_city_1 = sqldf('''SELECT `temp.mean_celsius` FROM both_cities_df WHERE city = \"city_1\"''')\n", + "temp_city_2 = sqldf('''SELECT `temp.mean_celsius` FROM both_cities_df WHERE city = \"city_2\"''')\n", "\n", - "# Converts to and make a new column with celsius 
temp, and not kelvin\n", - "city_2_df['temp.mean_celsius'] = kelvin_to_celsius(city_2_df['temp.mean'])\n", - "temp_city_2 = city_2_df['temp.mean_celsius']\n", - "precipitation_city_2 = city_2_df['precipitation.mean']\n", - "wind_city_2 = city_2_df['wind.mean']\n", + "# Because pandas sql returnes the value as a dataframe, we need to get the actual value (first row, first column)\n", + "temp_mean_city_1 = sqldf('''SELECT avg_temp FROM stat_temp_per_city WHERE city = \"city_1\"''').iloc[0, 0]\n", + "temp_mean_city_2 = sqldf('''SELECT avg_temp FROM stat_temp_per_city WHERE city = \"city_2\"''').iloc[0, 0]\n", "\n", - "# Create a new column that concatenates month and day (e.g., \"03-01\" for March 1)\n", - "city_1_df['month_day'] = city_1_df[['month', 'day']].apply(lambda x: f\"{x['month']:02d}-{x['day']:02d}\",axis=1)\n", - "city_2_df['month_day'] = city_2_df[['month', 'day']].apply(lambda x: f\"{x['month']:02d}-{x['day']:02d}\",axis=1)\n", + "# Extract precipitation values for both cities\n", + "# Because pandas sql returnes the value as a dataframe, we need to get the actual value (all rows, first column)\n", + "precipitation_city_1 = sqldf('''SELECT `precipitation.mean` FROM both_cities_df WHERE city = \"city_1\"''').iloc[:,0]\n", + "precipitation_city_2 = sqldf('''SELECT `precipitation.mean` FROM both_cities_df WHERE city = \"city_2\"''').iloc[:,0]\n", + "\n", + "wind_city_1 = sqldf('''SELECT `wind.mean` FROM both_cities_df WHERE city = \"city_1\"''').iloc[:,0]\n", + "wind_city_2 = sqldf('''SELECT `wind.mean` FROM both_cities_df WHERE city = \"city_2\"''').iloc[:,0]\n", "\n", "x_axis = city_1_df['month_day']\n", "\n", "fig, (ax1, ax3) = plt.subplots(2, 1, figsize = (15, 8), sharex=True)\n", "\n", "# Plot temperature on the primary y-axis\n", - "ax1.plot(x_axis, temp_city_1, color='#2E8B57', label=f'Temperature {city_1}')\n", - "ax1.plot(x_axis, temp_city_2, color='#FFD700', label=f'Temperature {city_2}')\n", + "ax1.plot(x_axis, temp_city_1, color='#008080', 
label=f'Temperature {city_1}')\n", + "ax1.plot(x_axis, temp_city_2, color='#FFA500', label=f'Temperature {city_2}')\n", "# ax1.set_xlabel('Datetime')\n", "ax1.set_ylabel('Temperature (°C)', color='tab:red')\n", "ax1.tick_params(axis='y', labelcolor='tab:red')\n", @@ -372,14 +309,15 @@ "# Add marker at 0 temperature\n", "ax1.axhline(y=0, color='black', linewidth=1.5)\n", "\n", + "ax1.axhline(y=temp_mean_city_1, color='#008080', linestyle='dashed', alpha=0.7, label=f'Mean Temperature {city_1}')\n", + "ax1.axhline(y=temp_mean_city_2, color='#FFA500', linestyle='dashed', alpha=0.7, label=f'Mean Temperature {city_2}')\n", + "\n", "# Plot precipitation as bars on the secondary y-axis\n", "ax2 = ax1.twinx()\n", "\n", - "# ax2.bar(x_axis, precipitation_city_1, color='#2E8B57', alpha=0.5, width=1, label=f'Precipitation {city_1}')\n", - "# ax2.bar(x_axis, precipitation_city_2, color='#FFD700', alpha=0.5, width=1, label=f'Precipitation {city_2}')\n", - "\n", - "ax2.fill_between(x_axis, precipitation_city_1, color='green', alpha=0.3, label=f'{city_1} Total')\n", - "ax2.fill_between(x_axis, precipitation_city_2, color='gold', alpha=0.3, label=f'{city_2} Total')\n", + "# Fill between the precipitation, for an easier and more readable vizualisation\n", + "ax2.fill_between(x_axis, precipitation_city_1, color='#80C0C0', alpha=0.5, label=f'{city_1} Total', edgecolor = '#008080')\n", + "ax2.fill_between(x_axis, precipitation_city_2, color='#FFD280', alpha=0.5, label=f'{city_2} Total', edgecolor = '#FFA500')\n", "\n", "ax2.set_ylabel(\"Precipitation (mm)\", color='tab:blue')\n", "ax2.tick_params(axis='y', labelcolor='tab:blue')\n", @@ -388,8 +326,8 @@ "ax1.legend(loc='upper left')\n", "ax2.legend(loc='upper right')\n", "\n", - "ax3.plot(x_axis, wind_city_1, color='#2E8B57', label=f'Wind {city_1}')\n", - "ax3.plot(x_axis, wind_city_2, color='#FFD700', label=f'Wind {city_2}')\n", + "ax3.plot(x_axis, wind_city_1, color='#008080', label=f'Wind {city_1}')\n", + "ax3.plot(x_axis, 
wind_city_2, color='#FFA500', label=f'Wind {city_2}')\n", "# ax3.plot(x_axis, wind_speed, color='tab:purple', linestyle='dashed', label='Wind_speed')\n", "ax3.set_ylabel('Wind (m/s)')\n", "ax3.set_xlabel('Datetime')\n", @@ -403,12 +341,15 @@ "# Format ticks to show abbreviated month names (e.g., Jan, Feb)\n", "plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b')) \n", "\n", + "plt.title(f\"Statistic weather data for ({city_1} and {city_2})\")\n", "plt.tight_layout()\n", "\n", - "# Show the plot\n", - "plt.show()\n", + "# Save the plot to the data/output_fig folder\n", + "plot_path = os.path.join(output_folder, f\"mean_temp_plot_{city_1}_{city_2}.png\")\n", + "plt.savefig(plot_path) # Save the plot as a PNG file\n", "\n", - "# print(df['precipitation.max'].max())" + "# Show the plot\n", + "plt.show()" ] } ], diff --git a/notebooks/notebook_current_data.ipynb b/notebooks/notebook_current_data.ipynb index c99efdd..d2d01a9 100644 --- a/notebooks/notebook_current_data.ipynb +++ b/notebooks/notebook_current_data.ipynb @@ -30,15 +30,13 @@ "sys.path.append(os.path.abspath(\"../src\"))\n", "\n", "# Now we can import the fucntion from the module\n", - "from my_package.fetch_current_data import fetch_current_data\n", + "from my_package.data import fetch_current_data\n", "\n", - "# Import function to replace nordic (æøå)\n", - "from my_package.util import replace_nordic\n", + "# Import function to for input_place, replace æøå\n", + "from my_package.util import input_place\n", "\n", "# User input the city, for the weather\n", - "city_name = input(\"Enter a city in Norway: \")\n", - "\n", - "city_name = replace_nordic(city_name)\n", + "city_name = input_place()\n", "\n", "# Stores the return of the function\n", "data, folder = fetch_current_data(city_name)" @@ -66,7 +64,7 @@ "# Gets the absolute path to the src folder\n", "sys.path.append(os.path.abspath(\"../src\"))\n", "\n", - "from my_package.write_data import write_data\n", + "from my_package.data import 
write_data\n", "\n", "# The user choose the filename\n", "filename = input(\"Write filename: \")\n", diff --git a/notebooks/notebook_one_day_data.ipynb b/notebooks/notebook_one_day_data.ipynb index 5f861d0..b88b58c 100644 --- a/notebooks/notebook_one_day_data.ipynb +++ b/notebooks/notebook_one_day_data.ipynb @@ -68,8 +68,7 @@ " \n", " return date_input, [ts[0] for ts in timestamps]\n", "\n", - "date, timestamps = get_unix_timestamps_for_day()\n", - "\n" + "date, timestamps = get_unix_timestamps_for_day()" ] }, { @@ -96,23 +95,19 @@ "sys.path.append(os.path.abspath(\"../src\"))\n", "\n", "# Now we can import the fucntion from the module\n", - "from my_package.fetch_data import fetch_data\n", + "from my_package.data import fetch_time_data\n", "\n", - "# Import function to replace nordic (æøå)\n", - "from my_package.util import replace_nordic\n", + "# Import function to for input_place, replace æøå\n", + "from my_package.util import input_place\n", "\n", - "# User choose a city they want the weather data from\n", - "city_name = input(\"Enter city name: \")\n", - "\n", - "city_name = replace_nordic(city_name)\n", + "# User input the city, for the weather\n", + "city_name = input_place()\n", "\n", "# Start_date is the first timestamp, end_date is the last\n", "start_date, end_date = timestamps[0], timestamps[-1]\n", "\n", - "city_name = replace_nordic(city_name)\n", - "\n", "# Stores the values in the variables\n", - "weather_data, folder = fetch_data(start_date, end_date, city_name)" + "weather_data, folder = fetch_time_data(start_date, end_date, city_name)" ] }, { @@ -136,7 +131,7 @@ "# Gets the absolute path to the src folder\n", "sys.path.append(os.path.abspath(\"../src\"))\n", "\n", - "from my_package.write_data import write_data\n", + "from my_package.data import write_data\n", "\n", "filename = input(\"Write filename: \")\n", "\n", @@ -150,7 +145,7 @@ "source": [ "### Lese fra fil\n", "\n", - "Henter opp data lagret i filen, lagd over, og skriver ut lesbart ved 
hjelp av pandas" + "Ved hjelp av funksjonen `extract_city_df` fjernes unødvendige kolonner, og dataen blir normalisert for lettere lesbarhet." ] }, { @@ -164,7 +159,7 @@ "# Reads from file using pandas\n", "weather_data = pd.read_json(f'../data/output_stedsnavn/data_{filename}.json')\n", "\n", - "from my_package.util import extract_city_df\n", + "from my_package.data import extract_city_df\n", "\n", "df = extract_city_df(weather_data)\n", "display(df)" @@ -212,10 +207,10 @@ "plt.figure(figsize=(12, 6))\n", "\n", "# Scatter plot for each temperature reading\n", - "plt.scatter(x_axis, temp, color='tab:green', label='Temperaturmålinger', alpha=0.6)\n", + "plt.scatter(x_axis, temp, color='tab:red', label='Temperaturmålinger', alpha=0.7)\n", "\n", "# Add a horizontal line for the mean temperature\n", - "plt.axhline(y=temp_mean, color='green', linestyle='--', label=f'Gj.snitt {temp_mean}°C')\n", + "plt.axhline(y=temp_mean, color='tab:red', linestyle=\"dashed\", label=f'Gj.snitt {temp_mean}°C')\n", "\n", "# Get the current axis and store it as ax\n", "ax = plt.gca()\n", @@ -250,8 +245,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Visualiserer nedbør\n", - "Ved hjelp av matplotlib visualiserer vi nedbør for ønsket dag." + "### Sjekker eksistensen av kolonner\n", + "\n", + "Gjennom prosjektet har vi oppdaget at kolonnene som ofte mangler er 'rain.1h' og 'snow.1h'. Derfor importerer vi funksjonen `ensure_column` som tar inn dataframen og kolonnene vi vil sjekke. Dersom kolonnene ikke eksisterer blir de lagt til og fylt med 'NaN' før dataframen returneres.\n", + "\n", + "Sjekker også for kolonnene 'wind.gust' og 'wind.speed' da de skal brukes til plotting senere."
] }, { @@ -260,76 +258,15 @@ "metadata": {}, "outputs": [], "source": [ - "import matplotlib.pyplot as plt\n", - "import matplotlib.dates as mdates\n", - "import numpy as np\n", - "\n", - "from my_package.util import ensure_rain_column\n", - "from my_package.util import ensure_snow_column\n", - "\n", - "x_axis = df.index\n", - "\n", - "# Checks if the rain is a value, it will not be if it is no rain and then cause a KeyError\n", - "try:\n", - " rain = df['rain.1h']\n", - "\n", - "# If no rain, make the rain column and fill it with NaN\n", - "except KeyError:\n", - " df = ensure_rain_column(df)\n", + "from my_package.util import ensure_column\n", "\n", - "# Checks if the snow is a value, it will not be if it is no rain and then cause a KeyError\n", - "try:\n", - " snow = df['snow.1h']\n", + "# The columns we want to check if exsist\n", + "columns_to_ensure = ['rain.1h', 'snow.1h', 'wind.speed', 'wind.gust']\n", "\n", - "# If no snow, make the snow column and fill it with NaN\n", - "except KeyError:\n", - " df = ensure_snow_column(df)\n", + "# Runs the function with wanted colummns\n", + "df = ensure_column(df, columns_to_ensure)\n", "\n", - "# Choose the width and height of the plot\n", - "plt.figure(figsize=(15, 6))\n", - "\n", - "# Check with rain, will cause NameError if the try/except over fails\n", - "try:\n", - " plt.bar(x_axis, rain, width=0.02, alpha=0.5, color='tab:blue', label='rain')\n", - "except: NameError\n", - "\n", - "# Check with snow, will cause NameError if the try/except over fails\n", - "try: \n", - " plt.bar(x_axis, snow, width=0.02, alpha=0.5, color='tab:grey', label='snow')\n", - "except: NameError\n", - "\n", - "# Get the current axsis, and store it as ax\n", - "ax = plt.gca()\n", - "\n", - "# Use the current ax, to get a tick-mark on the x_axis for each hour, and print like \"HH:MM\"\n", - "ax.xaxis.set_major_locator(mdates.HourLocator())\n", - "ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))\n", - "\n", - "# Add the 
label-desciption\n", - "plt.legend(loc = 'upper right')\n", - "\n", - "# Add title to the plot, with date\n", - "plt.title(f'Precipitation {city_name}, ({date}))')\n", - "\n", - "# Shows the plot\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Vise dataframe, med nye kolonner\n", - "Hvis dataframen ikke inneholdt 'rain.1h' eller 'snow.1h', skal de nå ha blitt lagt til med 'NaN' verdier." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Display df, to see if 'rain.1h' and 'snow.1h' was added with NaN values\n", + "# Display dataframe with any changes\n", "display(df)" ] }, @@ -340,7 +277,7 @@ "### Sjekk for manglende verdier\n", "Missigno sjekker og visualiserer manglende verdier, slik at det blir lettere å se hvilke kolonner feilen ligger i. \n", "\n", - "Vis the blir \"hull\" i en søyle, tyder the på manglende verdier." + "Hvis det blir \"hull\" i en søyle, tyder det på manglende verdier." ] }, { @@ -364,9 +301,9 @@ "\n", "Under sjekker vi først om regn eller snø er i målingen, og hvis de er, bytter vi ut NaN med 0. \n", "\n", - "Så sjekker vi om alle verdiene i en kolonne er 'NaN', isåfall så fjerner vi hele kolonnen. Grunne til at dette ikke inkluderer snø og regn, er fordi vi senere plotter disse verdiene, og da får vi ikke feil om verdien er 0, men vil få om hele kolonnen mangler.\n", + "Så sjekker vi om alle verdiene i en kolonne er 'NaN', i så fall fjerner vi hele kolonnen. Grunnen til at dette ikke inkluderer snø og regn, er fordi vi senere plotter disse verdiene, og da får vi ikke feil om verdien er 0, men vil få om hele kolonnen mangler.\n", "\n", - "Deretter sjekker vi andre verdier, og bytter enten 'NaN' med 0, eller med verdien før. Verdiene vi setter til 0 gjelder da snø, regn og vind, resten blir satt til verdien før."
+ "Deretter bruker vi interpolate dersom de skulle være NaN verdier, det er en funksjon som tar utgangspunkt i verdien før og verdien etter for å 'gjette' verdien som mangler. Vi har lagt til 'limit-direction', som gjør at den gjetter selv om man bare har en verdi på siden, som feks. første og siste verdi. " ] }, { @@ -375,54 +312,19 @@ "metadata": {}, "outputs": [], "source": [ - "from my_package.util import fill_rain_column\n", - "from my_package.util import fill_snow_column\n", + "from my_package.util import fill_column_0\n", "\n", - "df = fill_rain_column(df)\n", + "# The columns we want to replace 'NaN' with 0\n", + "columns_to_0 = ['rain.1h', 'snow.1h', 'wind.gust']\n", "\n", - "df = fill_snow_column(df)\n", + "# Runs the function with wanted columns\n", + "df = fill_column_0(df, columns_to_0)\n", "\n", "# Drops all the columns, if it has 'NaN' value.\n", "df = df.dropna(axis='columns', how='all')\n", "\n", - "# If wind_gust is stored, fill the NaN with 0\n", - "try: \n", - " df['wind.gust'] = df['wind.gust'].fillna(0)\n", - "except KeyError:\n", - " print(\"['wind.gust'], not in df\")\n", - "\n", - "# If wind_deg is stored, fill the NaN with 0\n", - "try: \n", - " df['wind.deg'] = df['wind.deg'].fillna(0)\n", - "except KeyError:\n", - " print(\"['wind.deg'], not in df\")\n", - "\n", - "# If wind_speed is stored, fill the NaN with 0\n", - "try: \n", - " df['wind.speed'] = df['wind.speed'].fillna(0)\n", - "except KeyError:\n", - " print(\"['wind.speed'], not in df\")\n", - "\n", - "# If temperature is missing, take the same as the one before\n", - "df['main.temp'] = df['main.temp'].fillna('obj.ffill()')\n", - "\n", - "# Forward fill missing values in what the temperature feels like\n", - "df['main.feels_like'] = df['main.feels_like'].fillna('obj.ffill()')\n", - "\n", - "# Forward fill missing values in the pressure\n", - "df['main.pressure'] = df['main.pressure'].fillna('obj.ffill()')\n", - "\n", - "# Forward fill missing values in the humidity\n", - 
"df['main.humidity'] = df['main.humidity'].fillna('obj.ffill()')\n", - "\n", - "# Forward fill missing values in the lowest temperature \n", - "df['main.temp_min'] = df['main.temp_min'].fillna('obj.ffill()')\n", - "\n", - "# Forward fill missing values in the highest temperature \n", - "df['main.temp_max'] = df['main.temp_max'].fillna('obj.ffill()')\n", - "\n", - "# Forward fill missing values of clouds\n", - "df['clouds.all'] = df['clouds.all'].fillna('obj.ffill()')\n", + "# Interpolate other missing 'NaN'-values\n", + "df = df.interpolate(method='linear', limit_direction='both')\n", "\n", "# Display the df, now without NaN\n", "display(df)" @@ -479,7 +381,7 @@ "# x_axis set to the index, which mean the datetime\n", "x_axis = df.index\n", "\n", - "# Gets the values\n", + "# Gets the values we need to for the plot\n", "rain = df['rain.1h']\n", "temp = df['main.temp']\n", "snow = df['snow.1h']\n", @@ -685,7 +587,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.6" + "version": "3.12.5" } }, "nbformat": 4, diff --git a/notebooks/notebook_one_week_data.ipynb b/notebooks/notebook_one_week_data.ipynb index a714766..e315da3 100644 --- a/notebooks/notebook_one_week_data.ipynb +++ b/notebooks/notebook_one_week_data.ipynb @@ -84,18 +84,16 @@ "sys.path.append(os.path.abspath(\"../src\"))\n", "\n", "# Now we can import the fucntion from the module\n", - "from my_package.fetch_data import fetch_data\n", + "from my_package.data import fetch_time_data\n", "\n", - "# Import function to replace nordic (æøå)\n", - "from my_package.util import replace_nordic\n", + "# Import function to for input_place, replace æøå\n", + "from my_package.util import input_place\n", "\n", "# User input the city, for the weather\n", - "city_name = input(\"Enter a city in Norway: \")\n", - "\n", - "city_name = replace_nordic(city_name)\n", + "city_name = input_place()\n", "\n", "# Stores the values in the variables\n", - "data, folder = 
fetch_data(unix_start_date, unix_end_date, city_name)\n" + "data, folder = fetch_time_data(unix_start_date, unix_end_date, city_name)\n" ] }, { @@ -120,7 +118,7 @@ "# Gets the absolute path to the src folder\n", "sys.path.append(os.path.abspath(\"../src\"))\n", "\n", - "from my_package.write_data import write_data\n", + "from my_package.data import write_data\n", "\n", "# User chose the name for the file\n", "filename = input(\"Write filename: \")\n", @@ -135,7 +133,7 @@ "source": [ "### Lese fra fil\n", "\n", - "Henter opp data lagret i filen, lagd over, og skriver ut lesbart ved hjelp av pandas" + "Ved hjelp av funksjonen `extract_city_df` fjernes unødvendige kolonner, og dataen blir normalisert for lettere lesbarhet." ] }, { @@ -146,11 +144,13 @@ "source": [ "import pandas as pd\n", "\n", - "# Read json-file using pandas\n", - "data = pd.read_json(f'../data/output_stedsnavn/data_{filename}.json')\n", + "# Reads from file using pandas\n", + "weather_data = pd.read_json(f'../data/output_stedsnavn/data_{filename}.json')\n", + "\n", + "from my_package.data import extract_city_df\n", "\n", - "# Display the data\n", - "display(data)" + "df = extract_city_df(weather_data)\n", + "display(df)" ] }, { @@ -171,30 +171,15 @@ "metadata": {}, "outputs": [], "source": [ - "import numpy as np\n", - "\n", - "from my_package.util import extract_city_df\n", - "from my_package.util import ensure_rain_column\n", - "from my_package.util import ensure_snow_column\n", - "\n", - "df = extract_city_df(data)\n", + "from my_package.util import ensure_column\n", "\n", - "# Checks if the rain is a value, it will not be if it is no rain and then cause a KeyError\n", - "try:\n", - " rain = df['rain.1h']\n", + "# Choose columns to ensure that are in the dataframe\n", + "columns_to_ensure = ['rain.1h', 'snow.1h']\n", "\n", - "# If no rain, make the rain column and fill it with NaN\n", - "except KeyError:\n", - " df = ensure_rain_column(df)\n", - "\n", - "# Checks if the snow is a value, it will 
not be if it is no snow and then cause a KeyError\n", - "try:\n", - " snow = df['snow.1h']\n", - "\n", - "# If no snow, make the snow column and fill it with NaN\n", - "except KeyError:\n", - " df = ensure_snow_column(df)\n", + "# Runs the function with the dataframe and the columns to ensure\n", + "df = ensure_column(df, columns_to_ensure)\n", "\n", + "# Display the dataframe, with potentially new columns\n", "display(df) " ] }, @@ -203,7 +188,9 @@ "metadata": {}, "source": [ "### Viser temperaturen\n", - "Regner ut gjennomsnittst-temperatur ved hjelp av innebygde funksjoner. Finner også høyeste og laveste målte temperatur." + "Regner ut gjennomsnittstemperatur ved hjelp av innebygde funksjoner. Finner også høyeste og laveste målte temperatur.\n", + "\n", + "Vi plotter også temperaturen i et scatter-diagram, med gjennomsnittet på en stiplet linje. " ] }, { @@ -212,16 +199,64 @@ "metadata": {}, "outputs": [], "source": [ - "# Extract main values\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.dates as mdates\n", + "\n", + "# Stores the temperature values\n", "temp = df['main.temp']\n", + "\n", "temp_mean = temp.mean().round(2)\n", - "temp_max = temp.max().round(2)\n", - "temp_min = temp.min().round(2)\n", "\n", "# Print the average temperature\n", "print(f'Mean temperatur: {temp_mean}')\n", - "print(f'Highest temperatur: {temp_max}')\n", - "print(f'Lowest temperatur: {temp_min}')" + "\n", + "# Find the highest and lowest temperatures\n", + "max_temp = df['main.temp'].max().round(2)\n", + "min_temp = df['main.temp'].min().round(2)\n", + "\n", + "print(\"Highest temperature:\", max_temp)\n", + "print(\"Lowest temperature:\", min_temp)\n", + "\n", + "\n", + "# Set the x_axis to the index, which represents the time\n", + "x_axis = df.index\n", + "\n", + "# Choose the width and height of the plot\n", + "plt.figure(figsize=(12, 6))\n", + "\n", + "# Scatter plot for each temperature reading\n", + "plt.scatter(x_axis, temp, color='tab:red',
label='Temperaturmålinger', alpha=0.7)\n", + "\n", + "# Add a horizontal line for the mean temperature\n", + "plt.axhline(y=temp_mean, color='tab:red', linestyle=\"dashed\", label=f'Gj.snitt {temp_mean}°C')\n", + "\n", + "# Get the current axis and store it as ax\n", + "ax = plt.gca()\n", + "\n", + "# Customize the x-axis to show ticks for each day\n", + "ax.xaxis.set_major_locator(mdates.DayLocator(interval=1)) # Tick marks for each day\n", + "ax.xaxis.set_major_formatter(mdates.DateFormatter('%b-%d %H:%M')) # Format as \"Month-day Hour:Minute\"\n", + "\n", + "# Rotate x-axis labels for better readability\n", + "plt.xticks(rotation=45)\n", + "\n", + "# Adjust layout\n", + "plt.tight_layout()\n", + "\n", + "# Add title for the plot\n", + "plt.title(f'Temperatur {city_name}, ({start_date} to {end_date})')\n", + "\n", + "# Add marker at 0 temperature\n", + "plt.axhline(y=0, color='black', linewidth=1.5)\n", + "\n", + "# Show grid\n", + "plt.grid()\n", + "\n", + "# Show legend\n", + "plt.legend(loc='upper right')\n", + "\n", + "# Show the plot\n", + "plt.show()\n" ] }, { @@ -253,7 +288,11 @@ "### Endre manglende verdier\n", "I de fleste tilfeller virker dataene å være tilnærmet \"perfekte\", men de inkluderer bare snø og regn dersom det er snø eller regn. Derfor vil vi fa NaN verdier i de målingene det ikke har regnet/snødd. \n", "\n", - "Under sjekker vi først om regn eller snø er i målingen, og hvis den er, bytter vi ut NaN med 0." + "Under sjekker vi først om regn eller snø er i målingen, og hvis de er, bytter vi ut NaN med 0. \n", + "\n", + "Så sjekker vi om alle verdiene i en kolonne er 'NaN', isåfall så fjerner vi hele kolonnen. 
Grunnen til at dette ikke inkluderer snø og regn, er fordi vi senere plotter disse verdiene, og da får vi ikke feil om verdien er 0, men vil få om hele kolonnen mangler.\n", + "\n", + "Deretter bruker vi interpolate dersom de skulle være NaN verdier, det er en funksjon som tar utgangspunkt i verdien før og verdien etter for å 'gjette' verdien som mangler. Vi har lagt til 'limit-direction', som gjør at den gjetter selv om man bare har en verdi på siden, som feks. første og siste verdi. " ] }, { @@ -262,50 +301,19 @@ "metadata": {}, "outputs": [], "source": [ - "from my_package.util import fill_rain_column\n", - "from my_package.util import fill_snow_column\n", - "\n", - "df = fill_rain_column(df)\n", - "df = fill_snow_column(df)\n", - "\n", - "# If wind_gust is stored, fill the NaN with 0\n", - "try: \n", - " df['wind.gust'] = df['wind.gust'].fillna(0)\n", - "except KeyError:\n", - " print(\"['wind.gust'], not in df\")\n", - "\n", - "# If wind_deg is stored, fill the NaN with 0\n", - "try: \n", - " df['wind.deg'] = df['wind.deg'].fillna(0)\n", - "except KeyError:\n", - " print(\"['wind.deg'], not in df\")\n", - "\n", - "# If wind_speed is stored, fill the NaN with 0\n", - "try: \n", - " df['wind.speed'] = df['wind.speed'].fillna(0)\n", - "except KeyError:\n", - " print(\"['wind.speed'], not in df\")\n", - "\n", - "# If temperature is missing, take the same as the one before\n", - "df['main.temp'] = df['main.temp'].fillna('obj.ffill()')\n", + "from my_package.util import fill_column_0\n", "\n", - "# Forward fill missing values in what the temperature feels like\n", - "df['main.feels_like'] = df['main.feels_like'].fillna('obj.ffill()')\n", + "# Columns we want to fill 'NaN' with 0\n", + "columns_to_fill = ['rain.1h', 'snow.1h']\n", "\n", - "# Forward fill missing values in the pressure\n", - "df['main.pressure'] = df['main.pressure'].fillna('obj.ffill()')\n", + "# Runs the function with the wanted columns\n", + "df = fill_column_0(df, columns_to_fill)\n", "\n", - "# 
Forward fill missing values in the humidity\n", - "df['main.humidity'] = df['main.humidity'].fillna('obj.ffill()')\n", + "# Drops all the columns, if it has 'NaN' value.\n", + "df = df.dropna(axis='columns', how='all')\n", "\n", - "# Forward fill missing values in the lowest temperature \n", - "df['main.temp_min'] = df['main.temp_min'].fillna('obj.ffill()')\n", - "\n", - "# Forward fill missing values in the highest temperature \n", - "df['main.temp_max'] = df['main.temp_max'].fillna('obj.ffill()')\n", - "\n", - "# Forward fill missing values of clouds\n", - "df['clouds.all'] = df['clouds.all'].fillna('obj.ffill()')\n", + "# Interpolate other missing 'NaN'-values\n", + "df = df.interpolate(method='linear', limit_direction='both')\n", "\n", "# Display the df, now without NaN\n", "display(df)" @@ -443,144 +451,6 @@ "# Show the plot\n", "plt.show()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import statistics\n", - "\n", - "# Extract temperature columns\n", - "temp_mean = df['main.temp']\n", - "\n", - "# Calculate means\n", - "temp_mean_mean = temp_mean.mean()\n", - "\n", - "\n", - "# Calculate standard deviations\n", - "temp_mean_stdev = statistics.stdev(temp_mean)\n", - "\n", - "\n", - "# Calculate 3 standard deviation limits\n", - "mean_lower_limit = temp_mean_mean - (temp_mean_stdev * 3)\n", - "mean_upper_limit = temp_mean_mean + (temp_mean_stdev * 3)\n", - "\n", - "# Identify outliers\n", - "mean_outliers = df.loc[(df['main.temp'] > mean_upper_limit) | (df['main.temp'] < mean_lower_limit), 'main.temp']\n", - "\n", - "# Print the outliers\n", - "print(\"Outliers in main.temp:\")\n", - "print(mean_outliers)\n", - "\n", - "# Replace outliers with NaN\n", - "df.loc[(df['main.temp'] > mean_upper_limit) | (df['main.temp'] < mean_lower_limit), 'main.temp'] = np.nan\n", - "\n", - "# Interpolate to replace NaN values with linear interpolation\n", - "df['main.temp'] = 
df['main.temp'].interpolate(method='linear')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import matplotlib.dates as mdates\n", - "import os\n", - "\n", - "# Where the figure should be saved when exported\n", - "output_folder = \"../data/output_fig\"\n", - "\n", - "# Creates the folder if it does not exist\n", - "os.makedirs(output_folder, exist_ok=True)\n", - "\n", - "# x_axis set to the index, which mean the datetime\n", - "x_axis = df.index\n", - "\n", - "# Gets the values\n", - "rain = df['rain.1h']\n", - "temp = df['main.temp']\n", - "snow = df['snow.1h']\n", - "wind_gust = df['wind.gust']\n", - "wind_speed = df['wind.speed']\n", - "temp_mean = temp.mean().round(2)\n", - "\n", - "# Two vertically stacked axis, (2 rows, 1 column), width and height of the figure, and the axis share the same x_axis\n", - "fig, (ax1, ax3) = plt.subplots(2, 1,figsize=(15, 8), sharex=True)\n", - "\n", - "\n", - "# Set the title for the diagram, above the first axis, with city_name and input_date\n", - "ax1.set_title(f'Weather data for {city_name} ({start_date}) to ({end_date}) ')\n", - "\n", - "# Plot temperature on the primary y-axis\n", - "ax1.plot(x_axis, temp, color='tab:red', label='Temperature (°C)')\n", - "ax1.axhline(y=temp_mean, color='tab:red', linestyle='dashed', label='Mean temperature (°C)')\n", - "ax1.axhline(y=0, color='black', linewidth=1.5)\n", - "\n", - "# Design the y-axis for temperatur\n", - "ax1.set_ylabel('Temperature (°C)', color='tab:red')\n", - "ax1.tick_params(axis='y', labelcolor='tab:red')\n", - "\n", - "# Plot Precipitation as bars on the secondary y-axis\n", - "ax2 = ax1.twinx()\n", - "\n", - "# Add rain\n", - "# ax2.bar(x_axis, rain, color='tab:blue', alpha=0.5, width=0.02, label='Rain (mm)')\n", - "ax2.hist(x_axis, bins=len(x_axis), weights=rain, color='tab:blue', alpha=0.5, label= 'Rain (mm)', bottom=snow)\n", - "\n", - "# Add snow\n", - "# 
ax2.bar(x_axis, snow, color='tab:grey', alpha=0.5, width=0.02, label='Snow (mm)')\n", - "ax2.hist(x_axis, bins=len(x_axis), weights=snow, color='tab:gray', alpha=0.5, label= 'Snow (mm)')\n", - "\n", - "# Design the y-axis for precipiation\n", - "ax2.set_ylabel(\"Precipitation (mm)\", color='tab:blue')\n", - "ax2.tick_params(axis='y', labelcolor='tab:blue')\n", - "\n", - "\n", - "# Customize the x-axis to show ticks for each hour\n", - "ax1.xaxis.set_major_locator(mdates.HourLocator(interval=12)) # Tick marks for every hour\n", - "ax1.xaxis.set_major_formatter(mdates.DateFormatter('%d %b %H')) # Format as \"Day Month Hour:Minute\"\n", - "\n", - "# Add label-description for both axis\n", - "ax1.legend(loc='upper left')\n", - "ax2.legend(loc='upper right')\n", - "\n", - "# Add grid, but only vertically\n", - "ax1.grid(axis = 'x')\n", - "\n", - "\n", - "# Plot the wind at the second x-axis (the axis below)\n", - "ax3.plot(x_axis, wind_gust, color='tab:purple', linestyle='dashed', label='Wind_gust')\n", - "ax3.plot(x_axis, wind_speed, color='tab:purple', label='Wind_speed')\n", - "ax3.set_ylabel('Wind (m/s)')\n", - "\n", - "# Add x_label visible for both x-axis\n", - "ax3.set_xlabel('Datetime')\n", - "\n", - "# Add label-description\n", - "ax3.legend(loc='upper right')\n", - "\n", - "# Customize the x-axis to show ticks for each hour\n", - "ax3.xaxis.set_major_locator(mdates.HourLocator(interval=12)) # Tick marks for every hour\n", - "ax3.xaxis.set_major_formatter(mdates.DateFormatter('%d %b %H')) # Format as \"Day Month Hour:Minute\"\n", - "\n", - "# Add grid, but only vertically\n", - "ax3.grid(axis = 'x')\n", - "\n", - "# Adjust layout\n", - "plt.tight_layout()\n", - "\n", - "# Save the plot to the data/output_fig folder\n", - "plot_path = os.path.join(output_folder, f\"weather_data_plot{city_name}.png\")\n", - "plt.savefig(plot_path) # Save the plot as a PNG file\n", - "\n", - "# Show the plot\n", - "plt.show()" - ] } ], "metadata": { diff --git 
a/notebooks/notebook_statistic_data.ipynb b/notebooks/notebook_statistic_data.ipynb index cdd8ca1..6e70923 100644 --- a/notebooks/notebook_statistic_data.ipynb +++ b/notebooks/notebook_statistic_data.ipynb @@ -34,13 +34,11 @@ "# Now we can import the fucntion from the module\n", "from my_package.year_data import fetch_data\n", "\n", - "# Import function to replace nordic (æøå)\n", - "from my_package.util import replace_nordic\n", + "# Import function to for input_place, replace æøå\n", + "from my_package.util import input_place\n", "\n", "# User input the city, for the weather\n", - "city_name = input(\"Enter a city in Norway: \")\n", - "\n", - "city_name = replace_nordic(city_name)\n", + "city_name = input_place()\n", "\n", "data, folder = fetch_data(city_name)" ] @@ -111,16 +109,10 @@ "outputs": [], "source": [ "import pandas as pd\n", + "from my_package.util import extract_city_data_stat\n", "\n", - "# Checks if the 'result' column is in the data\n", - "if 'result' in data:\n", - " # Normalize the json and store it as a dataframe for better readability\n", - " df = pd.json_normalize(data['result'])\n", - "\n", - " # Display the dataframe\n", - " display(df)\n", - "else:\n", - " print(\"'result' not in data\")" + "df = extract_city_data_stat(data)\n", + "display(df)" ] }, { @@ -139,11 +131,10 @@ "metadata": {}, "outputs": [], "source": [ - "# Drop all columns that end with '...' 
using the filter function\n", - "df = df.drop(columns=df.filter(like='.p25').columns)\n", - "df = df.drop(columns=df.filter(like='.p75').columns)\n", - "df = df.drop(columns=df.filter(like='.st_dev').columns)\n", - "df = df.drop(columns=df.filter(like='.num').columns)\n", + "from my_package.util import clean_df\n", + "\n", + "# Cleans data for unessecarily columns\n", + "df = clean_df(df)\n", "\n", "display(df)" ] @@ -152,9 +143,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Plotter temperatur\n", - "Denne koden plotter data basert på gjennomsnitts temperatur gjennom året. For å sikre lagring av de ulike kjøringene, vil grafen bli lagret i mappen \"../data/output_fig/mean_temp_plot_{city_name}.json\"\n", - "\n" + "### Viser temperaturen\n", + "Vi bruker pandas SQL for å hente ut ønsket tempeartur fra statistic_data og lagrer den i en tabell.\n", + "\n", + "Ved hjelp av en pandas SQL setning kan vi hente og lagre gjennomsnitt, maksimalt og minste målte temperatur. Senere kan vi bare skrive en SELECT setning til denne variabelen." 
] }, { @@ -163,10 +155,7 @@ "metadata": {}, "outputs": [], "source": [ - "import matplotlib.pyplot as plt\n", - "import matplotlib.dates as mdates\n", - "import os\n", - "import sys\n", + "from pandasql import sqldf\n", "\n", "# Gets the absolute path to the src folder\n", "sys.path.append(os.path.abspath(\"../src\"))\n", @@ -174,47 +163,27 @@ "# Import the kelvin to celsius function\n", "from my_package.util import kelvin_to_celsius\n", "\n", - "output_folder = \"../data/output_fig\"\n", - "os.makedirs(output_folder, exist_ok=True) # Create the folder if it doesn't exist\n", - "\n", - "# Converts to and make a new column with celsius temp, and not kelvin\n", "df['temp.mean_celsius'] = kelvin_to_celsius(df['temp.mean'])\n", - "temp = df['temp.mean_celsius']\n", - "\n", - "# Convert from day and month, to datetime\n", - "# df['date'] = pd.to_datetime(df[['month', 'day']].assign(year=2024))\n", + "df['temp.max_celsius'] = kelvin_to_celsius(df['temp.record_max'])\n", + "df['temp.min_celsius'] = kelvin_to_celsius(df['temp.record_min'])\n", "\n", "# Create a new column that concatenates month and day (e.g., \"03-01\" for March 1)\n", "df['month_day'] = df[['month', 'day']].apply(lambda x: f\"{x['month']:02d}-{x['day']:02d}\",axis=1)\n", "\n", - "# Plot the graph of the mean temperature\n", - "plt.figure(figsize=(12, 6))\n", - "plt.plot(df['month_day'], temp)\n", + "temp_data = sqldf('''\n", + " SELECT month_day, `temp.mean_celsius` as temp\n", + " FROM df\n", + "''')\n", "\n", - "# Label for easier reading and understanding of the plot\n", - "plt.title(f\"Mean temp - statistic historical {city_name}\")\n", - "plt.xlabel(\"Date\")\n", - "plt.ylabel(\"Temperature (°C)\")\n", + "display(temp_data)\n", "\n", - "# Add marker at 0 temperature\n", - "plt.axhline(y=0, color='black', linewidth=1.5)\n", + "# Extract and stores temperatur data for each city using pandas sql\n", + "stat_temp = sqldf('''\n", + " SELECT AVG(`temp.mean_celsius`) AS avg_temp, MAX(`temp.max_celsius`) AS 
max_temp, MIN(`temp.min_celsius`) AS min_temp\n", + " FROM df\n", + " ''')\n", "\n", - "# Customize the x-axis to show ticks and labels only at the start of each month\n", - "plt.gca().xaxis.set_major_locator(mdates.MonthLocator()) \n", - "# Format ticks to show abbreviated month names (e.g., Jan, Feb)\n", - "plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b')) \n", - "\n", - "plt.xticks(rotation=45)\n", - "plt.yticks(range(-20, 30, 2))\n", - "plt.tight_layout()\n", - "plt.grid()\n", - "\n", - "# Save the plot to the data/output_fig folder\n", - "plot_path = os.path.join(output_folder, f\"mean_temp_plot_{city_name}.png\")\n", - "plt.savefig(plot_path) # Save the plot as a PNG file\n", - "\n", - "# Show the plot\n", - "plt.show()\n" + "display(stat_temp)" ] }, { @@ -233,27 +202,18 @@ "source": [ "import matplotlib.pyplot as plt\n", "import matplotlib.dates as mdates\n", - "import os\n", - "import sys\n", - "\n", - "# Gets the absolute path to the src folder\n", - "sys.path.append(os.path.abspath(\"../src\"))\n", "\n", - "# Import the kelvin to celsius function\n", - "from my_package.util import kelvin_to_celsius\n", - "\n", - "# Defines the output folder for the figure, and makes it if is does not exsist\n", "output_folder = \"../data/output_fig\"\n", - "os.makedirs(output_folder, exist_ok=True) \n", + "os.makedirs(output_folder, exist_ok=True) # Create the folder if it doesn't exist\n", + "\n", "\n", - "# Converts to and make a new column with celsius temp, and not kelvin\n", - "df['temp.mean_celsius'] = kelvin_to_celsius(df['temp.mean'])\n", "temp = df['temp.mean_celsius']\n", - "precipitation = df['precipitation.mean']\n", - "wind = df['wind.mean']\n", + "temp_mean = sqldf('''SELECT avg_temp FROM stat_temp''').iloc[0, 0]\n", "\n", - "# Create a new column that concatenates month and day (e.g., \"03-01\" for March 1)\n", - "df['month_day'] = df[['month', 'day']].apply(lambda x: f\"{x['month']:02d}-{x['day']:02d}\",axis=1)\n", + "# Extract precipitation 
values for both cities\n", + "# Because pandas sql returnes the value as a dataframe, we need to get the actual value (all rows, first column)\n", + "precipitation = sqldf('''SELECT `precipitation.mean` FROM df''').iloc[:,0]\n", + "wind = sqldf('''SELECT `wind.mean` FROM df''').iloc[:,0]\n", "\n", "x_axis = df['month_day']\n", "\n", @@ -267,6 +227,7 @@ "\n", "# Add marker at 0 temperature\n", "ax1.axhline(y=0, color='black', linewidth=1.5)\n", + "ax1.axhline(y=temp_mean, color='red', linestyle=\"dashed\")\n", "\n", "# Plot precipitation as bars on the secondary y-axis\n", "ax2 = ax1.twinx()\n", @@ -295,9 +256,7 @@ "plt.tight_layout()\n", "\n", "# Show the plot\n", - "plt.show()\n", - "\n", - "print(df['precipitation.max'].max())" + "plt.show()" ] }, { @@ -447,9 +406,9 @@ "df.loc[(df['temp.record_max_celsius'] > max_upper_limit) | (df['temp.record_max_celsius'] < max_lower_limit), 'temp.record_max_celsius'] = np.nan\n", "\n", "# Interpolate to replace NaN values with linear interpolation\n", - "df['temp.mean_celsius'] = df['temp.mean_celsius'].interpolate(method='linear')\n", - "df['temp.record_min_celsius'] = df['temp.record_min_celsius'].interpolate(method='linear')\n", - "df['temp.record_max_celsius'] = df['temp.record_max_celsius'].interpolate(method='linear')" + "df['temp.mean_celsius'] = df['temp.mean_celsius'].interpolate(method='linear', limit_direction='both')\n", + "df['temp.record_min_celsius'] = df['temp.record_min_celsius'].interpolate(method='linear', limit_direction='both')\n", + "df['temp.record_max_celsius'] = df['temp.record_max_celsius'].interpolate(method='linear', limit_direction='both')" ] }, { diff --git a/requirements.txt b/requirements.txt index d671daa..062b51c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,6 +22,7 @@ executing==2.2.0 fastjsonschema==2.21.1 fonttools==4.56.0 fqdn==1.5.1 +greenlet==3.1.1 h11==0.14.0 httpcore==1.0.7 httpx==0.28.1 @@ -34,6 +35,7 @@ ipywidgets==8.1.5 isoduration==20.11.0 jedi==0.19.2 Jinja2==3.1.6 
+joblib==1.4.2 json5==0.10.0 jsonpointer==3.0.0 jsonschema==4.23.0 @@ -68,6 +70,7 @@ numpy==1.26.4 overrides==7.7.0 packaging==24.2 pandas==2.2.3 +pandasql==0.7.3 pandocfilters==1.5.1 parso==0.8.4 pexpect==4.9.0 @@ -94,6 +97,7 @@ requests==2.32.3 rfc3339-validator==0.1.4 rfc3986-validator==0.1.1 rpds-py==0.23.1 +scikit-learn==1.6.1 scipy==1.15.2 seaborn==0.13.2 Send2Trash==1.8.3 @@ -101,9 +105,11 @@ setuptools==75.8.2 six==1.17.0 sniffio==1.3.1 soupsieve==2.6 +SQLAlchemy==2.0.40 stack-data==0.6.3 terminado==0.18.1 text-unidecode==1.3 +threadpoolctl==3.6.0 tinycss2==1.4.0 tornado==6.4.2 tqdm==4.67.1 diff --git a/src/README.md b/src/README.md index fca3fd8..b1fb22f 100644 --- a/src/README.md +++ b/src/README.md @@ -6,11 +6,18 @@ Mye av funksjonaliteten og funksjonener er skrevet i en vanlig `.py` fil, før d Her kommer en kjapp forklaring av de ulike filene og deres funksjoner: - `date_to_unix.py` bruker innebygde moduler som datetime og time, for å gjøre om datoer og tider til unix timestamp, sekunder fra 1. januar 1970. -- `fetch_current_data.py` funksjon for å hente nåværende data for ønsket sted fra API-en. Sender feilkode dersom statusen ikke har 200, altså ok. -- `fetch_data.py` henter data for ønsket sted, fra ønsket starttid til sluttid. Sender feilkode dersom statusen ikke har 200, altså ok. -- `get_record.py` brukt i `notebook_statistic_data.ipynb` for å finne rekord-målinger som høyeste og laveste målte temperatur. - `setup.py` funskjon for å hjelpe brukeren å lage en .env fil for å lagre API-key og email. +- `data.py` inneholder flere ulike funksjoner relatert til dataen. Til for eksempel: + - Hente current data + - Hente data for ønsket periode/dag + - Hente statistisk data + - Skrive data til json fil + - Hente data fra json fil, og fjerne metadata +- `util.py` inneholder ulike funksjoner som brukes flere ganger og derfor samles i en funksjon. 
Til for eksempel: + - Input funksjon, som renser for nordiske 'æøå' + - Konverterer grader fra kelvin til celsius + - Sjekker at ønskede kolonner eksisterer i datasettet + - Fyller "NaN" verdier med 0, i ønskede kolonner + - Fjerner alle uønskede kolonner med samme endelse (notebook_statistic_data.ipynb) + - Henter rekord målinger (notebook_statistic_data.ipynb) - `test_module.py` en test funksjon for å sjekke at venv og implementering til notebook funker som det skal. -- `util.py` inneholder funksjoner for å erstatte nordiske (æøå) og å omgjøre temperaturer fra kelvin til celsius. Altså funksjoner som bare er en enkel del av noe større. -- `write_data.py` lagrer data i json-format, med ønsket filnavn til en 'passende' mappe basert på hvor funksjonen brukes. -- `year_data.py` henter statistisk værdata basert på historikk for ønsket sted. Sender feilkode dersom statusen ikke har 200, altså ok. \ No newline at end of file diff --git a/src/my_package/data.py b/src/my_package/data.py new file mode 100644 index 0000000..23889d1 --- /dev/null +++ b/src/my_package/data.py @@ -0,0 +1,139 @@ +# Import of needed libaries +import requests +import os +from dotenv import load_dotenv +import json +import pandas as pd + +load_dotenv() + +# Gets the key, from my env file +API_KEY = os.getenv("API_KEY") + +# Gets the current data from the API - openweathermap.org +def fetch_current_data(city_name): + + + # f-string url, to add the "custom" variables to the API-request + url = f"https://api.openweathermap.org/data/2.5/weather?q={city_name},NO&units=metric&appid={API_KEY}" + + # Saves the API-request for the url + response = requests.get(url) + + # Checks if the status code is OK + if response.status_code == 200: + + # Converts the data into json + data = response.json() + folder = "../data/output_current_data" + + print("Data fetch: ok") + return data, folder + + + else: + # If html status code != 200, print the status code + print("Failed to fetch data from API. 
Status code:", response.status_code) + + +# Gets the wanted timeperiod data from the API - openweathermap.org +def fetch_time_data(start_date, end_date, city_name): + + + # f-string url, to add the "custom" variables to the API-request + url = f"https://history.openweathermap.org/data/2.5/history/city?q={city_name},NO&units=metric&type=hour&start={start_date}&end={end_date}&appid={API_KEY}" + + # Saves the API-request for the url + response = requests.get(url) + + # Checks if the status code is OK + if response.status_code == 200: + + # Converts the data into json + data = response.json() + folder = "../data/output_stedsnavn" + + print("Data fetch: ok") + return data, folder + + else: + # If html status code != 200, print the status code + print(f"Failed to fetch data for {city_name} from API. Status code:", response.status_code) + + +# Gets statistical data from the API - openweathermap.org +def fetch_stat_data(city_name): + + + # f-string url, to add the "custom" variables to the API-request + url = f"https://history.openweathermap.org/data/2.5/aggregated/year?q={city_name},NO&appid={API_KEY}&units=metric" + + # Saves the API-request for the url + response = requests.get(url) + + # Checks if the status code is OK + if response.status_code == 200: + + # Converts the data into json + data = response.json() + folder = "../data/output_statistikk" + + print("Data fetch: ok") + return data, folder + + else: + # If html status code != 200, print the status code + print(f"Failed to fetch data for city: {city_name} from API. 
Status code:", response.status_code) + + +# Write data to json-file +def write_data(data, folder, filename): + # Ensure the 'output_stedsdata' folder exists inside the 'data' folder at the root of the project + script_dir = os.path.dirname(os.path.abspath(__file__)) # Get the directory of the script + project_root = os.path.abspath(os.path.join(script_dir, os.pardir, os.pardir)) # Navigate to the root of the project + data_dir = os.path.join(project_root, 'data', folder) + os.makedirs(data_dir, exist_ok=True) # Creates 'data/output_stedsdata' folder if it doesn't exist + + # Write the JSON data to a file inside the 'output_stedsdata' folder + file_path = os.path.join(data_dir, f'data_{filename}.json') # Creates 'data/output_stedsdata/data_{filename}.json' + + # Opens and write the data to a json file + with open(file_path, 'w') as json_file: + json.dump(data, json_file, indent=4) + + # Prints when succed + print(f"Data has been written to {file_path}") + + +# Function to 'normalize' the dataset, with index-changing and dropping meta-data +def extract_city_df(weather_data): + if 'list' in weather_data: + # Normalize the json for better readability + df = pd.json_normalize(weather_data['list']) + + # Delete duplicates based on the dt row, all the other values can appear more than once, but the date should only appear once + df = df.drop_duplicates(subset=['dt']) + + # The weather column dosnt have any releated information, therefor we delete it + df = df.drop(columns="weather") + + # Convert 'dt' column from Unix timestamp to datetime and set it as the index + df['dt'] = pd.to_datetime(df['dt'], unit='s') + df.set_index('dt', inplace=True) + return df + + else: + return None + +# Function to 'normalize' the dataset for statistic-data, with index-changing and dropping meta-data +def extract_city_data_stat(data): + # Checks if the 'result' column is in the data + if 'result' in data: + # Normalize the json and store it as a dataframe for better readability + df = 
pd.json_normalize(data['result']) + + # Display the dataframe + return df + else: + print("'result' not in data") + return None \ No newline at end of file diff --git a/src/my_package/fetch_current_data.py b/src/my_package/fetch_current_data.py deleted file mode 100644 index 787f3c3..0000000 --- a/src/my_package/fetch_current_data.py +++ /dev/null @@ -1,38 +0,0 @@ -# Import of needed libaries -import requests -import os -from dotenv import load_dotenv - -load_dotenv() - -# Gets the key, from my env file -API_KEY = os.getenv("API_KEY") - -# city_name = "Trondheim" -country_code = "NO" - - -# Gets the data from the API - openweathermap.org -def fetch_current_data(city_name): - - - # f-string url, to add the "custom" variables to the API-request - url = f"https://api.openweathermap.org/data/2.5/weather?q={city_name},NO&units=metric&appid={API_KEY}" - - # Saves the API-request for the url - response = requests.get(url) - - # Checks if the status code is OK - if response.status_code == 200: - - # Converts the data into json - data = response.json() - folder = "../data/output_current_data" - - print("Data fetch: ok") - return data, folder - - - else: - # If html status code != 200, print the status code - print("Failed to fetch data from API. 
Status code:", response.status_code) \ No newline at end of file diff --git a/src/my_package/fetch_data.py b/src/my_package/fetch_data.py deleted file mode 100644 index f087d28..0000000 --- a/src/my_package/fetch_data.py +++ /dev/null @@ -1,40 +0,0 @@ -# Import of needed libaries -import requests -import os -from dotenv import load_dotenv - -load_dotenv() - -# Gets the key, from my env file -API_KEY = os.getenv("API_KEY") - -# city_name = "Trondheim" -# country_code = "NO" - -# Temporarily standard times -# start_date = 1735686000 -# end_date = 1740009323 - -# Gets the data from the API - openweathermap.org -def fetch_data(start_date, end_date, city_name): - - - # f-string url, to add the "custom" variables to the API-request - url = f"https://history.openweathermap.org/data/2.5/history/city?q={city_name},NO&units=metric&type=hour&start={start_date}&end={end_date}&appid={API_KEY}" - - # Saves the API-request for the url - response = requests.get(url) - - # Checks if the status code is OK - if response.status_code == 200: - - # Converts the data into json - data = response.json() - folder = "../data/output_stedsnavn" - - print("Data fetch: ok") - return data, folder - - else: - # If html status code != 200, print the status code - print(f"Failed to fetch data for {city_name} from API. 
Status code:", response.status_code) diff --git a/src/my_package/get_record.py b/src/my_package/get_record.py deleted file mode 100644 index 7454681..0000000 --- a/src/my_package/get_record.py +++ /dev/null @@ -1,23 +0,0 @@ -import pandas as pd - -def get_records(df, city_name): - if df.empty: - print("df is empty") - - else: - max_temp_mean = df['temp.mean_celsius'].max() - min_temp_mean = df['temp.mean_celsius'].min() - - max_temp = df['temp.record_max_celsius'].max() - min_temp = df['temp.record_min_celsius'].min() - - summary_data = { - "Metric": ["Max Temp mean (°C)", "Min Temp Mean (°C)", "Max Temp (°C)", "Min temp (°C)"], - "Values": [max_temp_mean, min_temp_mean, max_temp, min_temp] - } - - summary_df = pd.DataFrame(summary_data) - folder = "../data/output_record" - filename = f"records_{city_name}" - - return summary_df, filename, folder \ No newline at end of file diff --git a/src/my_package/util.py b/src/my_package/util.py index 0c01171..e23afcb 100644 --- a/src/my_package/util.py +++ b/src/my_package/util.py @@ -10,73 +10,62 @@ def replace_nordic(city_name): city_name = city_name.replace('å', 'aa') return city_name +# Get the city_name input, convert nordic 'æøå' +def input_place(): + city_name = input("Enter city name: ") + city_name = replace_nordic(city_name) + return city_name + # Function to convert from kelvin to celsius temp def kelvin_to_celsius(temp_in_kelvin): temp_in_celsius = temp_in_kelvin - 273.15 return temp_in_celsius -# Fucntion to check if there are a ['rain.1h'] column in the dataset, if not make one -def ensure_rain_column(df): - try: - _ = df['rain.1h'] - - # If no rain, make the rain column and fill it with NaN - except KeyError: - print("'Rain' is not present in the JSON file.") - df['rain.1h'] = np.nan - +# Ensure wqnted columns, fill with "NaN" if they dont exsist +def ensure_column(df, columns): + for col in columns: + if col not in df.columns: + print(f"'{col}' is not present in the DataFrame. 
Filling with NaN.") + df[col] = np.nan return df -# Fucntion to check if there are a ['snow.1h'] column in the dataset, if not make one -def ensure_snow_column(df): - try: - _ = df['snow.1h'] - - # If no snow, make the snow column and fill it with NaN - except KeyError: - print("'Snow' is not present in the JSON file.") - df['snow.1h'] = np.nan - +# Fill "NaN" in wanted columns with 0 +def fill_column_0(df, columns): + for col in columns: + try: + df[col] = df[col].fillna(0) + except KeyError: + print(f"'{col}' not in DataFrame") return df -# Function to fill NaN values in ['rain.1h'] columns with 0 -def fill_rain_column(df): - try: - # Replace the NaN with 0, using pandas function - df['rain.1h'] = df['rain.1h'].fillna(0) - - except KeyError: - print(["'rain.1h', not in df"]) +# Cleans statistic-data, drop all columns that end with '...' using the filter function +def clean_df(df): + df = df.drop(columns=df.filter(like='.p25').columns) + df = df.drop(columns=df.filter(like='.p75').columns) + df = df.drop(columns=df.filter(like='.st_dev').columns) + df = df.drop(columns=df.filter(like='.num').columns) return df -# Function to fill NaN values in ['snow.1h'] columns with 0 -def fill_snow_column(df): - try: - # Replace the NaN with 0, using pandas function - df['snow.1h'] = df['snow.1h'].fillna(0) - - except KeyError: - print(["'snow.1h', not in df"]) +# Find highest and lowest temp from statistical data +def get_records(df, city_name): + if df.empty: + print("df is empty") - return df - -# Function to 'normalize' the dataset, with index-changing and dropping meta-data -def extract_city_df(weather_data): - if 'list' in weather_data: - # Normalize the json for better readability - df = pd.json_normalize(weather_data['list']) + else: + max_temp_mean = df['temp.mean_celsius'].max() + min_temp_mean = df['temp.mean_celsius'].min() - # Delete duplicates based on the dt row, all the other values can appear more than once, but the date should only appear once - df = 
df.drop_duplicates(subset=['dt']) + max_temp = df['temp.record_max_celsius'].max() + min_temp = df['temp.record_min_celsius'].min() - # The weather column dosnt have any releated information, therefor we delete it - df = df.drop(columns="weather") + summary_data = { + "Metric": ["Max Temp mean (°C)", "Min Temp Mean (°C)", "Max Temp (°C)", "Min temp (°C)"], + "Values": [max_temp_mean, min_temp_mean, max_temp, min_temp] + } - # Convert 'dt' column from Unix timestamp to datetime and set it as the index - df['dt'] = pd.to_datetime(df['dt'], unit='s') - df.set_index('dt', inplace=True) - return df + summary_df = pd.DataFrame(summary_data) + folder = "../data/output_record" + filename = f"records_{city_name}" - else: - return None \ No newline at end of file + return summary_df, filename, folder \ No newline at end of file diff --git a/src/my_package/write_data.py b/src/my_package/write_data.py deleted file mode 100644 index e46eb2a..0000000 --- a/src/my_package/write_data.py +++ /dev/null @@ -1,19 +0,0 @@ -import json -import os - -def write_data(data, folder, filename): - # Ensure the 'output_stedsdata' folder exists inside the 'data' folder at the root of the project - script_dir = os.path.dirname(os.path.abspath(__file__)) # Get the directory of the script - project_root = os.path.abspath(os.path.join(script_dir, os.pardir, os.pardir)) # Navigate to the root of the project - data_dir = os.path.join(project_root, 'data', folder) - os.makedirs(data_dir, exist_ok=True) # Creates 'data/output_stedsdata' folder if it doesn't exist - - # Write the JSON data to a file inside the 'output_stedsdata' folder - file_path = os.path.join(data_dir, f'data_{filename}.json') # Creates 'data/output_stedsdata/data_{filename}.json' - - # Opens and write the data to a json file - with open(file_path, 'w') as json_file: - json.dump(data, json_file, indent=4) - - # Prints when succed - print(f"Data has been written to {file_path}") \ No newline at end of file diff --git 
a/src/my_package/year_data.py b/src/my_package/year_data.py deleted file mode 100644 index 3d6f780..0000000 --- a/src/my_package/year_data.py +++ /dev/null @@ -1,38 +0,0 @@ -# Import of needed libaries -import requests -import os -from dotenv import load_dotenv - - -load_dotenv() - -# Gets the key, from my env file -API_KEY = os.getenv("API_KEY") - -# city_name = "Maura" - -# Gets the data from the API - openweathermap.org -def fetch_data(city_name): - - - # f-string url, to add the "custom" variables to the API-request - url = f"https://history.openweathermap.org/data/2.5/aggregated/year?q={city_name},NO&appid={API_KEY}&units=metric" - - # Saves the API-request for the url - response = requests.get(url) - - # Checks if the status code is OK - if response.status_code == 200: - - # Converts the data into json - data = response.json() - folder = "../data/output_statistikk" - - print("Data fetch: ok") - return data, folder - - else: - # If html status code != 200, print the status code - print(f"Failed to fetch data for city: {city_name} from API. Status code:", response.status_code) - -# myData = fetch_data(city_name) \ No newline at end of file