From 8bb8ecdf7defc3b998c08c07b177b7e08688ee9d Mon Sep 17 00:00:00 2001 From: toravest Date: Sun, 20 Apr 2025 18:00:35 +0200 Subject: [PATCH] add pandasql, sql-query for data compare --- .../notebook_compare_one_week_data.ipynb | 261 +++++++----------- 1 file changed, 98 insertions(+), 163 deletions(-) diff --git a/notebooks/notebook_compare_one_week_data.ipynb b/notebooks/notebook_compare_one_week_data.ipynb index 5afaee5..30b0c88 100644 --- a/notebooks/notebook_compare_one_week_data.ipynb +++ b/notebooks/notebook_compare_one_week_data.ipynb @@ -187,12 +187,9 @@ "\n", "Henter opp data lagret i filen, lagd over, og skriver ut lesbart ved hjelp av pandas\n", "\n", - "Har har vi laget en funksjon som henter ut dataene for ønsket sted, og gjør endringer vi ønsker skal bli gjort for dataen for begge steder som:\n", - "- fjerner 'weather' kolonnen, som inneholder metadata\n", - "- setter tiden som index\n", - "- normaliserer, slik at det er enklere å lese all dataen\n", + "Vi importerer funksjonen `extract_city_df` som fjerner uønskede kolonner, og returnerer dataen mer lesbart.\n", " \n", - "Vi sjekker også at vi har data for stedene, altså at funskjonen funker, før den eventuelt viser dataen for stedene." + "Vi lagrer dataen for begge byene, men legger til kolonne 'city' for å lagre id og 'city_name' for å lagre stedsnavn. Før vi slår sammen begge dataene til en dataframe, for å lettere bruke Pandas SQL for å hente ut ønsket data for begge stedene." 
] }, { @@ -204,6 +201,7 @@ "source": [ "import pandas as pd\n", "import json\n", + "from pandasql import sqldf\n", "\n", "file_path = f'../data/output_sammenligning_uke/data_{filename}.json'\n", "\n", @@ -213,23 +211,25 @@ "with open(file_path, 'r') as f:\n", " all_city_data = json.load(f)\n", "\n", - "# Separate variables for each city\n", + "# Extract and label each city, with id and name\n", "city_1_df = extract_city_df(all_city_data.get('city_1'))\n", + "city_1_df['city'] = 'city_1'\n", + "city_1_df['city_name'] = city_1\n", + "\n", "city_2_df = extract_city_df(all_city_data.get('city_2'))\n", + "city_2_df['city'] = 'city_2'\n", + "city_2_df['city_name'] = city_2\n", + "\n", + "# Concat both city df into one, to be able to use pandas sql\n", + "both_cities_df = pd.concat([city_1_df, city_2_df])\n", + "\n", + "# Pandas SQL adds more values to dt, example \"yyyy-mm-dd hh:mm:ss:xxxxx\", so to delete the extra x´s we reset the index\n", + "both_cities_df = both_cities_df.reset_index()\n", + "# And add dt with the wanted format\n", + "both_cities_df['dt'] = both_cities_df['dt'].dt.strftime('%Y-%m-%d %H:%M:%S')\n", "\n", - "# Checks if the data is not empty, aka there are values\n", - "if city_1_df is not None:\n", - " # Prints the city name\n", - " print(f\"{city_1} data:\")\n", - " # Display the dataframe readable\n", - " display(city_1_df)\n", - "\n", - "# Checks if the data is not empty, aka there are values\n", - "if city_2_df is not None:\n", - " # Prints the city name\n", - " print(f\"{city_2} data:\")\n", - " # Display the dataframe readable\n", - " display(city_2_df)" + "# Display all the values from both cities, with city 1 first, then city 2\n", + "display(sqldf('''SELECT * FROM both_cities_df'''))" ] }, { @@ -238,9 +238,9 @@ "metadata": {}, "source": [ "### Viser temperaturen\n", - "Regner ut gjennomsnitts-temperatur ved hjelp av innebygde funksjoner. 
Finner også høyeste og laveste målte temperatur for begge steder.\n", + "Vi bruker pandas SQL for å hente ut temperaturen for begge stedene, og lagrer dem i en tabell. Ved hjelp av 'pivot', en innebygd funksjon for å rotere tabeller, setter vi den opp for bedre lesbarhet, ved å sette begge byene ved siden av hverandre og 'dt' som index. Dette gjør det lettere å sammenligne temperaturen for begge stedene til samme tid.\n", "\n", - "Legger dataen inn i en dataframe for å vise de i en ryddigere og mer lettlest tabell." + "Ved hjelp av en pandas SQL-setning kan vi hente og lagre gjennomsnittlig, høyeste og laveste målte temperatur for begge stedene. Senere kan vi bare skrive en SELECT-setning mot denne variabelen, for eksempelvis å hente ut gjennomsnittsdata for sted 1." ] }, { @@ -250,38 +250,31 @@ "metadata": {}, "outputs": [], "source": [ - "# Stores the temperature values of both cities\n", - "temp_city_1 = city_1_df['main.temp']\n", - "temp_city_2 = city_2_df['main.temp']\n", - "\n", - "# Find the mean temperature in both citites\n", - "temp_mean_city_1 = temp_city_1.mean().round(2)\n", - "temp_mean_city_2 = temp_city_2.mean().round(2)\n", - "\n", - "# Find the highest temperature in both cities\n", - "max_temp_city_1 = city_1_df['main.temp'].max().round(2)\n", - "max_temp_city_2 = city_2_df['main.temp'].max().round(2)\n", - "\n", - "# Find the lowest tempeartues in both cities\n", - "min_temp_city_1 = city_1_df['main.temp'].min().round(2)\n", - "min_temp_city_2 = city_2_df['main.temp'].min().round(2)\n", - "\n", - "# Stores the values of both city in a list\n", - "city_names = [city_1, city_2]\n", - "mean_temp = [temp_mean_city_1, temp_mean_city_2]\n", - "max_temp = [max_temp_city_1, max_temp_city_2]\n", - "min_temp = [min_temp_city_1, min_temp_city_2]\n", - "\n", - "# Creates dataframe of the tempvalues, to display it more readable\n", - "df_temp_cities = pd.DataFrame({\n", - " \"City\": city_names,\n", - " \"Mean Temperature (°C)\": mean_temp,\n", - " \"Highest 
Temperature (°C)\": max_temp,\n", - " \"Lowest Temperature (°C)\": min_temp\n", - "})\n", - "\n", - "# Display the dataframe with the values of both cities\n", - "display(df_temp_cities)\n" + "# Stores the `main.temp` from both cities using pandas sql\n", + "temp_data = sqldf('''\n", + " SELECT dt, city_name, `main.temp` as temp\n", + " FROM both_cities_df\n", + "''')\n", + "\n", + "# Set the dt to index, city for \"header\" and the temp as values for each city each hour\n", + "pivoted_temp = temp_data.pivot(index='dt', columns='city_name', values='temp')\n", + "\n", + "# Rename the columns in the pivoted_temp\n", + "pivoted_temp.columns = [f\"{col}_main_temp\" for col in pivoted_temp.columns]\n", + "\n", + "# Display the final result\n", + "print(f\"Main temperature for {city_1} and {city_2}\")\n", + "display(pivoted_temp)\n", + "\n", + "# Extracts and stores temperature data for each city using pandas sql\n", + "stat_temp_per_city = sqldf('''\n", + " SELECT city, city_name, AVG(`main.temp`) AS avg_temp, MAX(`main.temp`) AS max_temp, MIN(`main.temp`) AS min_temp\n", + " FROM both_cities_df\n", + " GROUP BY city_name\n", + " ''')\n", + "\n", + "# Display the temperature stats for both cities\n", + "display(stat_temp_per_city)" ] }, { @@ -289,11 +282,9 @@ "id": "d0248c3b", "metadata": {}, "source": [ - "### Sjekker nedbørs-kolonnene\n", + "### Sjekker kolonner\n", "\n", - "En ting vi har oppfattet når vi har blitt kjent med dataen er at regn og snø kolonne mangler om det ikke har regnet eller snødd. Vi ønsker senere å plotte oversikt over nedbør, og det er lettere å plotte 0 nedbør enn manglende verdier. \n", - "\n", - "Derfor sjekker vi her om regn/snø kolonne eksisterer i dataen, om den ikke gjør så lager vi en og fyller den med NaN." + "Vi har oppfattet at kolonnene som i størst grad kan mangle er 'rain.1h' og 'snow.1h', og vi skal også bruke disse verdiene senere i plottingen av en graf. 
Derfor har vi lagd funksjonen `ensure_column` som tar imot dataframen og kolonnene vi ønsker å sjekke. Om kolonnene ikke eksisterer blir de laget, og fylt med 'NaN'." ] }, { @@ -310,20 +301,12 @@ "sys.path.append(os.path.abspath(\"../src\"))\n", "\n", "# Now we can import the fucntion from the module\n", - "from my_package.util import ensure_rain_column\n", - "from my_package.util import ensure_snow_column\n", - "\n", - "# Chekcs if there are a rain/snow column in city_1_df, if not the function creates one with NaN values\n", - "city_1_df = ensure_rain_column(city_1_df)\n", - "city_1_df = ensure_snow_column(city_1_df)\n", - "# Displays the dataframe readable\n", - "display(city_1_df)\n", - "\n", - "# Chekcs if there are a rain/snow column in city_2_df, if not the function creates one with NaN values\n", - "city_2_df = ensure_rain_column(city_2_df)\n", - "city_2_df = ensure_snow_column(city_2_df)\n", - "# Displays the dataframe readable\n", - "display(city_2_df)" + "from my_package.util import ensure_column\n", + "\n", + "columns_to_ensure = ['rain.1h', 'snow.1h']\n", + "\n", + "both_cities_df = ensure_column(both_cities_df, columns_to_ensure)\n", + "display(both_cities_df)" ] }, { @@ -334,9 +317,7 @@ "### Sjekk for manglende verdier\n", "Missigno sjekker og visualiserer manglende verdier, slik at det blir lettere å se hvilke kolonner feilen ligger i. \n", "\n", - "Vis the blir \"hull\" i en søyle, tyder the på manglende verdier.\n", - "\n", - "Det er ikke vanlig å legge til tittel, men i dette tilfellet siden vi har to serier vi sjekker data for, har vi brukt matplotlib for å lettere se hvilket sted som har manglende data." + "Hvis det blir \"hull\" i en søyle, tyder det på manglende verdier." 
] }, { @@ -347,22 +328,9 @@ "outputs": [], "source": [ "import missingno as msno\n", - "import matplotlib.pyplot as plt\n", - "\n", - "# Plot missing data matrix fro city_1\n", - "msno.matrix(city_1_df)\n", "\n", - "# Add title using matplotlib\n", - "plt.title(f'Missing Data for {city_1}')\n", - "\n", - "# Plot missing data matrix fro city_2\n", - "msno.matrix(city_2_df)\n", - "\n", - "# Add title using matplotlib\n", - "plt.title(f'Missing Data for {city_2}')\n", - "\n", - "# Show the plot\n", - "plt.show()" + "# Plot missing data matrix for both cities\n", + "msno.matrix(both_cities_df)" ] }, { @@ -372,13 +340,11 @@ "source": [ "### Endre manglende verdier\n", "\n", - "Nå skal vi ha regn/snø kolonner, og kanksje inneholder de NaN.\n", - "\n", - "Så sjekker vi om alle verdiene i en kolonne er 'NaN', isåfall så fjerner vi hele kolonnen. Grunne til at dette ikke inkluderer snø og regn, er fordi vi senere plotter disse verdiene, og da får vi ikke feil om verdien er 0, men vil få om hele kolonnen mangler.\n", + "Om vi har manglende verdier, kan det tyde på to ting: 1. feil i målingen eller 2. det er ingenting å måle. Nr. 2 ser vi spesielt i kolonnene 'rain.1h' og 'snow.1h': om det ikke har snødd/regnet vil verdien bli lagret som 'NaN'. 'NaN' er ikke mulig å plotte, så vi ønsker å lagre verdien som 0 isteden. Vi importerer derfor funksjonen `fill_column_0` som tar inn dataframen og kolonnene der vi ønsker at 'NaN'-verdiene skal byttes ut med 0. \n", "\n", - "Vi sjekker også temperatur-verdiene for begge stedene. Dersom det skulle mangle en verdi bruker vi ffill (forward-fill), altså at den bruker verdien som var før.\n", + "Om det er kolonner hvor alle verdiene er 'NaN' fjerner vi hele kolonnen, dette fordi verdien 0 ikke vil gi oss mer informasjon uansett.\n", "\n", - "Det kan være mangler i andre verdier, men vi endrer ikke disse nå, da vi kun vil bruke data om nedbør og temperatur." 
+ "For andre kolonner som har 'NaN'-verdier velger vi å bruke interpolate, som ser på verdien før og verdien etter og 'gjetter' på den manglende verdien. Vi har lagt til 'limit_direction' for at den også skal kunne gjette på første og siste verdi, og eventuelt andre verdier som bare har en verdi på én side." ] }, { @@ -395,43 +361,20 @@ "sys.path.append(os.path.abspath(\"../src\"))\n", "\n", "# Now we can import the fucntion from the module\n", - "from my_package.util import fill_rain_column\n", - "from my_package.util import fill_snow_column\n", + "from my_package.util import fill_column_0\n", "\n", - "# Fill NaN values with 0, if there are any, for both rain/snow column\n", - "city_1_df = fill_rain_column(city_1_df)\n", - "city_1_df = fill_snow_column(city_1_df)\n", + "columns_to_0 = ['rain.1h', 'snow.1h']\n", "\n", "# Fill NaN values with 0, if there are any, for both rain/snow column\n", - "city_2_df = fill_rain_column(city_2_df)\n", - "city_2_df = fill_snow_column(city_2_df)\n", - "\n", - "# Drops all the columns, if it has 'NaN' value.\n", - "city_1_df = city_1_df.dropna(axis='columns', how='all')\n", - "city_2_df = city_2_df.dropna(axis='columns', how='all')\n", + "both_cities_df = fill_column_0(both_cities_df, columns_to_0)\n", "\n", - "# If temperature is missing, take the same as the one before\n", - "city_1_df['main.temp'] = city_1_df['main.temp'].fillna('obj.ffill()')\n", + "# Drops a column if it contains only 'NaN' values.\n", + "both_cities_df = both_cities_df.dropna(axis='columns', how='all')\n", "\n", - "# Forward fill missing values in the lowest temperature \n", - "city_1_df['main.temp_min'] = city_1_df['main.temp_min'].fillna('obj.ffill()')\n", - "\n", - "# Forward fill missing values in the highest temperature \n", - "city_1_df['main.temp_max'] = city_1_df['main.temp_max'].fillna('obj.ffill()')\n", - "\n", - "\n", - "# If temperature is missing, take the same as the one before\n", - "city_2_df['main.temp'] = city_2_df['main.temp'].fillna('obj.ffill()')\n", - 
"\n", - "# Forward fill missing values in the lowest temperature \n", - "city_2_df['main.temp_min'] = city_2_df['main.temp_min'].fillna('obj.ffill()')\n", - "\n", - "# Forward fill missing values in the highest temperature \n", - "city_2_df['main.temp_max'] = city_2_df['main.temp_max'].fillna('obj.ffill()')\n", + "both_cities_df = both_cities_df.interpolate(method='linear', limit_direction='both')\n", "\n", "# Display both cities readable\n", - "display(city_1_df)\n", - "display(city_2_df)" + "display(both_cities_df)" ] }, { @@ -451,22 +394,9 @@ "outputs": [], "source": [ "import missingno as msno\n", - "import matplotlib.pyplot as plt\n", - "\n", - "# Plot missing data matrix fro city_1\n", - "msno.matrix(city_1_df)\n", - "\n", - "# Add title using matplotlib\n", - "plt.title(f'Missing Data for {city_1}')\n", "\n", - "# Plot missing data matrix fro city_2\n", - "msno.matrix(city_2_df)\n", - "\n", - "# Add title using matplotlib\n", - "plt.title(f'Missing Data for {city_2}')\n", - "\n", - "# Show the plot\n", - "plt.show()" + "# Plot missing data matrix for both cities\n", + "msno.matrix(both_cities_df)" ] }, { @@ -477,7 +407,7 @@ "### Visualisere data i en graf\n", "Ved hjelp av Matplotlib har vi visualiert ønsket data, og ved hjelp av subplot, en modul i matplotlib, kan vi plotte flere verdier i samme graf, og få \"to y-akse\" på samme x-akse. \n", "\n", - "Temperatur for begge stedene finner vi i den øverste grafen, hvor vi også har lagt inn gjennomsnittstemperaturen.\n", + "Temperatur for begge stedene finner vi i den øverste grafen, hvor vi også har lagt inn gjennomsnittstemperaturen. Her har vi bare skrevet nye SELECT setninger som henter ut gjennomsnitt for begge steder fra 'stat_temp_per_city'.\n", "\n", "I grafen under ser vi oversikt over nedbør for begge stedene sammenlignet 'side om side'. 
Vi skiller også mellom snø og regn, og dersom det skulle snø og regne i samme time, vil de bare 'stables' oppå hverandre.\n", "\n", @@ -501,13 +431,22 @@ "# Creates the folder if it does not exist\n", "os.makedirs(output_folder, exist_ok=True)\n", "\n", + "temp_city_1 = sqldf('''SELECT `main.temp` FROM both_cities_df WHERE city = \"city_1\"''')\n", + "temp_city_2 = sqldf('''SELECT `main.temp` FROM both_cities_df WHERE city = \"city_2\"''')\n", + "\n", + "# Because pandas sql returns the result as a dataframe, we extract the single value (first row, first column)\n", + "temp_mean_city_1 = sqldf('''SELECT avg_temp FROM stat_temp_per_city WHERE city = \"city_1\"''').iloc[0, 0]\n", + "temp_mean_city_2 = sqldf('''SELECT avg_temp FROM stat_temp_per_city WHERE city = \"city_2\"''').iloc[0, 0]\n", + "\n", "# Extract rain values for both cities\n", - "city_1_rain = city_1_df['rain.1h']\n", - "city_2_rain = city_2_df['rain.1h']\n", + "# Because pandas sql returns the result as a dataframe, we extract the column itself (all rows, first column)\n", + "city_1_rain = sqldf('''SELECT `rain.1h` FROM both_cities_df WHERE city = \"city_1\"''').iloc[:,0]\n", + "city_2_rain = sqldf('''SELECT `rain.1h` FROM both_cities_df WHERE city = \"city_2\"''').iloc[:,0]\n", "\n", "# Extract snow values for both cities\n", - "city_1_snow = city_1_df['snow.1h']\n", - "city_2_snow = city_2_df['snow.1h']\n", + "# Because pandas sql returns the result as a dataframe, we extract the column itself (all rows, first column)\n", + "city_1_snow = sqldf('''SELECT `snow.1h` FROM both_cities_df WHERE city = \"city_1\"''').iloc[:,0]\n", + "city_2_snow = sqldf('''SELECT `snow.1h` FROM both_cities_df WHERE city = \"city_2\"''').iloc[:,0]\n", "\n", "# x_axis set to the index, which mean the datetime\n", "x_axis = city_1_df.index\n", @@ -519,15 +458,15 @@ "fig, (ax1, ax3) = plt.subplots(2, 1,figsize=(15, 8), sharex=True)\n", "\n", "# Set the title for the whole plot, above the upper plot\n", - 
"ax1.set_title(f'Weather compare for {city_1} and {city_2} ({start_date} - {end_date}) ')\n", + "ax1.set_title(f'Weather compare for {city_1} and {city_2} ({start_date} to {end_date}) ')\n", "\n", "# Plots the temperature for city_1, with mean temperature\n", - "ax1.plot(x_axis, temp_city_1, color='#2E8B57', label=f'Temperature {city_1}')\n", - "ax1.axhline(temp_mean_city_1, color='#2E8B57', linestyle='dashed', alpha=0.7, label=f'Mean Temperature {city_1}')\n", + "ax1.plot(x_axis, temp_city_1, color='#008080', label=f'Temperature {city_1}')\n", + "ax1.axhline(temp_mean_city_1, color='#008080', linestyle='dashed', alpha=0.7, label=f'Mean Temperature {city_1}')\n", "\n", "# Plots the temperature for city_2s, with mean temperature\n", - "ax1.plot(x_axis, temp_city_2, color='#FFD700', label=f'Temperature {city_2}')\n", - "ax1.axhline(temp_mean_city_2, color='#FFD700', linestyle='dashed', alpha=0.7, label=f'Mean Temperature {city_2}')\n", + "ax1.plot(x_axis, temp_city_2, color='#FFA500', label=f'Temperature {city_2}')\n", + "ax1.axhline(temp_mean_city_2, color='#FFA500', linestyle='dashed', alpha=0.7, label=f'Mean Temperature {city_2}')\n", "\n", "# Design the y-axis for Temperature\n", "ax1.set_ylabel(\"Temperature (°C)\")\n", @@ -538,21 +477,17 @@ "# Add marker at 0 temperature\n", "ax1.axhline(y=0, color='black', linewidth=1.5)\n", "\n", - "width = 0.02\n", + "# Adjust the width of bars for better side-by-side comparison, for the precipitation\n", + "width = 0.01 \n", "\n", "# Plot rain/snow bars for both cities, sidy-by-side for better comparrison\n", - "# # Place the snow and rain bars for city_1, with offset for better comparrisson with city_2\n", - "\n", - "# ax3.bar(x_axis_numeric - width/2, city_1_rain, width=width, alpha=0.7, color='#2E8B57', label=f'Rain {city_1}', edgecolor='#FFD700', bottom=city_1_snow)\n", - "# ax3.bar(x_axis_numeric - width/2, city_1_snow, width=width, alpha=0.5, color='#2E8B57', label=f'Snow {city_1}', hatch='////', 
edgecolor='#FFD700')\n", - "\n", - "# # # Place the snow and rain bars for city_2, with offset for better comparrisson with city_1\n", - "# ax3.bar(x_axis_numeric + width/2, city_2_rain, width=width, alpha=0.7, color='#FFD700', label=f'Rain {city_2}', edgecolor=\"#2E8B57\", bottom=city_2_snow)\n", - "# ax3.bar(x_axis_numeric + width/2, city_2_snow, width=width, alpha=0.5, color='#FFD700', label=f'Snow {city_2}', hatch='////', edgecolor=\"#2E8B57\")\n", + "# Place the snow and rain bars for city_1, with offset for better comparison with city_2\n", + "ax3.bar(x_axis_numeric - width/2, city_1_rain, width=width, alpha=0.7, color='#008080', label=f'Rain {city_1}', bottom=city_1_snow, edgecolor='#008080')\n", + "ax3.bar(x_axis_numeric - width/2, city_1_snow, width=width, alpha=0.7, color='#B2D9D9', label=f'Snow {city_1}', hatch='///', edgecolor='#008080')\n", "\n", - "# Concat the snow and rain\n", - "ax3.fill_between(x_axis, city_1_rain + city_1_snow, color='green', alpha=0.3, label=f'{city_1} Total')\n", - "ax3.fill_between(x_axis, city_2_rain + city_2_snow, color='gold', alpha=0.3, label=f'{city_2} Total')\n", + "# Place the snow and rain bars for city_2, with offset for better comparison with city_1\n", + "ax3.bar(x_axis_numeric + width/2, city_2_rain, width=width, alpha=0.7, color='#FFA500', label=f'Rain {city_2}', bottom=city_2_snow, edgecolor='#FFA500')\n", + "ax3.bar(x_axis_numeric + width/2, city_2_snow, width=width, alpha=0.7, color='#FFE5B3', label=f'Snow {city_2}', hatch='///', edgecolor='#FFA500')\n", "\n", "\n", "# Design the y-axis for precipiation\n", @@ -562,8 +497,8 @@ "ax3.grid(axis='x')\n", "\n", "# Customize the x-axis to show ticks for each hour\n", - "ax1.xaxis.set_major_locator(mdates.HourLocator(interval=12)) # Tick marks for every hour\n", - "ax1.xaxis.set_major_formatter(mdates.DateFormatter('%d %b %H')) # Format as \"Day Month Hour:Minute\"\n", + "ax3.xaxis.set_major_locator(mdates.HourLocator(interval=12)) # Tick marks every 12 hours\n", 
+ "ax3.xaxis.set_major_formatter(mdates.DateFormatter('%d %b %H')) # Format as \"Day Month Hour\"\n", "\n", "# Add label description\n", "ax1.legend(loc='upper left')\n",