Part 1: Time series and other simple plots#

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) ## suppress annoying deprecation warnings

from datetime import datetime

import pandas as pd
import seaborn.objects as so
from matplotlib import style

import plotly.express as px
# Renaming columns for better axis labels in plots
col_rename = {
	'tavg': 'Temp_Avg_°C',
	'tmax': 'Temp_Max_°C',
	'tmin': 'Temp_Min_°C',
	'rhum': 'Rel_Humidity_%',
	'coco': 'Condition',
	'wspd': 'Wind_Speed_kmh',
	'prcp': 'Precipation_mm',
	'wdir': 'Wind_Direction_°',
	'pres': 'Air_pressure_hPa',
	'dwpt': 'Dew_point_°C'
}
weather_df = pd.read_csv('global_weather.csv', parse_dates=['time'], dtype={'wmo':str, 'station':str}) 
weather_df = weather_df.dropna()

weather_df.rename(columns=col_rename, inplace=True)
weather_df = weather_df.assign(Continent = weather_df["timezone"].str.split('/').str[0])  ## Get continent from timezone column

weather_df.loc[weather_df["name"] == "Berlin / Tempelhof",:].head()  ## Let's have a look at a single city (capital)
name country region wmo icao latitude longitude elevation timezone hourly_start ... Rel_Humidity_% Condition Temp_Avg_°C Temp_Min_°C Temp_Max_°C Precipation_mm Wind_Direction_° Wind_Speed_kmh Air_pressure_hPa Continent
1412 Berlin / Tempelhof DE BE 10384 EDDI 52.4667 13.4 50.0 Europe/Berlin 1929-08-01 ... 80.0 Overcast 7.8 5.5 10.0 0.3 144.0 9.0 1007.8 Europe
1413 Berlin / Tempelhof DE BE 10384 EDDI 52.4667 13.4 50.0 Europe/Berlin 1929-08-01 ... 68.0 Overcast 9.4 6.3 13.3 0.0 132.0 7.6 1007.3 Europe
1414 Berlin / Tempelhof DE BE 10384 EDDI 52.4667 13.4 50.0 Europe/Berlin 1929-08-01 ... 64.0 Overcast 10.4 7.0 14.8 0.0 93.0 13.3 1005.8 Europe
1415 Berlin / Tempelhof DE BE 10384 EDDI 52.4667 13.4 50.0 Europe/Berlin 1929-08-01 ... 74.0 Clear 9.4 6.1 12.8 0.0 71.0 12.6 1010.0 Europe
1416 Berlin / Tempelhof DE BE 10384 EDDI 52.4667 13.4 50.0 Europe/Berlin 1929-08-01 ... 63.0 Clear 4.6 -0.8 8.0 0.0 66.0 14.4 1017.0 Europe

5 rows × 28 columns

My first seaborn.objects plot#

so.Plot.config.display["scaling"] = 1.0 ## Adjust standard output size to your liking 

(
    so.Plot(
		weather_df.loc[weather_df["name"] == "Berlin / Tempelhof",:],   ## Data layer (required)
		x="time", y="Temp_Avg_°C")                          			## Axis mapping layer (required)
    .add(so.Dot())                                                  	## Geometry layer (at least one required)
	.add(so.Line())                                                 	# Connect with lines (optional geometry layer)
	.limit(y=(0, 40))                                               	# Coordinate layer (optional: problem avoid free y-axis)
    .label(title="Temperature curve for Berlin / Tempelhof")            # Theme and label layers (optional)
)
../_images/c3b710e03fd1c1e0ed8b164732a7a234f696f734f0f174e715290eebf7371176.png

Plotting distributions#

How is the temperature range over all cities?#

miss_timepoint= weather_df.time == datetime(2024, 3, 6) ## Simulate a missing timepoint and see what happens in plots
(
	so.Plot(weather_df[~miss_timepoint], x="time", y="Temp_Avg_°C")  ## Can you spot the missing time point?
    .add(so.Band()) 										# Geometry: Min-Max Band
	.add(so.Line(), 										# Geometry: Line
  	#!#    so.Agg(func=??)    								# Statistic: Mean
	   so.Agg(func='mean')    								# Statistic: Mean
	)
    .label(title="Average temperature for all cities")
)
../_images/49d917318fcdb247ad73d4e9040229c6fe80f12dfcf0e3b8c44f99a17d021ebc.png

What are so.Band and so.Agg doing in the backgorund?#

weather_agg = pd.concat(
	[weather_df.loc[:,["time","Temp_Avg_°C"]].groupby(["time"]).min(), 	## so.Band - min-part
	weather_df.loc[:,["time","Temp_Avg_°C"]].groupby(["time"]).mean(), 	## so.Line, so.Agg
	weather_df.loc[:,["time","Temp_Avg_°C"]].groupby(["time"]).max(),	## so.Band - max-part
	weather_df.loc[:,["time","Temp_Avg_°C"]].groupby(["time"]).count()	## Let's check how many cities are aggregated
	], 
	axis=1
)
weather_agg.columns = ["temp_min","temp_avg", "temp_max","nb_cities"]

weather_agg.head(n=10)
temp_min temp_avg temp_max nb_cities
time
2024-03-01 -9.6 18.207801 33.1 141
2024-03-02 -11.0 18.255944 34.8 143
2024-03-03 -15.1 18.426950 33.9 141
2024-03-04 -13.4 18.536170 34.7 141
2024-03-05 -9.3 18.916783 34.4 143
2024-03-06 -9.4 18.864336 35.5 143
2024-03-07 -9.0 18.797183 36.3 142
2024-03-08 -11.1 18.853901 36.7 141
2024-03-09 -11.0 18.795683 35.4 139
2024-03-10 -10.0 19.744526 37.3 137

Another visualization (date is categorical, show data range not as min-max but as errorbar)#

Which visualization is better to see missing timepoint?

(
so.Plot(weather_df[~miss_timepoint], x="time", y="Temp_Avg_°C")	## Can you spot the missing time point?
	.add(so.Dot(), so.Agg()) 									# Geometry: Dot + Statistic: Mean
	#!# .add(so.???(), so.Est(errorbar=??)) 					# Geometry: Range + Statistic: Standard deviation
	.add(so.Range(), so.Est(errorbar="sd")) 					# Geometry: Range + Statistic: Standard deviation
	.limit(y=(0, 40))
	.layout(size=(20, 6)) 										# Increase the figure size for a better view
	.scale(
		x=so.Temporal().tick(upto=21).label(concise=True) 		# Increase the tick size and adjust tick labels for legibility
	)
	.label(title="Average temperature for all cities")
)
../_images/5608b2d42509ddd3e71e0bccc19f99a3ad9d7a47fa6254d4ab700b0024c5f1ad.png

The problem of Spaghetti Plots#

(
    #!# so.Plot(weather_df, x="time", y=??, color=??)
    so.Plot(weather_df, x="time", y="Wind_Speed_kmh", color="Continent")
		.add(so.Dot(), so.Agg())
		.add(so.Line(), so.Agg())
		.label(title="Average wind speed across continents")
)
../_images/057e4ac900261c3b57f69e15a5623cdb1ea01e128847c9a4846e95e6d2442e82.png

Alternatives: (1) Highlighting#

(
    so.Plot(weather_df, x="time", y="Wind_Speed_kmh", color="Continent")
		#!# .add(so.Dot(alpha=0.5), so.Agg(), marker=??)		# set transparency by alpha 
		.add(so.Dot(alpha=0.5), so.Agg(), marker="Continent")		# set transparency by alpha 
		.add(so.Line(alpha=0.5), so.Agg() )
		.scale(color=(							# Control the color scale
			"gray",								# Asia
		#!#	???,								# Highlight Europe
			"red",								# Highlight Europe
			"gray",								# Africa
			"gray",								# Pacific
			"gray",								# America
			"gray",								# Australia
			"gray",								# Atlantic
			"gray"								# Indian
			))
		.label(title="Average wind speed across continents")
)
../_images/e283b2ffa274d534765afa0745900d6c4de391a9e3af9f2f8aa4c40bee46bc36.png

Alternatives: (2) Facet#

(
    so.Plot(weather_df, x="time", y="Wind_Speed_kmh")
		.add(so.Dot(), so.Agg())
		.add(so.Line(), so.Agg())
		.add(so.Band())
		#!# .facet(??, wrap=4)			# All you need for subplots
		.facet("Continent", wrap=4)		# All you need for subplots
		.layout(size=(25, 8))
)
../_images/3fce4a96bb73047e8498bbde3a53a435c73aa09024907bcbdc2686ede6b397dc.png

Example: relationship of temperature, date and latitude (south - north)#

How to not do it: spaghetti line plot#

(
#!#	so.Plot(weather_df.loc[weather_df["Continent"] == ???], x="time", y="Temp_Avg_°C", color=??)
	so.Plot(weather_df.loc[weather_df["Continent"] == "Asia"], x="time", y="Temp_Avg_°C", color="latitude")
    .add(so.Dot())
	.add(so.Line())
    .layout(size=(12, 6))
    .label(title="Latitude effect on temperature")
)
../_images/d0aae363102bb06189102b2a0bc0ae1050b76adf079787c31029bfee73e37331.png

Alternative: Dot array and flip latitude and temperature axis#

(
	so.Plot(weather_df.loc[weather_df["Continent"] == "Asia"], x="time", y="latitude", color="Temp_Avg_°C")
    #!# .add(so.??(alpha=0.5, pointsize=10, edgecolor="black"))
    .add(so.Dot(alpha=0.5, pointsize=10, edgecolor="black"))
	.scale(
	#!# color = so.Continuous(??).tick(upto=10) # Important: choosing an intuitive colormap (https://seaborn.pydata.org/tutorial/color_palettes.html)
        color = so.Continuous("Spectral_r").tick(upto=10) # Important: choosing an intuitive colormap (https://seaborn.pydata.org/tutorial/color_palettes.html)
        ) 
    .layout(size=(12, 6))
    .label(title="Latitude effect on temperature - dot array")
)
../_images/6a121475ace28a383a99f772d307f0f62c1579b1dcb1937f4b38ef9181b4fe96.png

Alternative: Heatmap-like (via Dash)#

(
    so.Plot(weather_df.loc[weather_df["Continent"] == "Asia"], x="time", y="latitude", color="Temp_Avg_°C")
	#!# .add(so.??(alpha=0.8, width=0.8,  linewidth=8))
	.add(so.Dash(alpha=0.8, width=0.8,  linewidth=8))
	.scale(
        color=so.Continuous("Spectral_r").tick(upto=10)
        )
    .layout(size=(12, 6))
#!#	.theme({**style.library[??]}) # Increasing visibility on screens?
    .theme({**style.library["dark_background"]}) # Increasing visibility on screens?
    .label(title="Latitude effect on temperature - heatmap")
)
../_images/c74cdee4aebee743c23e669daa68514dd243e9097a791273c1ae1e6ac9bb0362.png

On all continents? no problem: facet#

(
    so.Plot(weather_df, x="time", y="latitude", color="Temp_Avg_°C")
    .add(so.Dash(alpha=0.8, width=0.8,  linewidth=4))
	.scale(
        color=so.Continuous("Spectral_r").tick(upto=10)
        )
	#!# .facet(??, wrap=4).layout(size=(25, 8))
	.facet("Continent", wrap=4).layout(size=(25, 8))
	.theme({**style.library["dark_background"]}) 
)
../_images/00517b6c253002992a1ecb884f1526133b89af02f5e2ec1cb3d2b474924a1999.png

Example: what to do if you want to plot 3-4 variables?#

interactive 3D plot with plotly#

fig = px.scatter_3d(weather_df, x="Dew_point_°C", y="latitude", z="Temp_Avg_°C",
                    #!# color=??, ##  for fourth dimension
                    color="Rel_Humidity_%", ##  for fourth dimension
					opacity=0.7) 
fig.update_traces(marker_size = 2)  # Make dots smaller
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0)) # Reduce figure margins
fig.show()