Part 1: Time series and other simple plots

Part 1: Time series and other simple plots#

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) ## Ignore every FutureWarning from here on so these messages do not clutter the notebook output

from datetime import datetime

import pandas as pd
import seaborn.objects as so
from matplotlib import style

import plotly.express as px

# Lookup table from the short raw column names to descriptive names that
# carry their unit, so plot axes get readable labels without extra work.
col_rename = {
    "tavg": "Temp_Avg_°C",
    "tmax": "Temp_Max_°C",
    "tmin": "Temp_Min_°C",
    "rhum": "Rel_Humidity_%",
    "coco": "Condition",
    "wspd": "Wind_Speed_kmh",
    "prcp": "Precipation_mm",
    "wdir": "Wind_Direction_°",
    "pres": "Air_pressure_hPa",
    "dwpt": "Dew_point_°C",
}

# Build the working table in one chained pipeline:
#   1. read the CSV, parsing "time" as datetime and keeping "wmo"/"station" as strings,
#   2. drop every row that has a missing value in any column,
#   3. switch to the descriptive column names from col_rename,
#   4. derive the continent from the timezone (the text before the first "/").
weather_df = (
    pd.read_csv(
        "global_weather.csv",
        parse_dates=["time"],
        dtype={"wmo": str, "station": str},
    )
    .dropna()
    .rename(columns=col_rename)
    .assign(Continent=lambda df: df["timezone"].str.split("/").str[0])
)

# Let's have a look at the first rows of a single city (a capital)
weather_df[weather_df["name"] == "Berlin / Tempelhof"].head()

	name	country	region	wmo	icao	latitude	longitude	elevation	timezone	hourly_start	...	Rel_Humidity_%	Condition	Temp_Avg_°C	Temp_Min_°C	Temp_Max_°C	Precipation_mm	Wind_Direction_°	Wind_Speed_kmh	Air_pressure_hPa	Continent
1412	Berlin / Tempelhof	DE	BE	10384	EDDI	52.4667	13.4	50.0	Europe/Berlin	1929-08-01	...	80.0	Overcast	7.8	5.5	10.0	0.3	144.0	9.0	1007.8	Europe
1413	Berlin / Tempelhof	DE	BE	10384	EDDI	52.4667	13.4	50.0	Europe/Berlin	1929-08-01	...	68.0	Overcast	9.4	6.3	13.3	0.0	132.0	7.6	1007.3	Europe
1414	Berlin / Tempelhof	DE	BE	10384	EDDI	52.4667	13.4	50.0	Europe/Berlin	1929-08-01	...	64.0	Overcast	10.4	7.0	14.8	0.0	93.0	13.3	1005.8	Europe
1415	Berlin / Tempelhof	DE	BE	10384	EDDI	52.4667	13.4	50.0	Europe/Berlin	1929-08-01	...	74.0	Clear	9.4	6.1	12.8	0.0	71.0	12.6	1010.0	Europe
1416	Berlin / Tempelhof	DE	BE	10384	EDDI	52.4667	13.4	50.0	Europe/Berlin	1929-08-01	...	63.0	Clear	4.6	-0.8	8.0	0.0	66.0	14.4	1017.0	Europe

5 rows × 28 columns

My first seaborn.objects plot#

# Adjust the standard output size of rendered plots to your liking
so.Plot.config.display["scaling"] = 1.0

(
    so.Plot(
        weather_df[weather_df["name"] == "Berlin / Tempelhof"],  # Data layer (required)
        x="time",                                                # Axis mapping layer (required)
        y="Temp_Avg_°C",
    )
    .add(so.Dot())     # Geometry layer (at least one is required)
    .add(so.Line())    # Optional second geometry layer: connect the dots with a line
    .limit(y=(0, 40))  # Coordinate layer (optional): fix the y-range instead of leaving the axis free
    .label(title="Temperature curve for Berlin / Tempelhof")  # Theme and label layers (optional)
)

../_images/c3b710e03fd1c1e0ed8b164732a7a234f696f734f0f174e715290eebf7371176.png

Plotting distributions#

How is the temperature range over all cities?#

# Simulate a missing time point: flag every row dated 2024-03-06 so the plots
# below can leave it out and we can see what happens.
miss_timepoint = weather_df["time"].eq(datetime(2024, 3, 6))
(
    so.Plot(weather_df.loc[~miss_timepoint], x="time", y="Temp_Avg_°C")  # Can you spot the missing time point?
    .add(so.Band())  # Geometry: min-max band
    .add(
        so.Line(),  # Geometry: line
        #!# so.Agg(func=??),  # Statistic: mean
        so.Agg(func="mean"),  # Statistic: mean
    )
    .label(title="Average temperature for all cities")
)

../_images/49d917318fcdb247ad73d4e9040229c6fe80f12dfcf0e3b8c44f99a17d021ebc.png

What are so.Band and so.Agg doing in the background?#

# Reproduce by hand what so.Band and so.Agg compute for the plot above.
# The data is grouped by day once, and named aggregation ties every output
# column to its statistic by name (no positional column relabelling needed).
weather_agg = weather_df.groupby("time")["Temp_Avg_°C"].agg(
    temp_min="min",     # so.Band - min part
    temp_avg="mean",    # so.Line + so.Agg
    temp_max="max",     # so.Band - max part
    nb_cities="count",  # Let's check how many cities are aggregated per day
)

weather_agg.head(n=10)

	temp_min	temp_avg	temp_max	nb_cities
time
2024-03-01	-9.6	18.207801	33.1	141
2024-03-02	-11.0	18.255944	34.8	143
2024-03-03	-15.1	18.426950	33.9	141
2024-03-04	-13.4	18.536170	34.7	141
2024-03-05	-9.3	18.916783	34.4	143
2024-03-06	-9.4	18.864336	35.5	143
2024-03-07	-9.0	18.797183	36.3	142
2024-03-08	-11.1	18.853901	36.7	141
2024-03-09	-11.0	18.795683	35.4	139
2024-03-10	-10.0	19.744526	37.3	137

Another visualization (date is categorical, show data range not as min-max but as errorbar)#

Which visualization makes it easier to spot the missing time point?

(
    so.Plot(weather_df.loc[~miss_timepoint], x="time", y="Temp_Avg_°C")  # Can you spot the missing time point?
    .add(so.Dot(), so.Agg())  # Geometry: dot + statistic: mean
    #!# .add(so.???(), so.Est(errorbar=??))  # Geometry: range + statistic: standard deviation
    .add(so.Range(), so.Est(errorbar="sd"))  # Geometry: range + statistic: standard deviation
    .scale(
        # Allow more ticks on the time axis and use concise date labels for legibility
        x=so.Temporal().tick(upto=21).label(concise=True)
    )
    .limit(y=(0, 40))
    .layout(size=(20, 6))  # Increase the figure size for a better view
    .label(title="Average temperature for all cities")
)

../_images/5608b2d42509ddd3e71e0bccc19f99a3ad9d7a47fa6254d4ab700b0024c5f1ad.png

The problem of Spaghetti Plots#

(
    #!# so.Plot(weather_df, x="time", y=??, color=??)
    so.Plot(
        weather_df,
        x="time",
        y="Wind_Speed_kmh",
        color="Continent",  # One colour, and therefore one line, per continent
    )
    .add(so.Dot(), so.Agg())   # Aggregated value per time point and continent, drawn as dots
    .add(so.Line(), so.Agg())  # Same aggregate, connected by a line
    .label(title="Average wind speed across continents")
)

../_images/057e4ac900261c3b57f69e15a5623cdb1ea01e128847c9a4846e95e6d2442e82.png

Alternatives: (1) Highlighting#

(
    so.Plot(weather_df, x="time", y="Wind_Speed_kmh", color="Continent")
    #!# .add(so.Dot(alpha=0.5), so.Agg(), marker=??)  # set transparency by alpha
    .add(so.Dot(alpha=0.5), so.Agg(), marker="Continent")  # set transparency by alpha
    .add(so.Line(alpha=0.5), so.Agg())
    .scale(
        # Control the colour scale with an explicit continent -> colour mapping.
        # A dict binds each colour to its continent by name, so the highlight
        # does not depend on how many continents the data holds or on the order
        # in which they appear (a tuple would be read as gradient endpoints).
        color={
            #!# continent: ??? if continent == ?? else "gray"
            continent: "red" if continent == "Europe" else "gray"  # Highlight Europe
            for continent in weather_df["Continent"].unique()
        }
    )
    .label(title="Average wind speed across continents")
)

../_images/e283b2ffa274d534765afa0745900d6c4de391a9e3af9f2f8aa4c40bee46bc36.png

Example: relationship of temperature, date and latitude (south - north)#

How to not do it: spaghetti line plot#

(
    #!# so.Plot(weather_df.loc[weather_df["Continent"] == ???], x="time", y="Temp_Avg_°C", color=??)
    so.Plot(
        weather_df[weather_df["Continent"] == "Asia"],  # Only the Asian stations
        x="time",
        y="Temp_Avg_°C",
        color="latitude",  # Latitude is encoded as line/dot colour
    )
    .add(so.Dot())
    .add(so.Line())
    .layout(size=(12, 6))
    .label(title="Latitude effect on temperature")
)

../_images/d0aae363102bb06189102b2a0bc0ae1050b76adf079787c31029bfee73e37331.png

Alternative: Dot array and flip latitude and temperature axis#

(
    so.Plot(
        weather_df[weather_df["Continent"] == "Asia"],
        x="time",
        y="latitude",         # Latitude moves to the y-axis ...
        color="Temp_Avg_°C",  # ... and temperature becomes the colour
    )
    #!# .add(so.??(alpha=0.5, pointsize=10, edgecolor="black"))
    .add(so.Dot(alpha=0.5, pointsize=10, edgecolor="black"))
    .scale(
        # Important: choose an intuitive colormap (https://seaborn.pydata.org/tutorial/color_palettes.html)
        #!# color=so.Continuous(??).tick(upto=10)
        color=so.Continuous("Spectral_r").tick(upto=10)
    )
    .layout(size=(12, 6))
    .label(title="Latitude effect on temperature - dot array")
)

../_images/6a121475ace28a383a99f772d307f0f62c1579b1dcb1937f4b38ef9181b4fe96.png

Alternative: Heatmap-like (via Dash)#

(
    so.Plot(
        weather_df[weather_df["Continent"] == "Asia"],
        x="time",
        y="latitude",
        color="Temp_Avg_°C",
    )
    #!# .add(so.??(alpha=0.8, width=0.8, linewidth=8))
    .add(so.Dash(alpha=0.8, width=0.8, linewidth=8))  # Thick dashes serve as heatmap-like cells
    .scale(color=so.Continuous("Spectral_r").tick(upto=10))
    .layout(size=(12, 6))
    #!# .theme({**style.library[??]})  # Increasing visibility on screens?
    .theme({**style.library["dark_background"]})  # Increasing visibility on screens?
    .label(title="Latitude effect on temperature - heatmap")
)

../_images/c74cdee4aebee743c23e669daa68514dd243e9097a791273c1ae1e6ac9bb0362.png

Example: what to do if you want to plot 3-4 variables?#

interactive 3D plot with plotly#

# Interactive 3D scatter: three variables on the axes, a fourth one as colour
fig = px.scatter_3d(
    weather_df,
    x="Dew_point_°C",
    y="latitude",
    z="Temp_Avg_°C",
    #!# color=??,  # for fourth dimension
    color="Rel_Humidity_%",  # for fourth dimension
    opacity=0.7,
)
fig.update_traces(marker_size=2)  # Make dots smaller
fig.update_layout(margin={"l": 0, "r": 0, "b": 0, "t": 0})  # Reduce figure margins
fig.show()

Part 1: Time series and other simple plots

Contents

Part 1: Time series and other simple plots#

My first seaborn.objects plot#

Plotting distributions#

How is the temperature range over all cities?#

What are so.Band and so.Agg doing in the background?#

Another visualization (date is categorical, show data range not as min-max but as errorbar)#

The problem of Spaghetti Plots#

Alternatives: (1) Highlighting#

Alternatives: (2) Facet#

Example: relationship of temperature, date and latitude (south - north)#

How to not do it: spaghetti line plot#

Alternative: Dot array and flip latitude and temperature axis#

Alternative: Heatmap-like (via Dash)#

On all continents? no problem: facet#

Example: what to do if you want to plot 3-4 variables?#

interactive 3D plot with plotly#