Part 2 Data exploration by unsupervised learning

Contents

Part 2 Data exploration by unsupervised learning#

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) ## suppress annoying deprecation warnings

import pandas as pd
import seaborn.objects as so
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style

from sklearn.preprocessing import StandardScaler

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Short column codes -> descriptive names (including the unit) that read
# well as axis labels and legend titles in the plots.
col_rename = {
    "tavg": "Temp_Avg_°C",
    "tmax": "Temp_Max_°C",
    "tmin": "Temp_Min_°C",
    "rhum": "Rel_Humidity_%",
    "coco": "Condition",
    "wspd": "Wind_Speed_kmh",
    # NOTE(review): "Precipation" is a misspelling of "Precipitation"; it is
    # kept unchanged here because the label is used as a column name.
    "prcp": "Precipation_mm",
    "wdir": "Wind_Direction_°",
    "pres": "Air_pressure_hPa",
    "dwpt": "Dew_point_°C",
}

## Reload data and prepare it in a single chain:
##   1. read the CSV ('time' parsed as datetime, 'wmo'/'station' kept as strings)
##   2. drop every row that has a missing value
##   3. switch to the descriptive column names from col_rename
##   4. derive the continent from the part of 'timezone' before the first '/'
weather_df = (
    pd.read_csv(
        "global_weather.csv",
        parse_dates=["time"],
        dtype={"wmo": str, "station": str},
    )
    .dropna()
    .rename(columns=col_rename)
    .assign(Continent=lambda df: df["timezone"].str.split("/").str[0])
)

Dimension reduction by PCA and t-SNE#

## Preview of the numerical columns; the same selection is used as input for the dimension reduction
weather_df.select_dtypes(include='number') ## Only on numerical attributes of weather data

	latitude	longitude	elevation	Dew_point_°C	Rel_Humidity_%	Temp_Avg_°C	Temp_Min_°C	Temp_Max_°C	Precipation_mm	Wind_Direction_°	Wind_Speed_kmh	Air_pressure_hPa
0	34.0167	71.5833	359.0	11.0	100.0	12.3	9.6	13.0	5.1	334.0	7.3	1015.4
1	34.0167	71.5833	359.0	10.0	100.0	10.8	9.0	12.5	65.0	12.0	11.7	1005.5
2	34.0167	71.5833	359.0	5.0	94.0	8.9	5.5	13.0	16.0	330.0	20.4	1012.6
3	34.0167	71.5833	359.0	3.0	54.0	11.3	4.5	18.0	0.0	270.0	14.2	1020.1
4	34.0167	71.5833	359.0	1.9	50.0	13.7	5.5	21.5	0.0	222.0	13.7	1019.3
...	...	...	...	...	...	...	...	...	...	...	...	...
3687	28.4500	-13.8667	22.0	18.0	78.0	19.9	16.7	22.0	0.0	32.0	12.2	1018.1
3688	28.4500	-13.8667	22.0	15.9	57.0	23.0	18.8	28.0	0.0	0.0	12.5	1016.2
3689	28.4500	-13.8667	22.0	16.9	73.0	21.9	20.0	26.0	0.0	331.0	27.7	1016.2
3690	28.4500	-13.8667	22.0	16.2	69.0	20.2	18.0	23.0	0.0	19.0	24.9	1017.0
3691	28.4500	-13.8667	22.0	11.1	47.0	20.2	17.4	24.0	0.0	6.0	29.7	1018.5

2825 rows × 12 columns

## Features need to be scaled for dimension reduction; only numerical features can be used.
## The scaled matrix is computed once and shared by t-SNE and PCA instead of being refitted per method.
scaler = StandardScaler().set_output(transform="pandas")
weather_scaled = scaler.fit_transform(weather_df.select_dtypes(include='number'))

## Define and perform dimension reduction (two components per method).
## Lines marked with `#!#` appear to be the fill-in-the-blank variant of the line right below them.
#!# weather_d2_tsne = TSNE(n_components=?, perplexity=100, random_state=42).fit_transform(weather_scaled)
weather_d2_tsne = TSNE(n_components=2, perplexity=100, random_state=42).fit_transform(weather_scaled)
#!# weather_d2_pca = PCA(n_components=?).fit_transform(weather_scaled)
weather_d2_pca = PCA(n_components=2).fit_transform(weather_scaled)

## Add both 2D embeddings as additional columns for easy plotting
weather_df = weather_df.assign(
    TSNE1=weather_d2_tsne[:, 0],
    TSNE2=weather_d2_tsne[:, 1],
    PC1=weather_d2_pca[:, 0],
    PC2=weather_d2_pca[:, 1],
)

Scatter plots#

(1) simple plot with no additional information#

## Both embeddings side by side, without mapping any further variable
(
    so.Plot(weather_df)
    #!# .pair(x=["TSNE1","PC1"], y=["TSNE2","PC2"], cross=??) # pair as cousin of facet (sharing of axes)
    .pair(x=["TSNE1", "PC1"], y=["TSNE2", "PC2"], cross=False)  # cross=False zips x and y: (TSNE1, TSNE2) and (PC1, PC2)
    .add(so.Dot())
    .layout(size=(12, 6))
)

../_images/4965ed9afbe141f86fc03bd84489bd2d49f74cd911be8f5342985f540202043a.png

(2) After reducing to two dimensions, we can overlay numerical and categorical features to explore potential clusters#

## Same pair of panels as before, now with the 'time' column mapped to color
(
    #!# so.Plot(weather_df, color=??)
    so.Plot(weather_df, color="time")
    .pair(x=["TSNE1", "PC1"], y=["TSNE2", "PC2"], cross=False)
    .add(so.Dot())
    .layout(size=(12, 6))
)

../_images/7bd4835e30113846b32a0b2b0bf47ddc5984be836006a2b1b8850e00577b4026.png

(3) How about atmospheric pressure?#

Interestingly, the intuitive color assignment for low/high pressure seems to be reversed in the US (https://as1.ftcdn.net/v2/jpg/02/74/36/92/1000_F_274369230_Kvnl7UddxtOPCV0CDMwX6I0y6m1GezDo.jpg) compared to Germany (https://bluewhale.ch/wp-content/uploads/2020/02/Profi-Isobaren-Wetter-Karte.jpg)

## Air pressure mapped to a diverging blue-white-red colormap on a dark background
(
    so.Plot(weather_df, color="Air_pressure_hPa")
    .pair(x=["TSNE1", "PC1"], y=["TSNE2", "PC2"], cross=False)
    .add(so.Dot())
    .layout(size=(12, 6))
    #!# .scale(color=so.Continuous(??).tick(upto=10)) ## Again important to choose an intuitive colormap.
    .scale(color=so.Continuous("bwr").tick(upto=10))  ## Again important to choose an intuitive colormap.
    .theme(dict(style.library["dark_background"]))  ## If midpoint is white, a dark background is desirable
)

../_images/81a6c433c4215b37105ccf95d3916347ad3c4c7ba69c113b0474794c898c8351.png

Tackling the overplotting issue in scatter plots#

Strategies:

smaller markers
transparency to show density
marker edges

## t-SNE embedding colored by average temperature, drawn with small, semi-transparent, edged markers
(
    so.Plot(weather_df, x="TSNE1", y="TSNE2", color="Temp_Avg_°C")
    .scale(color=so.Continuous("Spectral_r").tick(upto=7))  # "_r": reversed variant of the colormap
    .layout(size=(6, 6))
    #!# .add(so.Dot(alpha=??, pointsize=??, edgecolor="black")) # adjusting pointsize, introduce transparency, edges increase visibility (dark, bright)
    .add(so.Dot(alpha=0.3, pointsize=4, edgecolor="black"))  # adjusting pointsize, introduce transparency, edges increase visibility (dark, bright)
)

../_images/fb7ee3d974e33cbe6e178e6669624b91b4f57da1f115cdcc91beda73dc8c15ca.png

Alternative: 2D density and histogram plots (unfortunately not available via the seaborn.objects API)#

## 2 x 4 grid with shared axes: top row uses all rows, bottom row only the rows of one continent
f, axes = plt.subplots(2, 4, figsize=(18, 6), sharex=True, sharey=True)

sns.scatterplot(data=weather_df, x="TSNE1", y="TSNE2", ax=axes[0, 0], alpha=0.05)  ## Scatter plot with very low alpha
sns.histplot(data=weather_df, x="TSNE1", y="TSNE2", ax=axes[0, 1])  ## Tiles showing density (low resolution)
sns.kdeplot(data=weather_df, x="TSNE1", y="TSNE2", ax=axes[0, 2])  ## Isoclines of density (high resolution)
#!# sns.kdeplot(data=weather_df, x="TSNE1", y="TSNE2", ax=axes[0, 3], fill=True, thresh=0, levels=??, cmap="Blues")  ## Filled density (high resolution)
sns.kdeplot(data=weather_df, x="TSNE1", y="TSNE2", ax=axes[0, 3], fill=True, thresh=0, levels=10, cmap="Blues")  ## Filled density (high resolution)
## try out high number of levels -> what is maximum number you can discriminate before blurring out?


## For a single continent for comparison.
## The subset is selected once with a boolean mask: DataFrame.where() would keep every row and only
## replace the non-matching ones with NaN, i.e. it masks instead of filters.
europe_df = weather_df[weather_df['Continent'] == "Europe"]

sns.scatterplot(data=europe_df, x="TSNE1", y="TSNE2", ax=axes[1, 0], alpha=0.05)
sns.histplot(data=europe_df, x="TSNE1", y="TSNE2", ax=axes[1, 1])
sns.kdeplot(data=europe_df, x="TSNE1", y="TSNE2", ax=axes[1, 2])
sns.kdeplot(data=europe_df, x="TSNE1", y="TSNE2", ax=axes[1, 3], fill=True, thresh=0, levels=10, cmap="Blues")

<Axes: xlabel='TSNE1', ylabel='TSNE2'>

../_images/fc467d8213c33c1a6d278f4419dde5812d74adb553644997cc9bfc517f46198a.png

Analysis of clusters#

## k-means on the standardized 2D t-SNE coordinates.
## NOTE(review): n_init is not set, so the scikit-learn default applies; that default differs
## between scikit-learn versions — consider pinning it if labels must be reproducible across versions.
tsne_scaled = StandardScaler().fit_transform(weather_d2_tsne)
#!# clustering = KMeans(n_clusters=??, random_state=42).fit(tsne_scaled)
clustering = KMeans(n_clusters=8, random_state=42).fit(tsne_scaled)

clustering.labels_  ## Cluster label assigned to each fitted sample

array([0, 3, 0, ..., 2, 7, 7], dtype=int32)

## Add Cluster labels to dataframe for plotting; they are stored as strings (e.g. "0")
cluster_labels = list(map(str, clustering.labels_))
weather_df = weather_df.assign(Cluster=cluster_labels)

(
    so.Plot(weather_df, x="TSNE1", y="TSNE2", color="Cluster", marker="Continent")  # Try out visualize cluster assignment by color and marker
    .scale(color="pastel")  # Important to use a colormap for categorical data
    .add(so.Dot(alpha=0.8, pointsize=6, edgecolor="black"))
)

../_images/d91e6ec96a6e182241f2c4c3bcb257f33aecaf2f4d658860d5231d92ea4e7ab8.png

Let’s have a look at cluster 0 to see whether it is enriched with European cities#

## Only the rows assigned to cluster "0", colored by continent
(
    #!# so.Plot(weather_df[weather_df["Cluster"] == "0"], x="TSNE1", y="TSNE2", ??="Continent")  # try marker and color
    so.Plot(weather_df[weather_df["Cluster"] == "0"], x="TSNE1", y="TSNE2", color="Continent")  # try marker and color
    .scale(color="Set2")
    .add(so.Dot(alpha=0.5, pointsize=10, edgecolor="black"))
)

../_images/4819f661960fb20839ebc083e6ad7d144c0dc1cccb955890d3427fdc88f00292.png

Homework and discussion:#

Looking at the clustering in the t-SNE plots: how can you check and plot the assignment of cities to clusters? Are cities always in the same cluster?#

What is driving cluster assignment? Regions, day/night, weather …#