{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Part 2 Data exploration by unsupervised learning" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import warnings\n", "## Ignore FutureWarning messages; the filter is installed before the remaining imports run\n", "warnings.simplefilter(action='ignore', category=FutureWarning)\n", "\n", "import pandas as pd\n", "import seaborn.objects as so\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "from matplotlib import style\n", "\n", "from sklearn.preprocessing import StandardScaler\n", "\n", "from sklearn.manifold import TSNE\n", "from sklearn.decomposition import PCA\n", "\n", "from sklearn.cluster import KMeans" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Map the short column names to descriptive labels (with units) for better axis labels in plots.\n", "# NOTE(review): 'Precipation_mm' looks like a misspelling of 'Precipitation_mm'. It is kept as-is\n", "# because the stored table output uses this exact name and other cells may reference it - confirm before renaming.\n", "col_rename = {\n", "    'tavg': 'Temp_Avg_°C',\n", "    'tmax': 'Temp_Max_°C',\n", "    'tmin': 'Temp_Min_°C',\n", "    'rhum': 'Rel_Humidity_%',\n", "    'coco': 'Condition',\n", "    'wspd': 'Wind_Speed_kmh',\n", "    'prcp': 'Precipation_mm',\n", "    'wdir': 'Wind_Direction_°',\n", "    'pres': 'Air_pressure_hPa',\n", "    'dwpt': 'Dew_point_°C'\n", "}" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "## Reload the data and prepare it in a single chain; no step mutates a frame in place\n", "weather_df = (\n", "    # 'time' is parsed as datetime; 'wmo' and 'station' are read as strings\n", "    pd.read_csv('global_weather.csv', parse_dates=['time'], dtype={'wmo': str, 'station': str})\n", "    # Drop every row that has a missing value in any column\n", "    .dropna()\n", "    # Apply the descriptive column names (returns a new frame instead of using inplace=True)\n", "    .rename(columns=col_rename)\n", "    # Continent label = first '/'-separated part of the timezone column\n", "    .assign(Continent=lambda df: df['timezone'].str.split('/').str[0])\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Dimension reduction by PCA and t-SNE" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | latitude | \n", "longitude | \n", "elevation | \n", "Dew_point_°C | \n", "Rel_Humidity_% | \n", "Temp_Avg_°C | \n", "Temp_Min_°C | \n", "Temp_Max_°C | \n", "Precipation_mm | \n", "Wind_Direction_° | \n", "Wind_Speed_kmh | \n", "Air_pressure_hPa | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "34.0167 | \n", "71.5833 | \n", "359.0 | \n", "11.0 | \n", "100.0 | \n", "12.3 | \n", "9.6 | \n", "13.0 | \n", "5.1 | \n", "334.0 | \n", "7.3 | \n", "1015.4 | \n", "
1 | \n", "34.0167 | \n", "71.5833 | \n", "359.0 | \n", "10.0 | \n", "100.0 | \n", "10.8 | \n", "9.0 | \n", "12.5 | \n", "65.0 | \n", "12.0 | \n", "11.7 | \n", "1005.5 | \n", "
2 | \n", "34.0167 | \n", "71.5833 | \n", "359.0 | \n", "5.0 | \n", "94.0 | \n", "8.9 | \n", "5.5 | \n", "13.0 | \n", "16.0 | \n", "330.0 | \n", "20.4 | \n", "1012.6 | \n", "
3 | \n", "34.0167 | \n", "71.5833 | \n", "359.0 | \n", "3.0 | \n", "54.0 | \n", "11.3 | \n", "4.5 | \n", "18.0 | \n", "0.0 | \n", "270.0 | \n", "14.2 | \n", "1020.1 | \n", "
4 | \n", "34.0167 | \n", "71.5833 | \n", "359.0 | \n", "1.9 | \n", "50.0 | \n", "13.7 | \n", "5.5 | \n", "21.5 | \n", "0.0 | \n", "222.0 | \n", "13.7 | \n", "1019.3 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
3687 | \n", "28.4500 | \n", "-13.8667 | \n", "22.0 | \n", "18.0 | \n", "78.0 | \n", "19.9 | \n", "16.7 | \n", "22.0 | \n", "0.0 | \n", "32.0 | \n", "12.2 | \n", "1018.1 | \n", "
3688 | \n", "28.4500 | \n", "-13.8667 | \n", "22.0 | \n", "15.9 | \n", "57.0 | \n", "23.0 | \n", "18.8 | \n", "28.0 | \n", "0.0 | \n", "0.0 | \n", "12.5 | \n", "1016.2 | \n", "
3689 | \n", "28.4500 | \n", "-13.8667 | \n", "22.0 | \n", "16.9 | \n", "73.0 | \n", "21.9 | \n", "20.0 | \n", "26.0 | \n", "0.0 | \n", "331.0 | \n", "27.7 | \n", "1016.2 | \n", "
3690 | \n", "28.4500 | \n", "-13.8667 | \n", "22.0 | \n", "16.2 | \n", "69.0 | \n", "20.2 | \n", "18.0 | \n", "23.0 | \n", "0.0 | \n", "19.0 | \n", "24.9 | \n", "1017.0 | \n", "
3691 | \n", "28.4500 | \n", "-13.8667 | \n", "22.0 | \n", "11.1 | \n", "47.0 | \n", "20.2 | \n", "17.4 | \n", "24.0 | \n", "0.0 | \n", "6.0 | \n", "29.7 | \n", "1018.5 | \n", "
2825 rows × 12 columns
\n", "