{ "cells": [ { "cell_type": "markdown", "id": "d9d7701f-4828-45fe-aeef-27db97bd3b8a", "metadata": {}, "source": [ "## Preprocessing data (Python version)\n", "\n", "This notebook provides some examples of how the functions in the `preprocessing.py` module can be used. " ] }, { "cell_type": "code", "execution_count": 1, "id": "f9f19ba1", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from epigraphhub.analysis.preprocessing import *" ] }, { "cell_type": "markdown", "id": "3c8d511a", "metadata": {}, "source": [ "The functions in the `preprocessing.py` module transform tabular data into the formats accepted by ML models (tabular data with lagged values) and by neural network models (3D array data and multiple outputs).\n", "\n", "In this tutorial, we will use the data saved at `./data/data_GE.csv`. This dataset contains the number of COVID-19 tests, cases, and hospitalizations reported in some cantons of Switzerland." ] }, { "cell_type": "code", "execution_count": 2, "id": "3307e375", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", " | test_FR | \n", "diff_test_FR | \n", "diff_2_test_FR | \n", "test_NE | \n", "diff_test_NE | \n", "diff_2_test_NE | \n", "test_TI | \n", "diff_test_TI | \n", "diff_2_test_TI | \n", "test_VD | \n", "... | \n", "hosp_NE | \n", "diff_hosp_NE | \n", "diff_2_hosp_NE | \n", "hosp_FR | \n", "diff_hosp_FR | \n", "diff_2_hosp_FR | \n", "hosp_GE | \n", "diff_hosp_GE | \n", "diff_2_hosp_GE | \n", "vac_all | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
datum | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
2020-03-01 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.142857 | \n", "0.000000 | \n", "0.000000 | \n", "0.428571 | \n", "0.142857 | \n", "0.285714 | \n", "0.428571 | \n", "0.000000 | \n", "0.000000 | \n", "0.0 | \n", "
2020-03-02 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.285714 | \n", "0.142857 | \n", "0.142857 | \n", "0.857143 | \n", "0.428571 | \n", "0.571429 | \n", "0.428571 | \n", "0.000000 | \n", "0.142857 | \n", "0.0 | \n", "
2020-03-03 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.428571 | \n", "0.142857 | \n", "0.285714 | \n", "0.857143 | \n", "0.000000 | \n", "0.428571 | \n", "0.428571 | \n", "0.000000 | \n", "0.000000 | \n", "0.0 | \n", "
2020-03-04 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.285714 | \n", "-0.142857 | \n", "0.000000 | \n", "0.714286 | \n", "-0.142857 | \n", "-0.142857 | \n", "0.571429 | \n", "0.142857 | \n", "0.142857 | \n", "0.0 | \n", "
2020-03-05 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.428571 | \n", "0.142857 | \n", "0.000000 | \n", "1.000000 | \n", "0.285714 | \n", "0.142857 | \n", "0.857143 | \n", "0.285714 | \n", "0.428571 | \n", "0.0 | \n", "
5 rows × 64 columns
\n", "\n", " | test_FR | \n", "test_FR_lag1 | \n", "test_FR_lag2 | \n", "test_FR_lag3 | \n", "diff_test_FR | \n", "diff_test_FR_lag1 | \n", "diff_test_FR_lag2 | \n", "diff_test_FR_lag3 | \n", "diff_2_test_FR | \n", "diff_2_test_FR_lag1 | \n", "... | \n", "diff_hosp_GE_lag2 | \n", "diff_hosp_GE_lag3 | \n", "diff_2_hosp_GE | \n", "diff_2_hosp_GE_lag1 | \n", "diff_2_hosp_GE_lag2 | \n", "diff_2_hosp_GE_lag3 | \n", "vac_all | \n", "vac_all_lag1 | \n", "vac_all_lag2 | \n", "vac_all_lag3 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
datum | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
2020-03-04 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.000000 | \n", "0.000000 | \n", "0.142857 | \n", "0.000000 | \n", "0.142857 | \n", "0.000000 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
2020-03-05 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.000000 | \n", "0.000000 | \n", "0.428571 | \n", "0.142857 | \n", "0.000000 | \n", "0.142857 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
2020-03-06 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.142857 | \n", "0.000000 | \n", "0.428571 | \n", "0.428571 | \n", "0.142857 | \n", "0.000000 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
2020-03-07 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.285714 | \n", "0.142857 | \n", "0.285714 | \n", "0.428571 | \n", "0.428571 | \n", "0.142857 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
2020-03-08 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.142857 | \n", "0.285714 | \n", "0.000000 | \n", "0.285714 | \n", "0.428571 | \n", "0.428571 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
5 rows × 256 columns
\n", "\n", " | test_FR | \n", "test_FR_lag1 | \n", "test_FR_lag2 | \n", "test_FR_lag3 | \n", "diff_test_FR | \n", "diff_test_FR_lag1 | \n", "diff_test_FR_lag2 | \n", "diff_test_FR_lag3 | \n", "diff_2_test_FR | \n", "diff_2_test_FR_lag1 | \n", "... | \n", "diff_hosp_GE_lag2 | \n", "diff_hosp_GE_lag3 | \n", "diff_2_hosp_GE | \n", "diff_2_hosp_GE_lag1 | \n", "diff_2_hosp_GE_lag2 | \n", "diff_2_hosp_GE_lag3 | \n", "vac_all | \n", "vac_all_lag1 | \n", "vac_all_lag2 | \n", "vac_all_lag3 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
datum | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
2021-01-01 | \n", "542.285714 | \n", "550.857143 | \n", "586.000000 | \n", "650.428571 | \n", "-8.571429 | \n", "-35.142857 | \n", "-64.428571 | \n", "-32.714286 | \n", "-43.714286 | \n", "-99.571429 | \n", "... | \n", "0.000000 | \n", "0.285714 | \n", "-0.142857 | \n", "-0.285714 | \n", "0.285714 | \n", "0.714286 | \n", "0.044286 | \n", "0.035714 | \n", "0.027143 | \n", "0.017143 | \n", "
2021-01-02 | \n", "536.571429 | \n", "542.285714 | \n", "550.857143 | \n", "586.000000 | \n", "-5.714286 | \n", "-8.571429 | \n", "-35.142857 | \n", "-64.428571 | \n", "-14.285714 | \n", "-43.714286 | \n", "... | \n", "-0.285714 | \n", "0.000000 | \n", "0.571429 | \n", "-0.142857 | \n", "-0.285714 | \n", "0.285714 | \n", "0.052857 | \n", "0.044286 | \n", "0.035714 | \n", "0.027143 | \n", "
2021-01-03 | \n", "530.000000 | \n", "536.571429 | \n", "542.285714 | \n", "550.857143 | \n", "-6.571429 | \n", "-5.714286 | \n", "-8.571429 | \n", "-35.142857 | \n", "-12.285714 | \n", "-14.285714 | \n", "... | \n", "0.142857 | \n", "-0.285714 | \n", "0.571429 | \n", "0.571429 | \n", "-0.142857 | \n", "-0.285714 | \n", "0.061429 | \n", "0.052857 | \n", "0.044286 | \n", "0.035714 | \n", "
2021-01-04 | \n", "552.285714 | \n", "530.000000 | \n", "536.571429 | \n", "542.285714 | \n", "22.285714 | \n", "-6.571429 | \n", "-5.714286 | \n", "-8.571429 | \n", "15.714286 | \n", "-12.285714 | \n", "... | \n", "0.428571 | \n", "0.142857 | \n", "0.000000 | \n", "0.571429 | \n", "0.571429 | \n", "-0.142857 | \n", "0.074286 | \n", "0.061429 | \n", "0.052857 | \n", "0.044286 | \n", "
2021-01-05 | \n", "552.857143 | \n", "552.285714 | \n", "530.000000 | \n", "536.571429 | \n", "0.571429 | \n", "22.285714 | \n", "-6.571429 | \n", "-5.714286 | \n", "22.857143 | \n", "15.714286 | \n", "... | \n", "0.142857 | \n", "0.428571 | \n", "-0.428571 | \n", "0.000000 | \n", "0.571429 | \n", "0.571429 | \n", "0.095714 | \n", "0.074286 | \n", "0.061429 | \n", "0.052857 | \n", "
5 rows × 256 columns
\n", "\n", " | test_FR | \n", "diff_test_FR | \n", "diff_2_test_FR | \n", "test_NE | \n", "diff_test_NE | \n", "diff_2_test_NE | \n", "test_TI | \n", "diff_test_TI | \n", "diff_2_test_TI | \n", "test_VD | \n", "... | \n", "hosp_NE | \n", "diff_hosp_NE | \n", "diff_2_hosp_NE | \n", "hosp_FR | \n", "diff_hosp_FR | \n", "diff_2_hosp_FR | \n", "hosp_GE | \n", "diff_hosp_GE | \n", "diff_2_hosp_GE | \n", "vac_all | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.009009 | \n", "0.0000 | \n", "0.00 | \n", "0.024793 | \n", "0.066667 | \n", "0.090909 | \n", "0.014634 | \n", "0.000000 | \n", "0.000000 | \n", "0.0 | \n", "
1 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.018018 | \n", "0.0625 | \n", "0.05 | \n", "0.049587 | \n", "0.200000 | \n", "0.181818 | \n", "0.014634 | \n", "0.000000 | \n", "0.032258 | \n", "0.0 | \n", "
2 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.027027 | \n", "0.0625 | \n", "0.10 | \n", "0.049587 | \n", "0.000000 | \n", "0.136364 | \n", "0.014634 | \n", "0.000000 | \n", "0.000000 | \n", "0.0 | \n", "
3 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.018018 | \n", "-0.0625 | \n", "0.00 | \n", "0.041322 | \n", "-0.066667 | \n", "-0.045455 | \n", "0.019512 | \n", "0.043478 | \n", "0.032258 | \n", "0.0 | \n", "
4 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.027027 | \n", "0.0625 | \n", "0.00 | \n", "0.057851 | \n", "0.133333 | \n", "0.045455 | \n", "0.029268 | \n", "0.086957 | \n", "0.096774 | \n", "0.0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
908 | \n", "0.046436 | \n", "-0.002678 | \n", "-0.025352 | \n", "0.053610 | \n", "0.005452 | \n", "0.013990 | \n", "0.096719 | \n", "-0.017641 | \n", "-0.023144 | \n", "0.069339 | \n", "... | \n", "0.000000 | \n", "0.0000 | \n", "0.00 | \n", "0.008264 | \n", "-0.200000 | \n", "-0.136364 | \n", "0.087805 | \n", "-0.043478 | \n", "0.032258 | \n", "1.0 | \n", "
909 | \n", "0.046236 | \n", "-0.002678 | \n", "-0.004024 | \n", "0.053666 | \n", "0.000779 | \n", "0.004145 | \n", "0.096447 | \n", "-0.002614 | \n", "-0.014947 | \n", "0.069339 | \n", "... | \n", "0.000000 | \n", "0.0000 | \n", "0.00 | \n", "0.008264 | \n", "0.000000 | \n", "-0.136364 | \n", "0.087805 | \n", "0.000000 | \n", "-0.032258 | \n", "1.0 | \n", "
910 | \n", "0.044516 | \n", "-0.023032 | \n", "-0.019316 | \n", "0.052940 | \n", "-0.010125 | \n", "-0.006218 | \n", "0.096991 | \n", "0.005227 | \n", "0.001929 | \n", "0.068820 | \n", "... | \n", "0.000000 | \n", "0.0000 | \n", "0.00 | \n", "0.008264 | \n", "0.000000 | \n", "0.000000 | \n", "0.073171 | \n", "-0.130435 | \n", "-0.096774 | \n", "1.0 | \n", "
911 | \n", "0.042517 | \n", "-0.026781 | \n", "-0.037425 | \n", "0.050874 | \n", "-0.028816 | \n", "-0.025907 | \n", "0.085841 | \n", "-0.107155 | \n", "-0.075217 | \n", "0.065681 | \n", "... | \n", "0.000000 | \n", "0.0000 | \n", "0.00 | \n", "0.008264 | \n", "0.000000 | \n", "0.000000 | \n", "0.063415 | \n", "-0.086957 | \n", "-0.161290 | \n", "1.0 | \n", "
912 | \n", "0.034677 | \n", "-0.025174 | \n", "-0.079276 | \n", "0.041883 | \n", "-0.034268 | \n", "-0.082902 | \n", "0.069862 | \n", "-0.095394 | \n", "-0.138862 | \n", "0.053947 | \n", "... | \n", "0.000000 | \n", "0.0000 | \n", "0.00 | \n", "0.008264 | \n", "0.000000 | \n", "0.000000 | \n", "0.048780 | \n", "-0.130435 | \n", "-0.161290 | \n", "1.0 | \n", "
913 rows × 64 columns
\n", "