Predicting the Photo-voltaic energy production#
This Notebook aims at predicting the energy producte by solar pannel.
It uses weather data extracted from the MeteoFrance numerical models, as well as history of productions provided by RTE.
[1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
[2]:
filename_ssrd_regions = "../../data/silver/mean_daily_sun_j0.csv"
filename_energy_preduction = "../../data/rte_agg_daily_2014_2024.csv"
[3]:
df_ssrd_regions = pd.read_csv(filename_ssrd_regions, parse_dates=["time"]).set_index(
"time"
)
# sanitise the column names
region_names = [
col.replace(" ", "_").replace("'", "_").replace("-", "_").lower()
for col in df_ssrd_regions.columns
]
df_ssrd_regions.columns = region_names
region_names = df_ssrd_regions.columns
# Doing so to add empty rows for missing days
df_ssrd_regions.plot(figsize=(15, 10))
df_ssrd_regions["days_from_start"] = [
(date - df_ssrd_regions.index[0]).days for date in df_ssrd_regions.index
]
df_ssrd_regions.head()
[3]:
auvergne_rhône_alpes | bourgogne_franche_comté | bretagne | centre_val_de_loire | corse | grand_est | hauts_de_france | normandie | nouvelle_aquitaine | occitanie | pays_de_la_loire | provence_alpes_côte_d_azur | île_de_france | days_from_start | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
time | ||||||||||||||
2022-02-01 | 261520.851689 | 90775.394750 | 163552.580967 | 34723.093875 | 456686.957700 | 44975.603958 | 21341.962363 | 42435.778336 | 185091.563333 | 315764.897917 | 122142.333333 | 425822.506650 | 15171.129004 | 0 |
2022-02-02 | 197822.709267 | 59674.926817 | 225057.752062 | 163614.112946 | 455816.711071 | 36565.521433 | 66463.473763 | 155318.998325 | 240519.501958 | 292222.562750 | 244625.748700 | 430004.543860 | 73755.427321 | 1 |
2022-02-03 | 299190.206288 | 190444.998208 | 81208.290680 | 150123.865750 | 408849.293058 | 124497.688083 | 48622.630200 | 47198.020700 | 218038.002000 | 342835.312917 | 119590.019712 | 383634.419163 | 87619.219558 | 2 |
2022-02-04 | 284906.858032 | 131882.476333 | 228456.231833 | 92169.886950 | 330461.622500 | 87107.948417 | 120341.102958 | 164888.956167 | 112435.209250 | 238153.647625 | 168470.041229 | 390108.203765 | 113925.635167 | 3 |
2022-02-05 | 362909.540192 | 346769.893083 | 318892.106625 | 308881.768208 | 310414.561742 | 243847.895724 | 259733.252833 | 262925.207583 | 402942.371958 | 325491.808458 | 338302.897500 | 442079.500878 | 274746.580333 | 4 |
![../../_images/user_guide_notebooks_3_PV_prediction_3_1.png](../../_images/user_guide_notebooks_3_PV_prediction_3_1.png)
[4]:
df_energy_preduction = pd.read_csv(filename_energy_preduction, index_col=0)[["Solaire"]]
df_energy_preduction.index = pd.to_datetime(df_energy_preduction.index)
df_energy_preduction.head(), df_energy_preduction.tail()
[4]:
( Solaire
Date
2015-01-01 11370.5
2015-01-02 8297.5
2015-01-03 5860.5
2015-01-04 6926.0
2015-01-05 9786.5,
Solaire
Date
2024-04-04 76581.5
2024-04-05 72847.5
2024-04-06 61577.5
2024-04-07 46718.5
2024-04-08 26677.0)
[5]:
# align the indexes of the two dataframes
data = pd.concat([df_ssrd_regions, df_energy_preduction], join="inner", axis=1)
data.head()
[5]:
auvergne_rhône_alpes | bourgogne_franche_comté | bretagne | centre_val_de_loire | corse | grand_est | hauts_de_france | normandie | nouvelle_aquitaine | occitanie | pays_de_la_loire | provence_alpes_côte_d_azur | île_de_france | days_from_start | Solaire | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2022-02-01 | 261520.851689 | 90775.394750 | 163552.580967 | 34723.093875 | 456686.957700 | 44975.603958 | 21341.962363 | 42435.778336 | 185091.563333 | 315764.897917 | 122142.333333 | 425822.506650 | 15171.129004 | 0 | 21938.5 |
2022-02-02 | 197822.709267 | 59674.926817 | 225057.752062 | 163614.112946 | 455816.711071 | 36565.521433 | 66463.473763 | 155318.998325 | 240519.501958 | 292222.562750 | 244625.748700 | 430004.543860 | 73755.427321 | 1 | 21271.0 |
2022-02-03 | 299190.206288 | 190444.998208 | 81208.290680 | 150123.865750 | 408849.293058 | 124497.688083 | 48622.630200 | 47198.020700 | 218038.002000 | 342835.312917 | 119590.019712 | 383634.419163 | 87619.219558 | 2 | 20527.5 |
2022-02-04 | 284906.858032 | 131882.476333 | 228456.231833 | 92169.886950 | 330461.622500 | 87107.948417 | 120341.102958 | 164888.956167 | 112435.209250 | 238153.647625 | 168470.041229 | 390108.203765 | 113925.635167 | 3 | 19051.0 |
2022-02-05 | 362909.540192 | 346769.893083 | 318892.106625 | 308881.768208 | 310414.561742 | 243847.895724 | 259733.252833 | 262925.207583 | 402942.371958 | 325491.808458 | 338302.897500 | 442079.500878 | 274746.580333 | 4 | 41271.5 |
[6]:
from statsmodels.formula.api import ols
# split test for time series
from sklearn.model_selection import TimeSeriesSplit
Modeling#
4 models are tested :
Only Total sun flux (no region details)
Only regions sun flux (no time feature)
Total sun flux + time
Regions sun flux + time
The Time feature allows to model a trend in the number of installed solar pannels.
[7]:
exo_vars = df_ssrd_regions.columns
exo_vars = exo_vars.drop("days_from_start")
data["mean_sun"] = data[exo_vars].mean(axis=1)
endog_var = "Solaire"
[8]:
tscv = TimeSeriesSplit(n_splits=60, test_size=1) # testing on 1 days forcast
[9]:
def test_model(formula="Solaire ~ mean_sun -1"):
mod_1_mape = []
for i, (train_index, test_index) in enumerate(tscv.split(data)):
model_1 = ols(formula, data=data.iloc[train_index]).fit()
if i == 0:
first_test_index = test_index
first_model_1 = model_1
predictions = model_1.predict(data.iloc[test_index])
error = data.iloc[test_index]["Solaire"] - predictions
mape = (error.abs() / data.iloc[test_index]["Solaire"]).mean()
mod_1_mape.append(mape)
last_test_index = test_index
last_model_1 = model_1
return mod_1_mape, first_test_index, first_model_1, last_test_index, last_model_1
mod_1_mape, first_test_index, first_model_1, last_test_index, last_model_1 = (
test_model()
)
[10]:
ax = data.plot(y="Solaire", label="True")
first_model_1.predict(data.iloc[first_test_index]).plot(
ax=ax, label="First Test Predicted"
)
last_model_1.predict(data.iloc[last_test_index]).plot(
ax=ax, label="Last Test Predicted", lw=2
)
ax.legend()
[10]:
<matplotlib.legend.Legend at 0x7fccc45d9de0>
![../../_images/user_guide_notebooks_3_PV_prediction_11_1.png](../../_images/user_guide_notebooks_3_PV_prediction_11_1.png)
[11]:
fig, ax = plt.subplots()
ax.hist(mod_1_mape, bins=20)
ax.set_title("MAPE distribution for model 1")
ax.set_xlabel("MAPE")
[11]:
Text(0.5, 0, 'MAPE')
![../../_images/user_guide_notebooks_3_PV_prediction_12_1.png](../../_images/user_guide_notebooks_3_PV_prediction_12_1.png)
[14]:
formula_2 = f"Solaire ~ {' + '.join(exo_vars)} -1"
print(formula_2)
mod_2_mape, first_test_index, first_model_2, last_test_index, last_model_2 = test_model(
formula_2
)
Solaire ~ auvergne_rhône_alpes + bourgogne_franche_comté + bretagne + centre_val_de_loire + corse + grand_est + hauts_de_france + normandie + nouvelle_aquitaine + occitanie + pays_de_la_loire + provence_alpes_côte_d_azur + île_de_france -1
[15]:
fig, ax = plt.subplots()
ax.hist(mod_2_mape, bins=20)
ax.set_title("MAPE distribution for model 2")
[15]:
Text(0.5, 1.0, 'MAPE distribution for model 2')
![../../_images/user_guide_notebooks_3_PV_prediction_14_1.png](../../_images/user_guide_notebooks_3_PV_prediction_14_1.png)
[16]:
formula_3 = "Solaire ~ mean_sun + days_from_start -1"
mod_3_mape, first_test_index, first_model_3, last_test_index, last_model_3 = test_model(
formula_3
)
formula_4 = f"Solaire ~ {' + '.join(exo_vars)} + days_from_start -1"
mod_4_mape, first_test_index, first_model_4, last_test_index, last_model_4 = test_model(
formula_4
)
print("formula_3", formula_3)
print("formula_4", formula_4)
formula_3 Solaire ~ mean_sun + days_from_start -1
formula_4 Solaire ~ auvergne_rhône_alpes + bourgogne_franche_comté + bretagne + centre_val_de_loire + corse + grand_est + hauts_de_france + normandie + nouvelle_aquitaine + occitanie + pays_de_la_loire + provence_alpes_côte_d_azur + île_de_france + days_from_start -1
[17]:
# display the MAPE distribution for all models (KDE)
fig, ax = plt.subplots()
for i, mape in enumerate([mod_1_mape, mod_2_mape, mod_3_mape, mod_4_mape]):
pd.Series(mape).plot.kde(ax=ax, label=f"Model {i+1}")
ax.set_title("MAPE distribution for all models")
ax.legend()
[17]:
<matplotlib.legend.Legend at 0x7fcc834ec7c0>
![../../_images/user_guide_notebooks_3_PV_prediction_16_1.png](../../_images/user_guide_notebooks_3_PV_prediction_16_1.png)
[18]:
# print mean MAPE for all models
for i, mape in enumerate([mod_1_mape, mod_2_mape, mod_3_mape, mod_4_mape]):
print(f"Model {i+1} mean MAPE: {np.mean(mape):.2%}")
Model 1 mean MAPE: 30.85%
Model 2 mean MAPE: 21.88%
Model 3 mean MAPE: 18.18%
Model 4 mean MAPE: 13.79%
Conclusion#
The model performances doesn’t really follow the expected order (more feature => better performance)
Maybe this is due to some sort of over-fitting, even thought there are not so mainy features…
[19]:
last_model_2.params.to_csv("sun_model_2_params.csv")