Connect to Google Drive to import files
from google.colab import drive
drive.mount('/content/drive')
Install modules
!apt install proj-bin libproj-dev libgeos-dev
!pip install https://github.com/matplotlib/basemap/archive/master.zip
!pip install netCDF4
Import libraries and modules
from netCDF4 import Dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import matplotlib
from mpl_toolkits.basemap import Basemap
from google.colab import files
The Emissions Database for Global Atmospheric Research (EDGAR) provides global past and present-day anthropogenic emissions of greenhouse gases and air pollutants, by country and on a spatial grid.
EDGARv5.0 provides emissions of CO2 per sector and country.
Source: https://edgar.jrc.ec.europa.eu/overview.php?v=50_GHG
CO2 emissions are provided separately:
- CO2_excl_short-cycle_org_C includes all fossil CO2 sources, such as fossil fuel combustion, non-metallic mineral processes (e.g. cement production), metal (ferrous and non-ferrous) production processes, urea production, agricultural liming and solvent use. Large-scale biomass burning (savannah burning, forest fires) and sources and sinks from land-use, land-use change and forestry (LULUCF) are excluded.
- CO2_short-cycle_org_C represents short-cycle biomass burning (such as agricultural waste burning and savannah burning)
For the energy-related sectors, the activity data are mainly based on the energy balance statistics of IEA (2017) (http://www.oecd-ilibrary.org/energy/co2-emissions-from-fuel-combustion-2017_co2_fuel-2017-en), whereas the activity data for the agricultural sectors originate mainly from FAO (2018) (http://www.fao.org/faostat/en/#home). Additional information can be found in Crippa et al. (2019).
The methodology for spatial allocation of emissions on 0.1° x 0.1° grid cells is the following: a geographical database was built from spatial proxy datasets with the location of energy and manufacturing facilities, road networks, shipping routes, human and animal population density and agricultural land use, all of which vary over time. The input datasets were point, line and area data at various resolutions; using GIS techniques for conversion, resampling and aggregation, they were brought onto a 0.1° x 0.1° grid. National sector totals are then distributed over each country's area according to the percentages given by the spatial proxies.
Notes: Emission gridmaps are expressed in kg substance /m2 /s.
CO2 emissions excluding short-cycle biomass burning (such as agricultural waste burning and Savannah burning) but including other biomass burning (such as forest fires, post-burn decay, peat fires and decay of drained peatlands).
NB: Short-cycle biomass burning emissions are, by convention, excluded from inventories because of the assumed carbon rotation speed of these emissions. However, this is a convention, not a physical reality.
Emissions are expressed in kg substance /m2 /s.
Annual data from 2013 to 2018.
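Since the gridmaps are fluxes, a small helper (a sketch, not part of the original pipeline; it assumes a spherical Earth of radius 6 371 km) can convert a cell value into tonnes per year, which is often easier to reason about:
# Hypothetical helper: convert an EDGAR flux (kg substance / m2 / s) into
# tonnes per year for one 0.1 deg x 0.1 deg grid cell at a given latitude
def flux_to_tonnes_per_year(flux_kg_m2_s, lat_deg, earth_radius_m=6.371e6):
    cell_rad = np.deg2rad(0.1)
    # Cell area on a sphere shrinks with the cosine of latitude
    area_m2 = (earth_radius_m ** 2) * cell_rad ** 2 * np.cos(np.deg2rad(lat_deg))
    seconds_per_year = 365.25 * 24 * 3600
    return flux_kg_m2_s * area_m2 * seconds_per_year / 1000.0  # kg -> tonnes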
path_Edgar_2015_v1 = "/content/drive/My Drive/Data For Good/Inputs/EDGAR data/v50_CO2_excl_short-cycle_org_C_2015.0.1x0.1.nc"
path_Edgar_2016_v1 = "/content/drive/My Drive/Data For Good/Inputs/EDGAR data/v50_CO2_excl_short-cycle_org_C_2016.0.1x0.1.nc"
path_Edgar_2017_v1 = "/content/drive/My Drive/Data For Good/Inputs/EDGAR data/v50_CO2_excl_short-cycle_org_C_2017.0.1x0.1.nc"
path_Edgar_2018_v1 = "/content/drive/My Drive/Data For Good/Inputs/EDGAR data/v50_CO2_excl_short-cycle_org_C_2018.0.1x0.1.nc"
dataset_Edgar_2015_v1 = Dataset(path_Edgar_2015_v1, 'r')
dataset_Edgar_2016_v1 = Dataset(path_Edgar_2016_v1, 'r')
dataset_Edgar_2017_v1 = Dataset(path_Edgar_2017_v1, 'r')
dataset_Edgar_2018_v1 = Dataset(path_Edgar_2018_v1, 'r')
dataset_Edgar_2015_v1.variables.keys()
dataset_Edgar_2015_v1.variables['lat'].shape
dataset_Edgar_2015_v1.variables['lon'].shape
dataset_Edgar_2015_v1.variables['emi_co2'].shape
dataset_list_v1 = [dataset_Edgar_2015_v1, dataset_Edgar_2016_v1, dataset_Edgar_2017_v1, dataset_Edgar_2018_v1]
dataframe_list_v1 = list()
for dataset in dataset_list_v1:
    arr_CO2 = np.array(dataset.variables['emi_co2'])
    arr_lat = np.array(dataset.variables['lat'])
    arr_lon = np.array(dataset.variables['lon'])
    preparatory_table = np.zeros((arr_lat.size*arr_lon.size, 3))
    # Flatten the (lat, lon) grid into a long (latitude, longitude, emission) table,
    # covering the full grid including the last row and column
    for j in range(arr_lon.size):
        for i in range(arr_lat.size):
            preparatory_table[i+(arr_lat.size*j), 0] = arr_lat[i]
            preparatory_table[i+(arr_lat.size*j), 1] = arr_lon[j]
            preparatory_table[i+(arr_lat.size*j), 2] = arr_CO2[i, j]
    dataframe_list_v1.append(pd.DataFrame(preparatory_table))
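For a 1800 x 3600 grid (6.48 million cells) the nested Python loops are slow; an equivalent vectorized construction (a sketch producing the same three columns) would be:
# Column-major (Fortran) ravel matches the i + arr_lat.size * j indexing above
lon_grid, lat_grid = np.meshgrid(arr_lon, arr_lat)
flat = np.column_stack([lat_grid.ravel(order='F'),
                        lon_grid.ravel(order='F'),
                        arr_CO2.ravel(order='F')])
df_fast = pd.DataFrame(flat, columns=['latitude', 'longitude', 'CO2 emissions'])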
def decale(a):
    # Shift a longitude from the [0, 360] convention to [-180, 180]
    if a > 180:
        return a - 360
    return a

for dataframe in dataframe_list_v1:
    dataframe.columns = ['latitude', 'longitude', 'CO2 emissions']
    dataframe['longitude'] = dataframe['longitude'].apply(decale)
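The same shift can also be done without a per-row apply; a vectorized sketch for a given dataframe:
# Vectorized equivalent of decale for one dataframe
lon = dataframe['longitude'].to_numpy()
dataframe['longitude'] = np.where(lon > 180, lon - 360, lon)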
dataframe_list_v1[0]
Creation of the dataframes
CO2_emissions_Edgar_2015_v1 = dataframe_list_v1[0]
CO2_emissions_Edgar_2016_v1 = dataframe_list_v1[1]
CO2_emissions_Edgar_2017_v1 = dataframe_list_v1[2]
CO2_emissions_Edgar_2018_v1 = dataframe_list_v1[3]
Check that the data change from one year to the next - only the 'CO2 emissions' column is supposed to change
CO2_emissions_Edgar_2015_v1.describe()
CO2_emissions_Edgar_2018_v1.describe()
Reminder: the unit is kg substance /m2 /s
def draw_map_Edgar(data, titre, lon_min=-180, lon_max=180, lat_min=-90, lat_max=90, size_point=1, frontier=False):
    plt.figure(figsize=(15, 10), edgecolor='w')
    m = Basemap(llcrnrlat=lat_min, urcrnrlat=lat_max, llcrnrlon=lon_min, urcrnrlon=lon_max)
    m.shadedrelief()
    parallels = np.arange(-80., 81, 10.)
    m.drawparallels(parallels, labels=[False, True, True, False])
    meridians = np.arange(10., 351., 20.)
    m.drawmeridians(meridians, labels=[True, False, False, True])
    normal = matplotlib.colors.LogNorm(vmax=data['CO2 emissions'].max())
    m.scatter(data['longitude'], data['latitude'], c=data['CO2 emissions'], cmap=plt.cm.jet, s=size_point, norm=normal)
    plt.colorbar()
    plt.title(titre)
    if frontier:
        m.drawcountries(linewidth=0.5)
        m.drawcoastlines(linewidth=0.7)
    plt.show()
#dataframe_list_final_v1 = (CO2_emissions_Edgar_2015_v1,CO2_emissions_Edgar_2016_v1,CO2_emissions_Edgar_2017_v1,CO2_emissions_Edgar_2018_v1)
#title_list_Edgar_v1 = ('CO2 emissions of year 2015 v1','CO2 emissions of year 2016 v1','CO2 emissions of year 2017 v1','CO2 emissions of year 2018 v1')
draw_map_Edgar(CO2_emissions_Edgar_2018_v1, 'CO2 emissions excluding short-cycle biomass burning (2018)', frontier=True)
#CO2_emissions_Edgar_2015_v1.to_csv("/content/drive/My Drive/Data For Good/Outputs/CO2_emissions_Edgar_2015_v1.csv")
#CO2_emissions_Edgar_2016_v1.to_csv("/content/drive/My Drive/Data For Good/Outputs/CO2_emissions_Edgar_2016_v1.csv")
#CO2_emissions_Edgar_2017_v1.to_csv("/content/drive/My Drive/Data For Good/Outputs/CO2_emissions_Edgar_2017_v1.csv")
#CO2_emissions_Edgar_2018_v1.to_csv("/content/drive/My Drive/Data For Good/Outputs/CO2_emissions_Edgar_2018_v1.csv")
CO2 emissions from large-scale biomass burning: savannah burning, forest fires, and sources and sinks from land-use, land-use change and forestry (LULUCF).
Annual data from 2013 to 2015 (no data available for subsequent years)
Emissions are expressed in kg substance /m2 /s.
path_Edgar_2015_v2 = "/content/drive/My Drive/Data For Good/Inputs/EDGAR data/v50_CO2_org_short-cycle_C_2015.0.1x0.1.nc"
dataset_Edgar_2015_v2 = Dataset(path_Edgar_2015_v2, 'r')
dataset_list_v2 = [dataset_Edgar_2015_v2]
dataframe_list_v2 = list()
for dataset in dataset_list_v2:
    arr_CO2 = np.array(dataset.variables['emi_co2'])
    arr_lat = np.array(dataset.variables['lat'])
    arr_lon = np.array(dataset.variables['lon'])
    preparatory_table = np.zeros((arr_lat.size*arr_lon.size, 3))
    # Same flattening as for v1, covering the full grid
    for j in range(arr_lon.size):
        for i in range(arr_lat.size):
            preparatory_table[i+(arr_lat.size*j), 0] = arr_lat[i]
            preparatory_table[i+(arr_lat.size*j), 1] = arr_lon[j]
            preparatory_table[i+(arr_lat.size*j), 2] = arr_CO2[i, j]
    dataframe_list_v2.append(pd.DataFrame(preparatory_table))
for dataframe in dataframe_list_v2:
    dataframe.columns = ['latitude', 'longitude', 'CO2 emissions']
    dataframe['longitude'] = dataframe['longitude'].apply(decale)
Creation of the dataframes
CO2_emissions_Edgar_2015_v2 = dataframe_list_v2[0]
CO2_emissions_Edgar_2015_v2.describe()
Comparison with v1
CO2_emissions_Edgar_2015_v1.describe()
We observe that the CO2 values are much lower in v2 than in v1 (a factor of 10 for the mean, a factor of 100 for the maximum)
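A quick ratio check (a sketch) makes the comparison explicit:
# Compare mean and max CO2 emissions between v1 and v2 for 2015
mean_ratio = (CO2_emissions_Edgar_2015_v1['CO2 emissions'].mean()
              / CO2_emissions_Edgar_2015_v2['CO2 emissions'].mean())
max_ratio = (CO2_emissions_Edgar_2015_v1['CO2 emissions'].max()
             / CO2_emissions_Edgar_2015_v2['CO2 emissions'].max())
print("mean ratio v1/v2:", mean_ratio, "- max ratio v1/v2:", max_ratio)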
Reminder: the unit is kg substance /m2 /s
draw_map_Edgar(CO2_emissions_Edgar_2015_v2, 'CO2 emissions from short-cycle biomass burning (2015)', frontier=True)
#CO2_emissions_Edgar_2015_v2.to_csv("/content/drive/My Drive/Data For Good/Outputs/CO2_emissions_Edgar_2015_v2.csv")
This version of the data is the sum of CO2 emissions excluding short-cycle biomass burning (CO2_excl_short-cycle_org_C) and CO2 emissions from short-cycle biomass burning (CO2_short-cycle_org_C), giving the total CO2 emissions.
NB: For years 2016 to 2018, CO2 emissions from short-cycle biomass burning are not available, so we can only provide CO2 emissions excluding short-cycle biomass burning for those years. This does not have a big influence, as the data excluding short-cycle burning (v1) are on average 10 times larger, and 100 times larger in terms of maximum CO2 emissions, than the short-cycle biomass burning data (v2). Therefore, even when v2 has data, v3 ≈ v1
Emissions are expressed in kg substance /m2 /s.
For 2015 we sum the CO2 emissions excluding short-cycle biomass burning + CO2 emissions due to biomass burning.
For 2016, 2017 and 2018 we only have CO2 emissions excluding short-cycle biomass.
# 2015
CO2_emissions_Edgar_2015_v3 = pd.DataFrame()
CO2_emissions_Edgar_2015_v3['latitude'] = CO2_emissions_Edgar_2015_v1['latitude']
CO2_emissions_Edgar_2015_v3['longitude'] = CO2_emissions_Edgar_2015_v1['longitude']
CO2_emissions_Edgar_2015_v3['CO2 emissions'] = CO2_emissions_Edgar_2015_v1['CO2 emissions'] + CO2_emissions_Edgar_2015_v2['CO2 emissions']
# 2016
CO2_emissions_Edgar_2016_v3 = CO2_emissions_Edgar_2016_v1.copy()  # copy so the classification below does not alter v1
# 2017
CO2_emissions_Edgar_2017_v3 = CO2_emissions_Edgar_2017_v1.copy()
# 2018
CO2_emissions_Edgar_2018_v3 = CO2_emissions_Edgar_2018_v1.copy()
# Check the 'CO2 emissions' columns through their means
# The difference should be very small, which is the case
print(CO2_emissions_Edgar_2015_v3['CO2 emissions'].mean()
      - (CO2_emissions_Edgar_2015_v1['CO2 emissions'].mean() + CO2_emissions_Edgar_2015_v2['CO2 emissions'].mean()))
draw_map_Edgar(CO2_emissions_Edgar_2015_v3,'Total CO2 emissions including short-cycle biomass burning (2015)', frontier=True)
Visualisation of the data
plt.figure(figsize=(15, 15))
plt.scatter(np.arange(len(CO2_emissions_Edgar_2015_v3)), CO2_emissions_Edgar_2015_v3['CO2 emissions'], s=1)
We observe that many data points have very small values. Let's take a closer look at the data distribution
# Percentiles 1..100 of the 2015 emissions (np.percentile accepts an array of percentiles)
list_percentile_2015 = list(np.percentile(CO2_emissions_Edgar_2015_v3['CO2 emissions'], np.arange(1, 101)))
list_percentile_2015
This gives us a better idea of how to classify our data
Let's check whether the same holds for 2016
list_percentile_2016 = list(np.percentile(CO2_emissions_Edgar_2016_v3['CO2 emissions'], np.arange(1, 101)))
list_percentile_2016
Creation of the classification
We chose the following classification after observing the 100 percentiles of the data:
- CO2 emissions = 0 (encoded 'Null' or 0)
- 0 < CO2 emissions <= 10^-10 (encoded 'Very low' or 1)
- 10^-10 < CO2 emissions <= 10^-9 (encoded 'Low' or 2)
- 10^-9 < CO2 emissions <= 10^-8 (encoded 'Medium' or 3)
- CO2 emissions > 10^-8 (encoded 'High' or 4)
# Apply the same classification to each year (2015 to 2018)
for df in (CO2_emissions_Edgar_2015_v3, CO2_emissions_Edgar_2016_v3,
           CO2_emissions_Edgar_2017_v3, CO2_emissions_Edgar_2018_v3):
    emissions = df['CO2 emissions']
    df.loc[emissions == 0, 'CO2 classification'] = 0
    df.loc[(emissions > 0) & (emissions <= 1.0e-10), 'CO2 classification'] = 1
    df.loc[(emissions > 1.0e-10) & (emissions <= 1.0e-9), 'CO2 classification'] = 2
    df.loc[(emissions > 1.0e-9) & (emissions <= 1.0e-8), 'CO2 classification'] = 3
    df.loc[emissions > 1.0e-8, 'CO2 classification'] = 4
Test
CO2_emissions_Edgar_2017_v3[CO2_emissions_Edgar_2017_v3['CO2 classification'] == 3]
Check if all data points have been encoded
for df in (CO2_emissions_Edgar_2015_v3, CO2_emissions_Edgar_2016_v3,
           CO2_emissions_Edgar_2017_v3, CO2_emissions_Edgar_2018_v3):
    # The per-class counts must add up to the total number of grid cells
    print(sum(df[df['CO2 classification'] == k].count() for k in range(5)))
Add a non-numerical label for each class for better readability
classification_Edgar = pd.DataFrame({'CO2 classification': [0, 1, 2, 3, 4], 'CO2 classification mapping': ['Null', 'Very low', 'Low', 'Medium', 'High']})
CO2_emissions_Edgar_2015_v3 = CO2_emissions_Edgar_2015_v3.merge(classification_Edgar, on='CO2 classification', how='left')
CO2_emissions_Edgar_2016_v3 = CO2_emissions_Edgar_2016_v3.merge(classification_Edgar, on='CO2 classification', how='left')
CO2_emissions_Edgar_2017_v3 = CO2_emissions_Edgar_2017_v3.merge(classification_Edgar, on='CO2 classification', how='left')
CO2_emissions_Edgar_2018_v3 = CO2_emissions_Edgar_2018_v3.merge(classification_Edgar, on='CO2 classification', how='left')
Number of data points in the 'High' category
2015 has more data points because the v2 EDGAR data (biomass burning) are available for that year
print("The category 'High' has", CO2_emissions_Edgar_2015_v3[CO2_emissions_Edgar_2015_v3['CO2 classification'] == 4].count()[0], "data points in 2015")
print("The category 'High' has", CO2_emissions_Edgar_2016_v3[CO2_emissions_Edgar_2016_v3['CO2 classification'] == 4].count()[0], "data points in 2016")
print("The category 'High' has", CO2_emissions_Edgar_2017_v3[CO2_emissions_Edgar_2017_v3['CO2 classification'] == 4].count()[0], "data points in 2017")
print("The catagory 'High' has", CO2_emissions_Edgar_2018_v3[CO2_emissions_Edgar_2018_v3['CO2 classification'] == 4].count()[0], "data points in 2018")
CO2_emissions_Edgar_2015_v3
CO2_emissions_Edgar_peaks_2018 = CO2_emissions_Edgar_2018_v3[CO2_emissions_Edgar_2018_v3['CO2 classification']==4]
draw_map_Edgar(CO2_emissions_Edgar_peaks_2018,'CO2 emissions peaks (2018)', frontier=True)
CO2_emissions_Edgar_2015_v3.to_csv("/content/drive/My Drive/Data For Good/Outputs/CO2_emissions_Edgar_2015_v3.csv")
CO2_emissions_Edgar_2016_v3.to_csv("/content/drive/My Drive/Data For Good/Outputs/CO2_emissions_Edgar_2016_v3.csv")
CO2_emissions_Edgar_2017_v3.to_csv("/content/drive/My Drive/Data For Good/Outputs/CO2_emissions_Edgar_2017_v3.csv")
CO2_emissions_Edgar_2018_v3.to_csv("/content/drive/My Drive/Data For Good/Outputs/CO2_emissions_Edgar_2018_v3.csv")
2.1 Power plants
Data from power plants burning fossil fuels.
Link: http://datasets.wri.org/dataset/globalpowerplantdatabase
The World Resources Institute provides a list of power plants producing electricity from different primary energy sources. We filtered this list to keep only the fossil primary energies (gas, oil and coal), which release CO2 during combustion.
The dataset gives the latitude and longitude of these power plants, as well as the electricity produced, in GWh.
We will cross these data with the emission factors (in gCO2/KWh) of the different types of power plants in order to estimate their CO2 emissions. We use the emission factors from ADEME (https://www.bilans-ges.ademe.fr/docutheque/docs/Documentation%20g%C3%A9n%C3%A9rale%20anglaise%20v17.0.pdf, page 111).
These emission factors include the upstream of the fuels (extraction, processing, refining, transportation and distribution) as well as the combustion. Our estimate of power plant CO2 emissions will therefore be larger than the actual on-site emissions, because it attributes to each plant an upstream share of emissions that did not take place at the plant itself. However, since our goal here is to localize CO2 emission peaks, we leave this question aside.
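As a sanity check on the units: multiplying GWh by gCO2/KWh yields tonnes of CO2 directly, since 1 GWh = 10^6 KWh and 10^6 g = 1 tonne. An illustrative example (the 1050 gCO2/KWh coal factor is the ADEME value used below; the generation figure is hypothetical):
generation_gwh = 1000       # hypothetical annual output of a coal plant
factor_g_per_kwh = 1050     # ADEME emission factor for coal power plants
tonnes_co2 = generation_gwh * factor_g_per_kwh  # the 1e6 factors cancel out
print(tonnes_co2)           # 1 050 000 tonnes of CO2 per year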
path_power_plant1 = "/content/drive/My Drive/Data For Good/Inputs/WRI data/global_power_plant_database.csv"
df_source1 = pd.read_csv(path_power_plant1, sep=",")
df_source1.head()
Work on the format of the dataframe
centrale_preparation = df_source1.copy()
# Removal of non-useful columns
centrale_preparation = centrale_preparation.drop(['country','commissioning_year', 'owner', 'source','gppd_idnr', 'other_fuel1','other_fuel2','other_fuel3', 'url','geolocation_source', 'wepp_id','year_of_capacity_data'],axis=1)
# Renaming the source of the CO2 emissions for more clarity
centrale_preparation.replace('Gas', 'Gas power plant', inplace=True)
centrale_preparation.replace('Oil', 'Oil power plant', inplace=True)
centrale_preparation.replace('Coal', 'Coal power plant', inplace=True)
# Observation of all power plant types
centrale_preparation['primary_fuel'].unique()
# We keep only fossil fuel power plants as they emit CO2
Drop_liste = ('Hydro', 'Other', 'Biomass', 'Petcoke', 'Wind', 'Nuclear', 'Solar', 'Waste', 'Wave and Tidal', 'Geothermal', 'Cogeneration', 'Storage')
centrale_preparation = centrale_preparation[~centrale_preparation['primary_fuel'].isin(Drop_liste)]
# Test
centrale_preparation['primary_fuel'].unique()
# Check for missing data (count of non-null entries per column)
centrale_preparation.count()
Creation of the dataframe to add emissions factors (source: ADEME), and merge of both dataframes
gCO2_KWh = pd.DataFrame ({'primary_fuel': ['Gas power plant', 'Oil power plant', 'Coal power plant'], 'gCO2/KWh': [443, 778, 1050]})
# Merge of the dataframes
centrale_preparation = centrale_preparation.merge(gCO2_KWh, on='primary_fuel', how='left')
centrale_preparation.head()
centrale_preparation.describe()
Negative values are observed for the 'generation_gwh' columns. We will deal with them later
Data processing
CO2_emissions_centrale = centrale_preparation.copy()
What do the "estimated_generation_gwh" data represent? Below is what the source indicates (https://wriorg.s3.amazonaws.com/s3fs-public/global-power-plant-database-technical-note-v1.0.pdf): "When (annual) generation is reported, we include it directly in the database. When generation information is not available for a specific plant, we estimate it, although estimating annual generation at the plant level is challenging. Electricity generated by each plant varies by time period and depends on factors such as the regulatory environment, level of demand, cost of fuels, and extent of planned and unplanned maintenance in addition to plant-level characteristics. We devised two potential ways to estimate annual plant-level electricity generation: scaling information on aggregate generation by plant size and a machine-learning approach"
For each year, the 'generation_gwh_year' column contains the values that were actually measured.
For 2017, we also have estimated values where nothing was measured. We therefore have two types of data:
- 'generation_gwh_2017' (measured values only)
- 'generation_gwh_2017_with_estimated_data', which takes the measured value in priority and falls back to the estimated value when no measurement is available
# Take the measured 2017 generation when available, otherwise fall back to the estimate
CO2_emissions_centrale['generation_gwh_2017_with_estimated_data'] = (
    CO2_emissions_centrale['generation_gwh_2017']
    .fillna(CO2_emissions_centrale['estimated_generation_gwh'])
)
count = CO2_emissions_centrale['generation_gwh_2017_with_estimated_data'].isna().sum()
print("The number of missing GWh electricity production values for fossil fuel power plants (with estimated data) is", count, "out of the", CO2_emissions_centrale.shape[0], "power plants in the dataset")
Creation of the CO2 emissions data. If the GWh of electricity production is negative, the corresponding CO2 emissions will be NaN
# GWh x gCO2/KWh yields tonnes of CO2 directly (1 GWh = 1e6 KWh and 1e6 g = 1 t)
generation_to_emissions = [
    ('generation_gwh_2013', 'tCO2_emitted_in_2013'),
    ('generation_gwh_2014', 'tCO2_emitted_in_2014'),
    ('generation_gwh_2015', 'tCO2_emitted_in_2015'),
    ('generation_gwh_2016', 'tCO2_emitted_in_2016'),
    ('generation_gwh_2017', 'tCO2_emitted_in_2017'),
    ('generation_gwh_2017_with_estimated_data', 'tCO2_emitted_in_2017_with estimated_data'),
]
for gen_col, co2_col in generation_to_emissions:
    gen = CO2_emissions_centrale[gen_col]
    # Negative or zero generation (and missing values) become NaN emissions
    CO2_emissions_centrale[co2_col] = np.where(gen > 0, gen * CO2_emissions_centrale['gCO2/KWh'], np.nan)
Extrapolation to get data for 2018 and 2019.
The (rough) hypothesis made is that these years have the same values as 2017 with estimated data. The rationale is that these power plants have a long lifetime (e.g. > 20 years for coal power plants and often > 40 years, source: https://www.gem.wiki/Estimating_carbon_dioxide_emissions_from_coal_plants), and the years 2018 & 2019 could be important for the CO2 emissions peak algorithm
CO2_emissions_centrale['tCO2_emitted_in_2018_with estimated_data_extrapolated'] = CO2_emissions_centrale['tCO2_emitted_in_2017_with estimated_data']
CO2_emissions_centrale['tCO2_emitted_in_2019_with estimated_data_extrapolated'] = CO2_emissions_centrale['tCO2_emitted_in_2017_with estimated_data']
CO2_emissions_centrale.describe()
plt.figure(figsize = (10,10))
plt.scatter(np.arange(0,CO2_emissions_centrale.shape[0],1),CO2_emissions_centrale['tCO2_emitted_in_2017_with estimated_data'].sort_values(ascending = True), s=1)
plt.xlabel('Plant ID')
plt.ylabel('Annual CO2 emissions in 2017 (tonnes) with estimated data')
The maximum is due to a single outlier. We will not remove this plant, as that would also remove its non-estimated data.
This outlier is in fact anomalous: the largest power plants emit on the order of 10^7 tonnes of CO2 per year (sources: https://www.eurekalert.org/pub_releases/2007-11/cfgd-crc111207.php; https://www.theguardian.com/environment/2012/jan/12/america-top-polluting-power-stations)
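To inspect that outlier (a sketch; it assumes the WRI 'name' column is still present, which it should be since it was not dropped above):
# Row with the largest 2017 (with estimated data) emissions
outlier = CO2_emissions_centrale.loc[
    CO2_emissions_centrale['tCO2_emitted_in_2017_with estimated_data'].idxmax()]
print(outlier[['name', 'primary_fuel', 'tCO2_emitted_in_2017_with estimated_data']])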
def draw_map_centrale(data, year, titre, lon_min=-180, lon_max=180, lat_min=-90, lat_max=90, size_point=1, frontier=False):
    plt.figure(figsize=(15, 10), edgecolor='w')
    m = Basemap(llcrnrlat=lat_min, urcrnrlat=lat_max, llcrnrlon=lon_min, urcrnrlon=lon_max)
    m.shadedrelief()
    parallels = np.arange(-80., 81, 10.)
    m.drawparallels(parallels, labels=[False, True, True, False])
    meridians = np.arange(10., 351., 20.)
    m.drawmeridians(meridians, labels=[True, False, False, True])
    normal = matplotlib.colors.LogNorm(vmax=data[year].max())
    m.scatter(data['longitude'], data['latitude'], c=data[year], cmap=plt.cm.jet, s=size_point, norm=normal)
    plt.colorbar()
    plt.title(titre)
    if frontier:
        m.drawcountries(linewidth=0.5)
        m.drawcoastlines(linewidth=0.7)
    plt.show()
draw_map_centrale(CO2_emissions_centrale, 'tCO2_emitted_in_2017_with estimated_data', 'Tonnes of CO2 emitted per power plant (2017)', frontier=True)
CO2_emissions_centrale.to_csv("/content/drive/My Drive/Data For Good/Outputs/CO2_emissions_centrale.csv")
2.2 Data from Global coal plant tracker
This dataset is provided by the Global Energy Monitor (GEM). It lists coal power plants and estimates the tonnes of CO2 emitted per year. The methodology can be found here: https://www.gem.wiki/Estimating_carbon_dioxide_emissions_from_coal_plants.
To summarize, four factors are used to estimate the CO2 emissions from coal plants (see the sketch after this list):
- Plant capacity
- Plant capacity factor
- Heat rate of plant (an expression of efficiency)
- Emissions factor of the type of coal used in the plant
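A hedged sketch of such an estimate (the unit conventions and the example constants are illustrative, not GEM's exact figures):
def coal_plant_co2_mt_per_year(capacity_mw, capacity_factor,
                               heat_rate_btu_per_kwh, ef_kg_co2_per_mmbtu):
    kwh_per_year = capacity_mw * 1000 * capacity_factor * 8760   # MW -> KWh/year
    mmbtu_per_year = kwh_per_year * heat_rate_btu_per_kwh / 1e6  # fuel energy input
    return mmbtu_per_year * ef_kg_co2_per_mmbtu / 1e9            # kg -> million tonnes

# Example: 1000 MW plant, 60% capacity factor, 10 000 Btu/KWh,
# ~95 kg CO2 per mmBtu (typical for bituminous coal)
print(coal_plant_co2_mt_per_year(1000, 0.6, 10000, 95))  # ~5 Mt CO2 / year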
We added some columns to the provided csv file - 'Status 2016 (done by Data4Good)' for example - indicating whether the power plant was operating in the corresponding year (the status was only available as of January 2020). The methodology was the following:
- If the 'Status' column indicates 'Mothballed', set to 'Not operating'
- If the 'Year' column is less than or equal to the corresponding year and this value is not empty, set to 'Not operating'
- If the 'RETIRED' column is strictly greater than the corresponding year, or the value is empty (when this value is empty, this is also the case for the 'RETIRED' column, meaning that the power plant will never operate, e.g. 'Cancelled' status), set to 'Not operating'
- Otherwise set to 'Operating'
We also added a 'Total annual CO2 (million tonnes / annum)' column that sums the 'Annual CO2 (million tonnes / annum)' values of all units of each coal power plant - when the plant is operating - so as not to lose information when removing duplicates to keep only one location per plant. Many plants indeed have several units, and all units of a plant share the same coordinates
CO2_emissions_coal_plant = pd.read_csv(r"/content/drive/My Drive/Data For Good/Inputs/Global coal plant tracker/January 2020 Global Coal Plant Tracker.csv", sep=";", encoding='latin1')
CO2_emissions_coal_plant.head()
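The pre-computed 'Total annual CO2 (million tonnes / annum)' column could in principle be reproduced with a groupby over coordinates (a sketch that ignores the operating-status filter applied in the original file):
# Sum unit-level emissions per plant location (one plant = one coordinate pair)
total_per_plant = (CO2_emissions_coal_plant
                   .groupby(['Latitude', 'Longitude'])['Annual CO2 (million tonnes / annum)']
                   .transform('sum'))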
Removal of useless columns
CO2_emissions_coal_plant = CO2_emissions_coal_plant.drop(['Tracker ID', 'TrackerLOC', 'Wiki page', 'Subnational unit (province, state)','Unit', 'Chinese Name','Other names',
'Sponsor', 'Parent','Suspension policy', 'Pre-suspension status','Year','RETIRED', 'Planned Retire', 'Coal source',
'Location', 'Local area (taluk, county)', 'Major area (prefecture, district)', 'Region', 'Accuracy',
'Lifetime CO2 (million tonnes)', 'Permits', 'Coordinates', 'Annual CO2 (million tonnes / annum) for calculation'],axis=1)
CO2_emissions_coal_plant.head()
Drop the duplicates corresponding to the multiple units of each power plant (if any)
CO2_emissions_coal_plant.drop_duplicates(['Latitude','Longitude'], keep = 'first', inplace=True)
Creation of new columns calculating the CO2 emitted for a specific year
for year in ('2016', '2017', '2018', '2019'):
    co2_col = 'Annual CO2 emissions (millions of tonnes) in ' + year
    status_col = 'Status ' + year + ' (done by Data4Good)'
    CO2_emissions_coal_plant[co2_col] = CO2_emissions_coal_plant['Total annual CO2 (million tonnes / annum)']
    # Plants not operating that year get NaN emissions
    CO2_emissions_coal_plant.loc[CO2_emissions_coal_plant[status_col] == 'Not operating', co2_col] = np.nan
CO2_emissions_coal_plant.head()
Visualisation of the CO2 emissions data - example of 2016
plt.figure(figsize = (10,10))
plt.scatter(np.arange(0,CO2_emissions_coal_plant.shape[0],1),CO2_emissions_coal_plant['Annual CO2 emissions (millions of tonnes) in 2016'].sort_values(ascending = True), s=1)
plt.xlabel('Coal plant ID')
plt.ylabel('Annual CO2 emissions (millions of tonnes) in 2016')
CO2_emissions_coal_plant.to_csv("/content/drive/My Drive/Data For Good/Outputs/CO2_emissions_coal_plant.csv")
2.3 Cities v1 - few cities/CO2 data available
The source of the data is: https://public.opendatasoft.com/explore/dataset/co2-emissions-cities/export/?sort=study_year_wb
It covers the major cities of the world
In this dataset, we have annual tCO2-eq emissions data for Scope 1 and Scope 2. We will only use the Scope 1 data, as it concerns direct emissions occurring within the cities (e.g. direct emissions from fossil fuel burning generated by road traffic), whereas Scope 2 emissions are indirect emissions coming from the use of electricity, heat, steam or cooling (the CO2 is therefore not emitted where the energy is consumed, but where it was produced, which is not the city).
Moreover, some of the data are in CO2-eq, which comprises not only CO2 but also CH4 and N2O for instance. Therefore, for some cities the actual CO2 emissions are lower than the values in this dataset. We will find below that 'There are 44 cities with tCO2 data - and not tCO2-eq data - out of 187 cities in total'
Finally, we do not have a comprehensive split by year. We will therefore assume there are no major changes from 2016 to 2019, which seems reasonable as major cities do not 'evolve' much within a few years
path_cities_v1 = "/content/drive/My Drive/Data For Good/Inputs/Cities/co2-emissions-cities.csv"
cities_v1_preparation = pd.read_csv(path_cities_v1, sep=";")
cities_v1_preparation.head()
Selection of useful columns
cities_v1 = cities_v1_preparation[['City name', 'Country', 'Scope-1 GHG emissions [tCO2 or tCO2-eq]', 'Scope-1 source dataset', 'Scope-1 GHG emissions units', 'Year of emission',
'City location (CDP) [degrees]', 'Population (CDP)', 'Population year (CDP)']]
cities_v1.head()
Split of the column 'City location (CDP) [degrees]' to get latitudes and longitudes
# Create two lists for the loop results
lat = []
lon = []
# Split each 'City location (CDP) [degrees]' value into latitude and longitude
for row in cities_v1['City location (CDP) [degrees]']:
    try:
        parts = row.split(',')
        lat_value, lon_value = parts[0], parts[1]
    except (AttributeError, IndexError):
        # Missing (NaN) or malformed location
        lat_value, lon_value = np.NaN, np.NaN
    # Append once per row so the two lists stay aligned
    lat.append(lat_value)
    lon.append(lon_value)
# Create two new columns from lat and lon
cities_v1['latitude'] = lat
cities_v1['longitude'] = lon
# Convert into numeric values
cities_v1[['latitude', 'longitude']] = cities_v1[['latitude', 'longitude']].apply(pd.to_numeric)
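An equivalent vectorized split (a sketch; pandas' str accessor propagates NaN by itself):
# Split at the comma and coerce both halves to numeric values
latlon = cities_v1['City location (CDP) [degrees]'].str.split(',', expand=True)
cities_v1['latitude'] = pd.to_numeric(latlon[0], errors='coerce')
cities_v1['longitude'] = pd.to_numeric(latlon[1], errors='coerce')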
cities_v1.head()
print("They are", cities_v1.shape[0], "cities in the dataset")
Removal of the cities whose latitude and longitude are NaN
cities_v1.dropna(subset = ['latitude', 'longitude'], inplace=True)
print("They are", cities_v1.shape[0], "cities after having removed cities without coordinate")
count = cities_v1.loc[cities_v1['Scope-1 GHG emissions units'] == 'tCO2', 'Scope-1 GHG emissions units'].count()
print("They are", count, "cities with tCO2 data - and not tCO2-eq data - out of", cities_v1.shape[0], "cities with coordinates")
draw_map_centrale(cities_v1, 'Scope-1 GHG emissions [tCO2 or tCO2-eq]', 'Scope 1 CO2e cities emissions')
cities_v1.to_csv("/content/drive/My Drive/Data For Good/Outputs/cities_v1.csv")
2.4 Cities v2 - lots of cities/no CO2 data
The source of the data is https://simplemaps.com/data/world-cities ("We're proud to offer a simple, accurate and up-to-date database of the world's cities and towns. We've built it from the ground up using authoritative sources such as the NGIA, US Geological Survey, US Census Bureau, and NASA")
"It was last refreshed in December of 2019". We will assume no major changes from 2016 to 2019 when later creating datasets for these years (as for cities_v1)
Here we have around 15 000 cities with population data, but no CO2 data, unlike cities_v1
path_cities_v2 = "/content/drive/My Drive/Data For Good/Inputs/Cities/worldcities.csv"
cities_v2_preparation = pd.read_csv(path_cities_v2, sep=";", encoding = 'latin1')
cities_v2_preparation['lat'] = cities_v2_preparation['lat'].apply(lambda x: x.replace(',','.')).astype('float')
cities_v2_preparation['lng'] = cities_v2_preparation['lng'].apply(lambda x: x.replace(',','.')).astype('float')
cities_v2_preparation.head()
cities_v2 = pd.DataFrame()
cities_v2['city'] = cities_v2_preparation['city']
cities_v2['country'] = cities_v2_preparation['country']
cities_v2['latitude'] = cities_v2_preparation['lat']
cities_v2['longitude'] = cities_v2_preparation['lng']
cities_v2['pop'] = cities_v2_preparation['population']
cities_v2.head()
draw_map_centrale(cities_v2, 'pop', 'Population of main cities')
cities_v2.to_csv("/content/drive/My Drive/Data For Good/Outputs/cities_v2.csv")
2.6 ETS data
Data representing the declared CO2-equivalent emissions of all installations subject to the European carbon market (ETS). These data were provided by the think tank Sandbag (https://sandbag.be/) for use in this project.
The location of the infrastructure was found using OpenStreetMap (more details in the script: https://github.com/dataforgoodfr/batch7_satellite_ges/blob/master/notebooks/Process%20Sandbag%20data%20-%20Find%20infrastructure%20location.ipynb)
path_sandbag = "/content/drive/My Drive/Data For Good/Inputs/SANDBAG data/sandbag_data_infrastructure_location.csv"
sandbag_data = pd.read_csv(path_sandbag, sep=";", encoding='latin1')
sandbag_data.head()
sandbag_data.shape
# Remove installations from the aviation sector because their emissions are not declared at the emission site
sandbag_data = sandbag_data[sandbag_data['SandbagSector']!='Aviation']
# Replace the decimal comma in latitudes and longitudes
sandbag_data['latitude'] = sandbag_data['latitude'].astype('str').apply(lambda x: x.replace(',','.')).astype('float')
sandbag_data['longitude'] = sandbag_data['longitude'].astype('str').apply(lambda x: x.replace(',','.')).astype('float')
sandbag_data.shape
# Same as draw_map_centrale, but hard-wired to the 'tCO2e' column
def draw_map_peaks(data, titre, lon_min=-180, lon_max=180, lat_min=-90, lat_max=90, size_point=1, frontier=False):
    plt.figure(figsize=(15, 10), edgecolor='w')
    m = Basemap(llcrnrlat=lat_min, urcrnrlat=lat_max, llcrnrlon=lon_min, urcrnrlon=lon_max)
    m.shadedrelief()
    parallels = np.arange(-80., 81, 10.)
    m.drawparallels(parallels, labels=[False, True, True, False])
    meridians = np.arange(10., 351., 20.)
    m.drawmeridians(meridians, labels=[True, False, False, True])
    normal = matplotlib.colors.LogNorm(vmax=data['tCO2e'].max())
    m.scatter(data['longitude'], data['latitude'], c=data['tCO2e'], cmap=plt.cm.jet, s=size_point, norm=normal)
    plt.colorbar()
    plt.title(titre)
    if frontier:
        m.drawcountries(linewidth=0.5)
        m.drawcoastlines(linewidth=0.7)
    plt.show()
draw_map_peaks(sandbag_data, 'CO2 emissions of European infrastructure (Sandbag 2019)')
draw_map_peaks(sandbag_data, 'CO2 emissions of European infrastructure (Sandbag 2019)', lon_min=-30, lon_max=30, lat_min=30, lat_max=60)
2.6 Merge of the CO2 emissions peaks data
Note that in the 'CO2/CO2e emissions source' column we kept the coal / gas / oil segmentation for power plants from Global Energy Monitor and the World Resources Institute. In Sandbag, the corresponding category is 'power and heat', without the granularity of the type of fossil fuel burnt
Data from power plants (2.1)
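The four yearly blocks below differ only in the emissions column they read, so a small helper (a sketch) could build each one:
def centrale_for_merge(co2_column):
    # Build one yearly merge table from a tCO2 column of CO2_emissions_centrale
    out = pd.DataFrame()
    out['latitude'] = CO2_emissions_centrale['latitude']
    out['longitude'] = CO2_emissions_centrale['longitude']
    out['Data source'] = 'World Resources Institute and ADEME for the emissions factors'
    out['CO2/CO2e emissions source'] = CO2_emissions_centrale['primary_fuel']
    out['CO2/CO2e emissions (in tonnes per year)'] = CO2_emissions_centrale[co2_column]
    out['CO2 or CO2e'] = 'CO2'
    return out.dropna(subset=['CO2/CO2e emissions (in tonnes per year)'])

# e.g. centrale_all_for_merge_2016 = centrale_for_merge('tCO2_emitted_in_2016')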
2016
centrale_all_for_merge_2016 = pd.DataFrame()
centrale_all_for_merge_2016['latitude'] = CO2_emissions_centrale['latitude']
centrale_all_for_merge_2016['longitude'] = CO2_emissions_centrale['longitude']
centrale_all_for_merge_2016['Data source'] = 'World Resources Institute and ADEME for the emissions factors'
centrale_all_for_merge_2016['CO2/CO2e emissions source'] = CO2_emissions_centrale['primary_fuel']
centrale_all_for_merge_2016['CO2/CO2e emissions (in tonnes per year)'] = CO2_emissions_centrale['tCO2_emitted_in_2016']
centrale_all_for_merge_2016['CO2 or CO2e'] = 'CO2'
# Drop plants whose CO2 emissions are NaN
centrale_all_for_merge_2016.dropna(subset = ['CO2/CO2e emissions (in tonnes per year)'], inplace=True)
2017 (the CO2 emissions data selected here is 'with estimated data')
centrale_all_for_merge_2017 = pd.DataFrame()
centrale_all_for_merge_2017['latitude'] = CO2_emissions_centrale['latitude']
centrale_all_for_merge_2017['longitude'] = CO2_emissions_centrale['longitude']
centrale_all_for_merge_2017['Data source'] = 'World Resources Institute and ADEME for the emissions factors'
centrale_all_for_merge_2017['CO2/CO2e emissions source'] = CO2_emissions_centrale['primary_fuel']
centrale_all_for_merge_2017['CO2/CO2e emissions (in tonnes per year)'] = CO2_emissions_centrale['tCO2_emitted_in_2017_with estimated_data']
centrale_all_for_merge_2017['CO2 or CO2e'] = 'CO2'
# Drop plants whose CO2 emissions are NaN
centrale_all_for_merge_2017.dropna(subset = ['CO2/CO2e emissions (in tonnes per year)'], inplace=True)
2018
centrale_all_for_merge_2018 = pd.DataFrame()
centrale_all_for_merge_2018['latitude'] = CO2_emissions_centrale['latitude']
centrale_all_for_merge_2018['longitude'] = CO2_emissions_centrale['longitude']
centrale_all_for_merge_2018['Data source'] = 'World Resources Institute and ADEME for the emissions factors'
centrale_all_for_merge_2018['CO2/CO2e emissions source'] = CO2_emissions_centrale['primary_fuel']
centrale_all_for_merge_2018['CO2/CO2e emissions (in tonnes per year)'] = CO2_emissions_centrale['tCO2_emitted_in_2018_with estimated_data_extrapolated']
centrale_all_for_merge_2018['CO2 or CO2e'] = 'CO2'
# Drop plants whose CO2 emissions are NaN
centrale_all_for_merge_2018.dropna(subset = ['CO2/CO2e emissions (in tonnes per year)'], inplace=True)
2019
centrale_all_for_merge_2019 = pd.DataFrame()
centrale_all_for_merge_2019['latitude'] = CO2_emissions_centrale['latitude']
centrale_all_for_merge_2019['longitude'] = CO2_emissions_centrale['longitude']
centrale_all_for_merge_2019['Data source'] = 'World Resources Institute and ADEME for the emissions factors'
centrale_all_for_merge_2019['CO2/CO2e emissions source'] = CO2_emissions_centrale['primary_fuel']
centrale_all_for_merge_2019['CO2/CO2e emissions (in tonnes per year)'] = CO2_emissions_centrale['tCO2_emitted_in_2019_with estimated_data_extrapolated']
centrale_all_for_merge_2019['CO2 or CO2e'] = 'CO2'
# Drop plants whose CO2 emissions are NaN
centrale_all_for_merge_2019.dropna(subset = ['CO2/CO2e emissions (in tonnes per year)'], inplace=True)
centrale_all_for_merge_2019.head()
Data from coal power plants (2.2)
2016
coal_plant_for_merge_2016 = pd.DataFrame()
coal_plant_for_merge_2016['latitude'] = CO2_emissions_coal_plant['Latitude']
coal_plant_for_merge_2016['longitude'] = CO2_emissions_coal_plant['Longitude']
coal_plant_for_merge_2016['Data source'] = 'Global Energy Monitor'
coal_plant_for_merge_2016['CO2/CO2e emissions source'] = 'Coal power plant'
coal_plant_for_merge_2016['CO2/CO2e emissions (in tonnes per year)'] = CO2_emissions_coal_plant['Annual CO2 emissions (millions of tonnes) in 2016']*1000000
coal_plant_for_merge_2016['CO2 or CO2e'] = 'CO2'
# Drop plants whose CO2 emissions are NaN
coal_plant_for_merge_2016.dropna(subset = ['CO2/CO2e emissions (in tonnes per year)'], inplace=True)
2017
coal_plant_for_merge_2017 = pd.DataFrame()
coal_plant_for_merge_2017['latitude'] = CO2_emissions_coal_plant['Latitude']
coal_plant_for_merge_2017['longitude'] = CO2_emissions_coal_plant['Longitude']
coal_plant_for_merge_2017['Data source'] = 'Global Energy Monitor'
coal_plant_for_merge_2017['CO2/CO2e emissions source'] = 'Coal power plant'
coal_plant_for_merge_2017['CO2/CO2e emissions (in tonnes per year)'] = CO2_emissions_coal_plant['Annual CO2 emissions (millions of tonnes) in 2017']*1000000
coal_plant_for_merge_2017['CO2 or CO2e'] = 'CO2'
# Drop plants whose CO2 emissions are NaN
coal_plant_for_merge_2017.dropna(subset = ['CO2/CO2e emissions (in tonnes per year)'], inplace=True)
2018
coal_plant_for_merge_2018 = pd.DataFrame()
coal_plant_for_merge_2018['latitude'] = CO2_emissions_coal_plant['Latitude']
coal_plant_for_merge_2018['longitude'] = CO2_emissions_coal_plant['Longitude']
coal_plant_for_merge_2018['Data source'] = 'Global Energy Monitor'
coal_plant_for_merge_2018['CO2/CO2e emissions source'] = 'Coal power plant'
coal_plant_for_merge_2018['CO2/CO2e emissions (in tonnes per year)'] = CO2_emissions_coal_plant['Annual CO2 emissions (millions of tonnes) in 2018']*1000000
coal_plant_for_merge_2018['CO2 or CO2e'] = 'CO2'
# Drop plants whose CO2 emissions are NaN
coal_plant_for_merge_2018.dropna(subset = ['CO2/CO2e emissions (in tonnes per year)'], inplace=True)
2019
coal_plant_for_merge_2019 = pd.DataFrame()
coal_plant_for_merge_2019['latitude'] = CO2_emissions_coal_plant['Latitude']
coal_plant_for_merge_2019['longitude'] = CO2_emissions_coal_plant['Longitude']
coal_plant_for_merge_2019['Data source'] = 'Global Energy Monitor'
coal_plant_for_merge_2019['CO2/CO2e emissions source'] = 'Coal power plant'
coal_plant_for_merge_2019['CO2/CO2e emissions (in tonnes per year)'] = CO2_emissions_coal_plant['Annual CO2 emissions (millions of tonnes) in 2019']*1000000
coal_plant_for_merge_2019['CO2 or CO2e'] = 'CO2'
# Drop plants whose CO2 emissions are NaN
coal_plant_for_merge_2019.dropna(subset = ['CO2/CO2e emissions (in tonnes per year)'], inplace=True)
coal_plant_for_merge_2019.head()
Data from cities (2.3)
cities_v1_for_merge = pd.DataFrame()
cities_v1_for_merge['city'] = cities_v1['City name']
cities_v1_for_merge['latitude'] = cities_v1['latitude']
cities_v1_for_merge['longitude'] = cities_v1['longitude']
cities_v1_for_merge['Data source'] = 'Opendatasoft'
cities_v1_for_merge['CO2/CO2e emissions source'] = 'City'
cities_v1_for_merge['CO2/CO2e emissions (in tonnes per year)'] = cities_v1['Scope-1 GHG emissions [tCO2 or tCO2-eq]']
cities_v1_for_merge['CO2 or CO2e'] = 'CO2e'
cities_v1_for_merge.loc[cities_v1['Scope-1 GHG emissions units'] == 'tCO2', 'CO2 or CO2e'] = 'CO2'
cities_v1_for_merge.head()
Data from cities (2.4)
cities_v2_for_merge = pd.DataFrame()
# Keep only cities with a significant population (> 500 000 inhabitants here, giving around 1 000 cities)
cities_v2_copy = cities_v2[cities_v2['pop'] > 500000]
cities_v2_for_merge['city'] = cities_v2_copy['city']
cities_v2_for_merge['latitude'] = cities_v2_copy['latitude']
cities_v2_for_merge['longitude'] = cities_v2_copy['longitude']
cities_v2_for_merge['Data source'] = 'Simplemaps'
cities_v2_for_merge['CO2/CO2e emissions source'] = 'City'
cities_v2_for_merge['CO2/CO2e emissions (in tonnes per year)'] = np.nan
cities_v2_for_merge['CO2 or CO2e'] = np.nan
cities_v2_for_merge.head()
Data from Sandbag (2.5)
sandbag_for_merge_preparation = sandbag_data.copy()
sandbag_for_merge_preparation.head()
# Drop all rows whose SandbagSector has not been categorized
sandbag_for_merge_preparation.dropna(subset = ['SandbagSector'], inplace=True)
sandbag_for_merge_preparation['SandbagSector'].unique()
sandbag_for_merge = pd.DataFrame()
sandbag_for_merge['latitude'] = sandbag_for_merge_preparation['latitude']
sandbag_for_merge['longitude'] = sandbag_for_merge_preparation['longitude']
sandbag_for_merge['Data source'] = 'Sandbag'
sandbag_for_merge['CO2/CO2e emissions source'] = sandbag_for_merge_preparation['SandbagSector']
sandbag_for_merge['CO2/CO2e emissions (in tonnes per year)'] = sandbag_for_merge_preparation['tCO2e']
sandbag_for_merge['CO2 or CO2e'] = 'CO2e'
Removal of the duplicates
- For cities: we keep cities_v1 data over cities_v2 in priority, as it provides CO2 data (reminder: most of these data are not in CO2 but in CO2-eq, i.e. CO2e). Moreover, cities_v2 has been filtered to cities with more than 500 000 inhabitants
- For plants: this concerns the coal power plants present in both datasets. When there are duplicates, the 'Global power plant database' source is selected in priority over 'Global Energy Monitor' and over Sandbag. Sandbag is selected last in order to keep a large number of data points from different sources, since Sandbag also provides a lot of data for cement, glass, etc.
- In case some plants are located at the headquarters of their company - i.e. at the city location - the 'cities_merged' data are selected in priority when there are duplicates, to mitigate this issue
Concerning the 'CO2/CO2e emissions (in tonnes per year)' column, we are mixing data from different sources that use different methodologies to calculate CO2 or CO2e. This column has therefore been kept for a better understanding of the CO2/CO2e emission peaks, but should be used with caution
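Note that the deduplication below matches on exact floating-point coordinates, so the same site reported with slightly different coordinates by two sources will not be caught. A sketch of a coarser match (assuming rounding to ~0.01° is acceptable):
def drop_near_duplicates(df, decimals=2):
    # Deduplicate on rounded coordinates to catch near-identical locations
    keys = df[['latitude', 'longitude']].round(decimals)
    return df.loc[~keys.duplicated(keep='first')]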
Merge of the cities datasets
cities_merged = pd.concat([cities_v1_for_merge, cities_v2_for_merge])
cities_merged.drop_duplicates(['city'], keep = 'first', inplace=True)
cities_merged = cities_merged.drop(columns = 'city')
nb_cities_duplicates = (cities_v1_for_merge.shape[0] + cities_v2_for_merge.shape[0]) - cities_merged.shape[0]
print("There are", nb_cities_duplicates, "duplicates")
cities_merged.head()
Merge of all peaks datasets
2016
CO2_emissions_peaks_merged_2016 = pd.concat([cities_merged, centrale_all_for_merge_2016, coal_plant_for_merge_2016, sandbag_for_merge])
CO2_emissions_peaks_merged_2016.drop_duplicates(['latitude','longitude'], keep = 'first', inplace=True)
nb_power_plant_duplicates = (centrale_all_for_merge_2016.shape[0] + coal_plant_for_merge_2016.shape[0] + cities_merged.shape[0] + sandbag_for_merge.shape[0]) - CO2_emissions_peaks_merged_2016.shape[0]
print("There are", nb_power_plant_duplicates, "duplicates in 2016")
2017
CO2_emissions_peaks_merged_2017 = pd.concat([cities_merged, centrale_all_for_merge_2017, coal_plant_for_merge_2017, sandbag_for_merge])
CO2_emissions_peaks_merged_2017.drop_duplicates(['latitude','longitude'], keep = 'first', inplace=True)
nb_power_plant_duplicates = (centrale_all_for_merge_2017.shape[0] + coal_plant_for_merge_2017.shape[0] + cities_merged.shape[0] + sandbag_for_merge.shape[0]) - CO2_emissions_peaks_merged_2017.shape[0]
print("There are", nb_power_plant_duplicates, "duplicates in 2017")
2018
CO2_emissions_peaks_merged_2018 = pd.concat([cities_merged, centrale_all_for_merge_2018, coal_plant_for_merge_2018, sandbag_for_merge])
CO2_emissions_peaks_merged_2018.drop_duplicates(['latitude','longitude'], keep = 'first', inplace=True)
nb_power_plant_duplicates = (centrale_all_for_merge_2018.shape[0] + coal_plant_for_merge_2018.shape[0] + cities_merged.shape[0] + sandbag_for_merge.shape[0]) - CO2_emissions_peaks_merged_2018.shape[0]
print("There are", nb_power_plant_duplicates, "duplicates in 2018")
2019
CO2_emissions_peaks_merged_2019 = pd.concat([cities_merged, centrale_all_for_merge_2019, coal_plant_for_merge_2019, sandbag_for_merge])
CO2_emissions_peaks_merged_2019.drop_duplicates(['latitude','longitude'], keep = 'first', inplace=True)
nb_power_plant_duplicates = (centrale_all_for_merge_2019.shape[0] + coal_plant_for_merge_2019.shape[0] + cities_merged.shape[0] + sandbag_for_merge.shape[0]) - CO2_emissions_peaks_merged_2019.shape[0]
print("There are", nb_power_plant_duplicates, "duplicates in 2019")
CO2_emissions_peaks_merged_2019
CO2_emissions_peaks_merged_2016.to_csv("/content/drive/My Drive/Data For Good/Outputs/CO2_emissions_peaks_merged_2016.csv")
CO2_emissions_peaks_merged_2017.to_csv("/content/drive/My Drive/Data For Good/Outputs/CO2_emissions_peaks_merged_2017.csv")
CO2_emissions_peaks_merged_2018.to_csv("/content/drive/My Drive/Data For Good/Outputs/CO2_emissions_peaks_merged_2018.csv")
CO2_emissions_peaks_merged_2019.to_csv("/content/drive/My Drive/Data For Good/Outputs/CO2_emissions_peaks_merged_2019.csv")