# Colab environment setup: mount Google Drive (where the dataset lives) and
# install the geo/plotting dependencies (netCDF4, GEOS libs, Basemap, pyproj).
from google.colab import drive
drive.mount('/content/drive')
# NOTE: the `!` lines below are IPython shell magics — this file only runs in
# a notebook environment such as Google Colab, not as a plain Python script.
!pip install netCDF4
!apt-get install libgeos-3.5.0
!apt-get install libgeos-dev
!pip install https://github.com/matplotlib/basemap/archive/master.zip
!pip install pyproj==1.9.6
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load one month (December 2019) of OCO-2 soundings; the CSV is
# semicolon-separated. NOTE(review): the folder name contains mojibake
# ("Ministè€re") — it must match the actual Drive path to load.
data = pd.read_csv("/content/drive/My Drive/Data For Good - S7: Ministè€re & O-CO2/oco2_2019_12.csv", sep=";")
# Quick sanity checks on the raw frame.
data.head()
data.describe()
# sounding_id encodes the UTC timestamp as YYYYMMDDHHMMSSff, so plain integer
# comparisons split the month into calendar weeks.
end_week_1 = 2019120800000000
end_week_2 = 2019121500000000
end_week_3 = 2019122200000000
end_week_4 = 2019122900000000
# Build each week with a single combined mask. The original filtered the
# already-subset frame with a mask computed on the full `data` frame, which
# only works through index re-alignment and triggers pandas'
# "Boolean Series key will be reindexed" warning.
sid = data['sounding_id']
data_1 = data[sid < end_week_1]
data_2 = data[(sid > end_week_1) & (sid < end_week_2)]
data_3 = data[(sid > end_week_2) & (sid < end_week_3)]
data_4 = data[(sid > end_week_3) & (sid < end_week_4)]
print("Number of observations per week:")
print("Week 1: ", data_1.shape[0])
print("Week 2: ", data_2.shape[0])
print("Week 3: ", data_3.shape[0])
print("Week 4: ", data_4.shape[0])
from datetime import datetime
def to_date(a):
    """Convert an OCO-2 sounding_id (YYYYMMDDHHMMSSff integer) to a datetime."""
    # The id itself encodes the UTC acquisition timestamp, down to
    # hundredths of a second (parsed by %f).
    sounding_str = str(a)
    return datetime.strptime(sounding_str, '%Y%m%d%H%M%S%f')
# Derive a real datetime column from the timestamp encoded in sounding_id.
data['date'] = data['sounding_id'].apply(to_date)
data.head()
draw_map: Function to draw the map and the observations (relief style)
Parameters:
- (DataFrame) data: the dataset to map.
- (int) lon_min : the minimum longitude. default: -180
- (int) lon_max: the maximum longitude. default: 180
- (int) lat_min: the minimum latitude. default: -90
- (int) lat_max: the maximum latitude. default: 90
- (int) size_point: size of the point to plot (useful if we zoom in). default: 1
- (Bool) frontier: whether or not to draw the country borders. default: False
from mpl_toolkits.basemap import Basemap
def draw_map(data, lon_min=-180, lon_max=180, lat_min=-90, lat_max=90, size_point=1, frontier=False):
    """Draw the observations on a shaded-relief world map.

    data: DataFrame with 'longitude', 'latitude' and 'xco2' columns.
    lon_min/lon_max/lat_min/lat_max: bounds of the displayed region.
    size_point: scatter marker size (useful when zooming in).
    frontier: draw country borders when True.
    """
    plt.figure(figsize=(15, 10), edgecolor='w')
    world = Basemap(llcrnrlat=lat_min, urcrnrlat=lat_max,
                    llcrnrlon=lon_min, urcrnrlon=lon_max)
    world.shadedrelief()
    if frontier:
        world.drawcountries(linewidth=1)
    # Graticule: parallels every 10 degrees, meridians every 20 degrees.
    world.drawparallels(np.arange(-80., 81, 10.), labels=[False, True, True, False])
    world.drawmeridians(np.arange(10., 351., 20.), labels=[True, False, False, True])
    # Color each sounding by its CO2 concentration.
    world.scatter(data['longitude'], data['latitude'],
                  c=data['xco2'], cmap=plt.cm.jet, s=size_point)
    plt.show()
Visualisations of:
- One month of Data for the whole world
- 1st week of Data for the whole world
- 2nd week of Data for the whole world
# Whole-world views: the full month, then week 1 and week 2 separately.
draw_map(data)
draw_map(data_1)
draw_map(data_2)
get_data_zone: retrieve the data for one specific zone.
Parameters:
- (DataFrame) data: the origin dataset.
- (int) lon_min : the minimum longitude. default: -180
- (int) lon_max: the maximum longitude. default: 180
- (int) lat_min: the minimum latitude. default: -90
- (int) lat_max: the maximum latitude. default: 90
def get_data_zone(data, lon_min=-180, lon_max=180, lat_min=-90, lat_max=90):
    """Return the rows of *data* strictly inside the given lon/lat box.

    Bounds are exclusive on both sides, matching the original cell-by-cell
    filtering behaviour.
    """
    inside = (
        (data['longitude'] > lon_min) & (data['longitude'] < lon_max)
        & (data['latitude'] > lat_min) & (data['latitude'] < lat_max)
    )
    return data[inside]
# Observations in the 10°E–20°E longitude band, all latitudes.
data_10_20 = get_data_zone(data, lon_min=10, lon_max=20, lat_min=-90, lat_max=90)
draw_map(data_10_20)
It can be interesting to study the points below latitude -80.
These points appear to be anomalies, since the satellite's measurement technique infers CO2 concentrations through spectroscopy (and is therefore dependent on light). Since there is little sunlight in this region, it is surprising to observe measurements there.
# All observations south of 75°S.
# NOTE: the variable name keeps the original "artctica" spelling used
# consistently in these cells.
data_artctica = get_data_zone(data, lon_min=-180, lon_max=180, lat_min=-90, lat_max=-75)
print("Number of observations in the -75° zone: ", data_artctica.shape[0])
draw_map(data_artctica, lat_max=-75)
Let's zoom in a particular region:
# Zoom on the 70°W–50°W sector of the same polar band; larger markers for
# readability at this scale.
data_artctica = get_data_zone(data, lon_min=-70, lon_max=-50, lat_min=-90, lat_max=-75)
print("Number of observations in the -75°:-70°-50° zone: ", data_artctica.shape[0])
draw_map(data_artctica, lat_max=-75, lon_min=-70, lon_max=-50, size_point=5)
The same thing can be done over a specific country. Here we have the month data above India:
# Monthly data over India, with country borders drawn.
data_india = get_data_zone(data, lon_min=65, lon_max=100, lat_min=5, lat_max=40)
draw_map(data_india, lon_min=65, lon_max=100, lat_min=5, lat_max=40, frontier=True)
We can now explore the CO2 concentration values to get some insights on the distribution and spot some outsiders:
import matplotlib.pyplot as plt
# Small positional sample (rows 10..1999) to keep the date/xco2 scatter readable.
data_test = data[10:2000]
plt.figure(figsize=(15, 10), edgecolor='w')
plt.scatter(data_test['date'], data_test['xco2'])
We have 2.53M entries, ranging from 288 to 423, with a mean of 409.
# NOTE(review): this describes the 'date' column while the surrounding text
# discusses xco2 statistics — possibly meant data['xco2'].describe(); confirm.
data['date'].describe()
import seaborn as sns
# Histogram/KDE of the CO2 concentration values.
plt.figure(figsize=(15, 10), edgecolor='w')
sns.distplot(data['xco2'], bins=100)
We can try to spot (and remove?) the outlier extremes that may be considered anomalies.
We can see below an outlier above 420 and surprisingly low observations below 400:
# Box plot to spot extreme xco2 values.
plt.figure(figsize=(15, 10), edgecolor='w')
sns.boxplot(data['xco2'])
We have:
- 01 obs above 420: This seems to be an anomaly.
- 13 obs below 400: These may not all be anomalies since some of them make sense considering a gaussian distribution.
# List the extreme observations on both tails of the distribution.
print("Values above 420: \n", data[data['xco2'] > 420]['xco2'])
print("Values below 400: \n", data[data['xco2'] < 400]['xco2'])
# Extract three consecutive orbits for side-by-side comparison.
#Here we consider the 28904th orbit (6555 observations):
data_28904 = data[data['orbit'] == 28904]
#Here we consider the 28905th orbit:
data_28905 = data[data['orbit'] == 28905]
#Here we consider the 28906th orbit:
data_28906 = data[data['orbit'] == 28906]
# Three consecutive orbits side by side on a shared y axis.
plt.figure(figsize=(30, 7), edgecolor='w')
# BUG FIX: the original created the first subplot with sharey=ax1, i.e. it
# referenced ax1 before it was ever assigned — a guaranteed NameError.
ax1 = plt.subplot(131)
plt.scatter(data_28904['date'], data_28904['xco2'])
ax2 = plt.subplot(132, sharey=ax1)
plt.scatter(data_28905['date'], data_28905['xco2'])
ax3 = plt.subplot(133, sharey=ax1)
plt.scatter(data_28906['date'], data_28906['xco2'])
def get_orbit_data(data, orbit=28905):
    """Return the subset of *data* belonging to a single orbit number."""
    on_orbit = data['orbit'] == orbit
    return data[on_orbit]
get_orbit_data(data, 28960)
There seems to be:
- Some anomalies (observation high alone)
- Some noise (for the same line of observation)
It would be interesting to visualise these as a line. We could regroup temporal values to their mean. To do so, we can group data points in time.
CAUTION:
This "noise reduction" is too basic and needs to be improved !
def group_by_minute(data):
    """Average the observations minute by minute.

    Returns a new DataFrame indexed by the minute-truncated 'date', with the
    per-minute mean of every other column.

    Fixes vs the original:
    - works on a copy, so the caller's frame is no longer mutated in place;
    - truncates with Series.dt.floor('min') instead of
      astype('datetime64[m]'), which is not a supported datetime resolution
      on modern pandas.
    """
    grouped = data.copy()
    grouped['date'] = grouped['date'].dt.floor('min')
    return grouped.groupby(['date']).mean()
# Minute-averaged version of orbit 28905.
data_28905_grouped = group_by_minute(data_28905)
data_28905_grouped.head()
# Left: minute-averaged series; right: raw series, on shared axes.
plt.figure(figsize=(20, 7), edgecolor='w')
ax1 = plt.subplot(121)
# BUG FIX: the original plotted `grouped`, which is only defined much later
# in the file; the frame built just above is data_28905_grouped.
plt.scatter(data_28905_grouped.index, y = data_28905_grouped['xco2'])
ax2 = plt.subplot(122, sharex=ax1, sharey=ax1)
plt.scatter(data_28905['date'], data_28905['xco2'])
# Build raw and minute-averaged versions of three consecutive orbits.
data_28950 = get_orbit_data(data, 28950)
data_28950_grouped = group_by_minute(data_28950)
data_28951 = get_orbit_data(data, 28951)
data_28951_grouped = group_by_minute(data_28951)
data_28952 = get_orbit_data(data, 28952)
data_28952_grouped = group_by_minute(data_28952)
# 2x3 grid: top row shows the minute-averaged series, bottom row the raw
# series for the same orbit; each column shares axes with its top plot.
plt.figure(figsize=(30, 10), edgecolor='w')
ax1 = plt.subplot(231)
plt.scatter(data_28950_grouped.index, data_28950_grouped['xco2'])
ax2 = plt.subplot(234, sharex=ax1, sharey=ax1)
plt.scatter(data_28950['date'], data_28950['xco2'])
ax3 = plt.subplot(232, sharey=ax1)
plt.scatter(data_28951_grouped.index, data_28951_grouped['xco2'])
ax4 = plt.subplot(235, sharex=ax3, sharey=ax3)
plt.scatter(data_28951['date'], data_28951['xco2'])
ax5 = plt.subplot(233, sharey=ax1)
plt.scatter(data_28952_grouped.index, data_28952_grouped['xco2'])
ax6 = plt.subplot(236, sharex=ax5, sharey=ax5)
plt.scatter(data_28952['date'], data_28952['xco2'])
Apply the wind (reverse)
The observation of the CO2 concentration isn't done immediately after the emission. Since the satellite has a 16-day "refreshing" period for a geographical point, the CO2 could have been emitted quite far from the place where it was measured.
One reason is that the wind moves the air, and thus the CO2 particles. Therefore, it can be interesting to understand where the CO2 particles could have been before being moved by the wind.
The wind data are available from the NASA and already compiled in the original OCO2 Dataset.
from scipy.signal import find_peaks
# Detect sharp peaks within the first 500 soundings.
peaks, _ = find_peaks(data['xco2'][:500], threshold=1)
plt.figure(figsize=(15, 10), edgecolor='w')
plt.scatter(data['date'], data['xco2'])
# BUG FIX: `peaks` holds positional indices; the original scattered those raw
# integers onto a datetime x-axis, so the markers never lined up with the
# data. Map the peak positions back to their dates before plotting.
plt.scatter(data['date'][:500][peaks], data['xco2'][:500][peaks], linewidth=0.3, s=50, c='r')
#print(data['xco2'][peaks-1], " -> ", data['xco2'][peaks])
If we take into consideration the fact that two close peaks are in fact one only phenomenon (due to the noise), we would rather try to find global peaks.
The distance here is chosen empirically to visually match the observations.
# BUG FIX: peaks were found on the full ~2.53M-row series but then looked up
# in the 500k-row slice, so any peak past index 500000 raised a KeyError.
# Restrict peak finding to the plotted window instead.
window = data['xco2'][:500000]
peaks, _ = find_peaks(window, threshold=1, distance=3000)
plt.figure(figsize=(25, 10), edgecolor='w')
plt.plot(window)
plt.scatter(peaks, window[peaks], linewidth=0.3, s=50, c='r')
# Same fix as above, with a stricter threshold and larger minimum distance:
# find the peaks on the same window that is plotted, so every returned index
# is valid in the slice.
xco2_window = data['xco2'][:500000]
peaks, _ = find_peaks(xco2_window, threshold=3, distance=5000)
plt.figure(figsize=(25, 10), edgecolor='w')
plt.plot(xco2_window)
plt.scatter(peaks, xco2_window[peaks], linewidth=0.3, s=50, c='r')
#m.scatter(data_1['longitude'], data_1['latitude'], c=data_1['xco2'], cmap=plt.cm.jet, s=1)
# Full-month world map colored by CO2 concentration.
lon = data['longitude'][:]
lat = data['latitude'][:]
CO2 = data['xco2'][:]
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
# NOTE(review): the Basemap object is created before plt.figure(); in a
# notebook the drawing still lands in the newest figure, but the usual order
# is figure first — confirm the map renders as intended.
m = Basemap(llcrnrlat=-90, urcrnrlat=90,
llcrnrlon=-180, urcrnrlon=180)
plt.figure(figsize=(15, 10), edgecolor='w')
m.shadedrelief()
m.scatter(lon, lat, s=1, c=CO2, cmap=plt.cm.jet, alpha=0.5)
# BUG FIX: iterrows() yields row *copies*, so the original assignment into
# `row` never touched `data` at all (a silent no-op). Write the parsed dates
# for the first 25 rows through .loc instead.
first25 = data.index[:25]
data.loc[first25, 'date'] = data.loc[first25, 'sounding_id'].map(
    lambda s: datetime.strptime(str(s), '%Y%m%d%H%M%S%f'))
data.head()
# NOTE(review): `c_max_index` is not defined anywhere in this file — this
# cell presumably depended on an earlier (removed) peak-detection experiment
# and will raise a NameError as-is; confirm its source or delete the cell.
plt.plot(data['xco2'])
plt.scatter(c_max_index[0],data['xco2'][c_max_index[0]],linewidth=0.3, s=50, c='r')
# Reducing the datetime precision by the minute:
data_28905['date'] = data_28905['date'].astype('datetime64[m]')
# Same reduction for orbit 28906, done step by step with replace():
data_28906['date'] = data_28906['date'].apply(lambda x: x.replace(microsecond=0))
# BUG FIX: datetime.second is an int and cannot be sliced (`x.second[:1]`
# raised TypeError); zeroing the seconds gives the intended minute precision.
data_28906['date'] = data_28906['date'].apply(lambda x: x.replace(second=0))
grouped = data_28905.groupby(['date']).mean()
grouped.head()