Last active
March 23, 2023 10:12
-
-
Save Gro-Tsen/92774bc42899872acc87c9de197becd0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#### Some statistical analysis of daily mean temperatures in France
#### See: <URL: https://twitter.com/gro_tsen/status/1620488744867598337 >
## Input data: /tmp/temps.csv is expected to contain a list of daily
## average temperatures in France (first column = date (unused),
## second column = temperature in Celsius), over a whole number of
## calendar years. It can be produced from the data source described
## in <URL: https://twitter.com/gro_tsen/status/1620435663844970497 >
## by the following command line:
# perl -ne 'next if /^\#/; die unless /^([0-9]{8})\s+(\-?[0-9]+\.[0-9]+(?:[Ee](?:[+-]?[0-9]+))?)\s*$/; printf "%s,%.4f\n", $1,($2+0)' /data/meteo/climexp/iera5_t2m_daily_eu_France_metropolitan_su.dat > /tmp/temps.csv
### Read data
import csv
# Each row is [date_string, temperature]; the date is kept verbatim (only
# its position in the file is used below), the temperature is parsed.
with open("/tmp/temps.csv", "r") as f:
    data = [[row[0], float(row[1])] for row in csv.reader(f)]
# Truncate to an integer number of years (365.25 days per calendar year;
# the +0.25 makes the rounding land on a year boundary):
datayears = floor(len(data)/365.25)
data = data[:floor(datayears*365.25+0.25)]
### Sample basic plots
# Full record first, then a zoom on the last 365 days.
pts_all = [(j/365.25 + 1950, data[j][1]) for j in range(len(data))]
line(pts_all, thickness=0.1, title="Température quotidienne moyenne en France", axes_labels=(None, "T (°C)")).show(dpi=192)
pts_lastyear = pts_all[-365:]
line(pts_lastyear, thickness=0.5, title="Température quotidienne moyenne en France", axes_labels=(None, "T (°C)")).show(dpi=192)
### Compute expected temperature per day of year and anomalies
smoothing = 6 # Smaller means more smoothing (Gaussian damping width of the yearly harmonics, cf. exp(-(k/smoothing)^2) below)
## Old version (perform Fourier by hand), kept for reference; superseded by
## the FastFourierTransform version below:
# twoIpiN = N(2*I*pi)
# fourier = [sum([data[i][1]*(N(exp(-twoIpiN*i*k/365.25))) for i in range(len(data))])/len(data) for k in range(61)]
# fouriersm = [fourier[k]*N(exp(-(k/smoothing)^2)) for k in range(len(fourier))]
# stdyear = [sum([(2 if k>0 else 1)*real(fouriersm[k]*(N(exp(twoIpiN*i*k/365.25)))) for k in range(len(fourier))]) for i in range(366)]
# g = line([(i+1,stdyear[i]) for i in range(366)], title="Température moyenne lissée en France dans l'année (1950–2022)", axes_labels=(None, "T (°C)"))
# g.show(dpi=192)
# # stdyearderiv = [sum([2*real(twoIpiN*k*fouriersm[k]*(N(exp(twoIpiN*i*k/365.25)))) for k in range(len(fourier))]) for i in range(366)]
# (list_plot([(frac(i/365.2425)*365.2425+1, data[i][1]) for i in range(len(data))], size=1) + line([(i+1,stdyear[i]) for i in range(366)],color="red", title="Température quotidienne moyenne en France dans l'année (1950–2022)", axes_labels=(None, "T (°C)"))).show(dpi=192)
# anomalies = [(data[i][0], data[i][1] - sum([(2 if k>0 else 1)*real(fouriersm[k]*(N(exp(twoIpiN*i*k/365.25)))) for k in range(len(fourier))])) for i in range(len(data))]
## Better version (with FastFourierTransform):
def _yearly_harmonics(values, nyears, smoothing=None):
    # Forward-FFT the series, zero every frequency bin that is not an exact
    # multiple of the number of years (i.e. keep only whole-year harmonics),
    # and, when `smoothing` is given, damp harmonic k by the Gaussian factor
    # exp(-(k/smoothing)^2).  Returns the transform object after the inverse
    # transform, so element [i][0] is the filtered (real) series.
    T = FastFourierTransform(len(values))
    for i in range(len(values)):
        T[i] = (values[i], 0)
    T.forward_transform()
    for i in range(len(values)):
        ii = min(i, len(values)-i)  # symmetric (positive) frequency index
        if ii % nyears != 0:
            T[i] = (0, 0)
        elif smoothing is not None:
            k = ii/nyears
            muf = N(exp(-(k/smoothing)^2))
            T[i] = (T[i][0]*muf, T[i][1]*muf)
    T.inverse_transform()
    return T

temps = [v[1] for v in data]
# Unsmoothed average year (all whole-year harmonics kept):
fft = _yearly_harmonics(temps, datayears)
avgyear = [fft[i][0] for i in range(366)]
# Smoothed average year (harmonics damped by the Gaussian factor):
fftsm = _yearly_harmonics(temps, datayears, smoothing)
stdyear = [fftsm[i][0] for i in range(366)]
# Plot the raw vs. smoothed average year, then derive per-day anomalies.
lastyear = 1950 + datayears - 1
gavg = line([(day+1, t) for day, t in enumerate(avgyear)], title=("Température moyenne en France dans l'année (1950–%d)" % lastyear), axes_labels=(None, "T (°C)"), color="gray", thickness=0.7)
g = line([(day+1, t) for day, t in enumerate(stdyear)], title=("Température moyenne lissée en France dans l'année (1950–%d)" % lastyear), axes_labels=(None, "T (°C)"))
(gavg + g).show(dpi=192)
# Smoothed seasonal expectation aligned with each data row, and the residual:
fullstd = [(row[0], fftsm[j][0]) for j, row in enumerate(data)]
anomalies = [(row[0], row[1] - fftsm[j][0]) for j, row in enumerate(data)]
### Perform long-term regression on anomalies
import scipy.stats
# Linear trend of the anomaly vs. time expressed in years since 1950:
anom_points = [(j/365.25, a[1]) for j, a in enumerate(anomalies)]
regr = scipy.stats.linregress(anom_points)
ganom = list_plot([(t + 1950, y) for t, y in anom_points], size=1, color="gray")
gtrend = plot(regr.slope*(x-1950) + regr.intercept, (x, 1950, 2023), color="red", title="Anomalie brute de température")
(ganom + gtrend).show(dpi=192)
## Different polynomial regressions:
import numpy as np
# Fit the anomaly against time in years since 1950 (degrees 1 and 2).
tyears = [i/365.25 for i in range(len(anomalies))]
avals = [v[1] for v in anomalies]
regrpol1 = np.polyfit(tyears, avals, 1)
regrpol2 = np.polyfit(tyears, avals, 2)
gregrpol1 = plot(regrpol1[0]*(x-1950)+regrpol1[1], (x,1950,2023), color="red")
# BUGFIX: the linear term was previously evaluated at (x-1960); the fit
# variable is years since 1950, so every term must use (x-1950).
gregrpol2 = plot(regrpol2[0]*(x-1950)^2+regrpol2[1]*(x-1950)+regrpol2[2], (x,1950,2023), color="magenta")
### Corrected anomalies:
# Subtract the fitted linear trend from each anomaly (detrending).
bnomalies = [(date, a - (regr.slope*(j/365.25) + regr.intercept))
             for j, (date, a) in enumerate(anomalies)]
bnsorted = sorted(bnomalies, key=lambda v: v[1])
## Standard deviation, skewness and excess kurtosis:
bvals = [v[1] for v in bnomalies]
sigma = scipy.stats.tstd(bvals)
skew = scipy.stats.skew(bvals)
exckurt = scipy.stats.kurtosis(bvals, fisher=True)
## Quantiles
# Quartiles of the corrected anomaly distribution.  numpy is already
# imported as np above, so the duplicate `import numpy` is dropped, and
# the three percentiles are requested in a single call.
np.percentile([v[1] for v in bnomalies], [25, 50, 75])
## Average position of 2-sigma events (check that this is close to 0.5):
# Mean index of the >2σ (resp. <-2σ) days, normalized to [0,1]; a value
# near 0.5 means the extreme events are spread evenly over the record.
# (The index lists are computed once instead of twice per line.)
hot_idx = [j for j in range(len(bnomalies)) if bnomalies[j][1] > sigma*2]
cold_idx = [j for j in range(len(bnomalies)) if bnomalies[j][1] < -sigma*2]
N(sum(hot_idx)/(len(hot_idx)*(len(bnomalies)-1)))
N(sum(cold_idx)/(len(cold_idx)*(len(bnomalies)-1)))
## Standard deviation on 25-year intervals:
# BUGFIX: the three tstd calls were missing their closing parenthesis
# (SyntaxError).  The hard-coded indices (9131, 8766, 17897, 17532) are
# now computed, as the original comment suggested, with the same rounding
# rule used when truncating the data.
def _year_offset(y):
    # Day index y whole years after the start of the record.
    return floor(y*365.25 + 0.25)
# Overlapping 25-year windows starting at years 0, 24 and 48:
scipy.stats.tstd([v[1] for v in bnomalies[:_year_offset(25)]])
scipy.stats.tstd([v[1] for v in bnomalies[_year_offset(24):_year_offset(49)]])
scipy.stats.tstd([v[1] for v in bnomalies[_year_offset(48):]])
## Plot corrected anomalies:
# Gray dots for every day; the >2σ days in red, the <-2σ days in blue.
bn_pts = [(j/365.25 + 1950, bnomalies[j][1]) for j in range(len(data))]
hot_pts = [p for p in bn_pts if p[1] > sigma*2]
cold_pts = [p for p in bn_pts if p[1] < -sigma*2]
gbnom = (list_plot(bn_pts, size=1, color="gray")
         + list_plot(hot_pts, size=5, color="red")
         + list_plot(cold_pts, size=5, color="blue", title="Anomalie corrigée de température (évts à 2σ en gras)"))
gbnom.show(dpi=192)
## Plot distribution of corrected anomalies (linear scale):
# Empirical CDF of the detrended anomaly, with ±σ marked by dashed lines.
cdf_pts = [(v, (j+1)/(len(bnsorted)+1)) for j, (_, v) in enumerate(bnsorted)]
gbsrt = (line([(sigma, 0), (sigma, 1)], color="black", linestyle="dashed")
         + line([(-sigma, 0), (-sigma, 1)], color="black", linestyle="dashed")
         + line(cdf_pts, title="Distribution de l'anomalie corrigée", axes_labels=("T (°C)", None)))
gbsrt.show(dpi=192)
## Plot distribution of corrected anomalies (log scale):
# Same empirical CDF (and its complement, in red) on a log scale, against
# the two Gaussian CDF tails (dashed gray) for comparison.
gauss_lo = plot((1+erf(x/(sqrt(2)*sigma)))/2, (x, -4*sigma, 4*sigma), scale="semilogy", color="gray", linestyle="dashed")
gauss_hi = plot((1-erf(x/(sqrt(2)*sigma)))/2, (x, -4*sigma, 4*sigma), scale="semilogy", color="gray", linestyle="dashed")
ecdf = line(cdf_pts, title="Distribution de l'anomalie corrigée", axes_labels=("T (°C)", None), scale="semilogy")
ecdf_c = line([(v, 1-p) for v, p in cdf_pts], title="Distribution de l'anomalie corrigée", axes_labels=("T (°C)", None), color="red", scale="semilogy")
gbsrtlogscale = gauss_lo + gauss_hi + ecdf + ecdf_c
gbsrtlogscale.show(dpi=192)
### Autocorrelation analysis:
# Pearson autocorrelation of the detrended anomaly at lags 0..14 days.
bseries = [v[1] for v in bnomalies]
autocorrels = [1]
for lag in range(1, 15):
    autocorrels.append(scipy.stats.pearsonr(bseries[:-lag], bseries[lag:])[0])
gautocorr = line(list(enumerate(autocorrels)), scale="semilogy", title="Autocorrélation à n jours des anomalies corrigées", axes_labels=("n", "r"))
gautocorr.show(dpi=192)
# Exponential decay rate of the autocorrelation (slope of log r vs. lag):
scipy.stats.linregress([(lag, log(r)) for lag, r in enumerate(autocorrels)])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
See Twitter thread starting at https://twitter.com/gro_tsen/status/1620488744867598337 and ending at https://twitter.com/gro_tsen/status/1622918364086534145 for context and explanations about this.