library(ismev)
library(fExtremes)
y=read.table("D:\\nasdaq_logreturn.txt")
y1=as.matrix(y,nc=1)
yy=-y1 #losses!
summary(yy)

# block maxima method
b=blockMaxima(yy,20) #monthly logreturns
hist(b)

#Fitting distribution
#goodness of fit
#Diagnostics
zz=gev.fit(b) 
gev.diag(zz)

ks.test(b, "pgev",xi=zz$mle[3],mu=zz$mle[1],beta=zz$mle[2])$p.value
#we accept the fit

ks.test(b, "pnorm",m=mean(b),s=sd(b))$p.value #rejected

b1=blockMaxima(yy,5) 
b2=blockMaxima(yy,10)
b3=blockMaxima(yy,20)
b4=blockMaxima(yy,50)
b5=blockMaxima(yy,100)

ff1=gev.fit(b1)
ff2=gev.fit(b2)
ff3=gev.fit(b3)
ff4=gev.fit(b4)
ff5=gev.fit(b5)

ks.test(b1, "pgev",xi=ff1$mle[3],mu=ff1$mle[1],beta=ff1$mle[2])$p.value 
ks.test(b2, "pgev",xi=ff2$mle[3],mu=ff2$mle[1],beta=ff2$mle[2])$p.value
ks.test(b3, "pgev",xi=ff3$mle[3],mu=ff3$mle[1],beta=ff3$mle[2])$p.value
ks.test(b4, "pgev",xi=ff4$mle[3],mu=ff4$mle[1],beta=ff4$mle[2])$p.value
ks.test(b5, "pgev",xi=ff5$mle[3],mu=ff5$mle[1],beta=ff5$mle[2])$p.value

#all are accepted
#but it is not exact:
ism=100
ered=rep(0,times=100)
for (i in 1:ism)
{y=rgev(200,xi=ff$mle[3],mu=ff$mle[1],beta=ff$mle[2])
ff5=gev.fit(y)
ered[i]=ks.test(y, "pgev",xi=ff5$mle[3],mu=ff5$mle[1],beta=ff5$mle[2])$p.value
}
hist(ered)
#not uniform, because the parameters were estimated

#it is OK if the parameters are known
for (i in 1:ism)
{y=rgev(200,xi=ff$mle[3],mu=ff$mle[1],beta=ff$mle[2])
ered[i]=ks.test(y, "pgev",xi=ff$mle[3],mu=ff$mle[1],beta=ff$mle[2])$p.value
}
hist(ered)

#so that is why we must simulate the critical values!
ism=1000 #some minutes to run
n=length(b3)
eredr=rep(0,times=ism)
for (i in 1:ism)
{y=rgev(n,xi=ff3$mle[3],mu=ff3$mle[1],beta=ff3$mle[2])
ff5=gev.fit(y)
eredr[i]=ks.test(y, "pgev",xi=ff5$mle[3],mu=ff5$mle[1],beta=ff5$mle[2])$statistic
}
hist(eredr) #some minutes to run
critn=eredr
sb3=ks.test(b3, "pgev",xi=ff3$mle[3],mu=ff3$mle[1],beta=ff3$mle[2])$statistic
sum(sb3<critn)/ism #estimated p-value
ks.test(b3, "pgev",xi=ff3$mle[3],mu=ff3$mle[1],beta=ff3$mle[2]) 
#quite big difference from the previous one

#############

y=read.table("D:\\nasdaq_logreturn.txt")
y1=as.matrix(y,nc=1)
yy=-y1  #losses!
length(yy)

options(digits=20)

#1: threshold = 0.05
uu=0.05 
xx=c(1:100)/1000
f=gpdFit(yy,u=uu)
pars=f@fit$par.ests
hist(yy[yy>uu],freq=F,breaks=20)  
lines(xx,dgpd(xx,pars[1],uu,pars[2]),col=2)
plot(density(yy[yy>uu]))  # kernel density
lines(xx,dgpd(xx,pars[1],uu,pars[2]),col=2)
#goodness of fit
zz=gpd.fit(yy,threshold=uu) 
pars2=zz$mle
dif[1]=abs(abs(pars[1])-abs(pars2[2]))
dif[2]=abs(abs(pars[2])-abs(pars2[1]))
gpd.diag(zz)

# pars
# xi                  beta 
# -0.072277274840058220  0.015019724528503878
#
# $mle
# [1]  0.01501509 -0.07229043
# 
# dif
# 1.3151620015666721e-05 4.6334867610901831e-06 

#2: threshold = 0.03
uu=0.03 
xx=c(1:100)/1000
f=gpdFit(yy,u=uu)
pars=f@fit$par.ests
hist(yy[yy>uu],freq=F,breaks=20)  
lines(xx,dgpd(xx,pars[1],uu,pars[2]),col=2)
plot(density(yy[yy>uu]))  # kernel density
lines(xx,dgpd(xx,pars[1],uu,pars[2]),col=2)
#goodness of fit
zz=gpd.fit(yy,threshold=uu) 
pars2=zz$mle
dif[1]=abs(abs(pars[1])-abs(pars2[2]))
dif[2]=abs(abs(pars[2])-abs(pars2[1]))
gpd.diag(zz)

# $mle
# [1] 0.01135923 0.11162187
#
# pars
# xi                 beta 
# 0.111328388885567869 0.011360813265348069 
# 
# dif
# 2.9348106126209084e-04 1.5810420300882422e-06


#3: threshold = 0.01
uu=0.01  
xx=c(1:100)/1000
f=gpdFit(yy,u=uu)
pars=f@fit$par.ests 
hist(yy[yy>uu],freq=F,breaks=20)  
lines(xx,dgpd(xx,pars[1],uu,pars[2]),col=2)
plot(density(yy[yy>uu]))  # kernel density
lines(xx,dgpd(xx,pars[1],uu,pars[2]),col=2)
#goodness of fit
zz=gpd.fit(yy,threshold=uu) 
pars2=zz$mle
dif[1]=abs(abs(pars[1])-abs(pars2[2]))
dif[2]=abs(abs(pars[2])-abs(pars2[1]))
gpd.diag(zz)

# $mle
# [1]  0.01353422 -0.01584399
# 
# pars
# xi                  beta 
# -0.016128713572147221  0.013538638150638936
# 
# dif
# 2.8472557081378469e-04 4.4177397296122495e-06

#comment: u=0.03 looks the best - there are enough data and the plots
#show a reasonable fit