Dear all,
I am trying to find 25 parameters for 25 (non-linear) moment conditions with a different parameter for each condition.
I use Gauss 16.0.2 build 4191. My problem is that convergence takes too long. STATA (command 'gmm') achieves convergence after 30 or 40 iterations, whereas the Gauss procedure SQPsolveMT need tens of thausends of iterations.
I have added ctl.useThreads=1 to speed up the program. In addition, I have implemented the analytically computed Jacobian (zero matrix and replacing its diagonal with the gradient). However, this leads to wrong results (and even more iterations).
I have also tested if my user-defined Jacobian is correct using the Gauss procedure gradMTTm. The results using the Gauss procedure gradMTTm (for 5 moment conditions only) are:
-0.000490 -0.000047 -0.001999 -0.000157 -0.003972
-0.000047 -0.000047 -0.000348 -0.000037 -0.000789
-0.001999 -0.000348 -0.020916 -0.001388 -0.036422
-0.000157 -0.000037 -0.001388 -0.000157 -0.003091
-0.003972 -0.000789 -0.036422 -0.003091 -0.118575
The results using my own Jacobian are:
-0.000490
-0.000047
-0.020916
-0.000157
-0.118575
The diagonal elements are identical, however, the off-diagonal elements are not zero when using the Gauss procedure gradMTTm (although they should).
The Gauss help states:
ctl.gradProc | scalar, pointer to a procedure that computes the gradient of the function with respect to the parameters. Default = ., i.e., no gradient procedure has been provided. |
Does that mean that the gradient has to be computed as a scalar? Or does the scalar refer to the pointer? However, writing the Jacobian as a scalar does not work neither.
How can this issue be solved?
Is there probably a more efficient procedure than SQPsolveMT available ?
Best, h
Gauss code:
new;cls;rndseed 12345666;ti1=date;
#include sqpSolveMT.sdf;
#include d:\chapter3\gauss\prg\BinMatch_5.0.4_hb.inc;
#include d:\chapter3\gauss\prg\var_mediation.inc;
/*
alter hhsize hhincome depindg2 soz_teil care_kid training friends arztbesu disabili health care_rel famstand
schul2 beruf2 migr em_befor empl_pre ue_rate female workhour juspo mental;
*/
let string xx = /*alter*/ mediator em_befor empl_pre workhour;
/*k = 4; n=10000; x=rndn(n,k); xorg=x; d= 0.1+0.3*x[.,1]+0.2*x[.,2]+rndn(n,1).>0; y=rndn(n,1)+d+0.5*x[.,1]+0.2*x[.,2];*/
y = readvar("d:\\chapter3\\gauss\\temp\\temptem4","F1HEALTH");
d = readvar("d:\\chapter3\\gauss\\temp\\temptem4","TREAT");
//x = readvar("d:\\chapter3\\gauss\\temp\\temptem4","MEDIATOR"$|"em_befor"$|"empl_pre"$|"workhour");
x = readvar("d:\\chapter3\\gauss\\temp\\temptem4",xx@xvar1@);
xorg=x;
x=ones(rows(x),1)~x;
{b,nix,nix} = probit2(d,x,zeros(cols(x),1),1e-8,0,ones(rows(x),1));
/*startv=b;*/
p=cdfn(x*b);
{ipt,w,ws,wa,b_tilt,ps,pa}=ipt_atent(y,d,x,p,b);
x_pop = sumc(w.*xorg);
x_study = sumc(ws.*xorg);
x_aux = sumc(wa.*xorg);
x_pop';x_study';x_aux';?;
sumc(w);;sumc(ws);;sumc(wa);ipt;
call prog_end(date~time,ti1);
end;
proc(7) = ipt_atent(y,d,x,p,b);
local b_tilt,retcode,i,ps,pa,q,w,ws,wa,ipt,iterations;
b_tilt = missex(ones(cols(x),2),ones(cols(x),2));
retcode = missex(ones(2,1),ones(2,1));
iterations = retcode;
for i(1,2,1); /*i=1:study population(treated) i=2:auxiliary population(controls)*/
{b_tilt[.,i],retcode[i]} = tilt(d,x,p,b,i);
if retcode[i] /= 0; "no balancing with tilting"; end; endif;
endfor;
ps = cdfn(x*b_tilt[.,1]);
pa = cdfn(x*b_tilt[.,2]);
q = sumc(1-p);
w = (1-p)/q;
ws = d.*(1-p)./(ps*q);
wa = (1-d).*(1-p)./((1-pa)*q);
ipt = sumc(y.*(ws-wa));
retp(ipt,w,ws,wa,b_tilt,ps,pa);
endp; /* ============================================================= */
proc(2)=tilt(d,x,p,startv,i);
local bhat,retcode,iterations;
struct DS d0;
d0 = dsCreate;
d0.dataMatrix = d~x~p~ones(rows(x),1)*i;
struct PV p0;
p0 = pvCreate;
p0 = pvPack(p0,startv,"coefficients");
struct SQPsolveMTControl c0;
c0 = sqpSolveMTcontrolCreate;
c0.dirTol = 1e-5;
c0.useThreads = 1;
c0.printIters = 1;
c0.gradProc = &gradient_fct_tilt;
struct SQPsolveMTOut out0;
out0 = SQPsolveMT(&fct_tilt,p0,d0,c0);
bhat = pvUnpack(out0.par,"coefficients");
retcode = out0.retcode ;
retp(bhat,retcode);
endp; /* ============================================================= */
proc fct_tilt(struct PV p0, struct DS d0);
local m,col,e,d,x,p,b,ptilt,ftilt,q,i,n,eqn;
m = d0.dataMatrix;
n = rows(m);
col = cols(m);
d = m[.,1];
x = m[.,2:col-2];
//e = zeros(1,1);
p = m[.,col-1];
i = m[1,col];
b = pvUnpack(p0,"coefficients");
ptilt = cdfn(x*b);
ftilt = pdfn(x*b);
q = meanc(1-p);
e = zeros(cols(x),1);
if i==1;
e = sumr((x.*(1-p)/q/n.*(d./ptilt-1))');
else;
e=sumr((((1-d).*(1-p)./((1-ptilt)*q)-((1-p)/q)).*x/n)');
endif;
retp(e'e);
endp; /* ============================================================= */
proc (3)=probit2(depvar,indepvar,startv,tol,indic,wgt);
local i,stderr,t,df,pvt,vc_ml,db,f1,iter,p,ll,m,xu,k,b,ef,x,y,mat,obs,
n,pt, ys,ky,czeil,cspalt,depi,indepi,opg,ssr,yq,wssr,er2,nwp,vc_qml,
qstderr,qt,qpvt,hxss,w,f,pm,v, itermax;
output off;
db = 1 + tol; iter = 1;
do until (abs(db) < tol) or (iter>30);
clear p, pm,v,f,ll, m, xu, obs, x, y, ys;
y = depvar;
x = indepvar;
if iter == 1;
b = startv;
endif;
w = x*b;
p = cdfn(w);
f = pdfn(w);
p = p + .0000001 * ((p.<=0) - (p.>=1));
pm = 1 - p;
xu = xu + X' ( (f ./ (p.*pm)) .* (y-p).* wgt); /* Score */
v = f + w .* p;
m = m + X'(wgt .* abs(f .* (y.*(v./(p^2)) + ((1-y).*((v-w)./(pm^2)))) ).* X); /* -Hessematrix */
ll = ll + sumc(y.*ln(p)+(1-y).*ln(pm));
trap 1;
db = solpd(xu,m);
trap 0;
if scalerr(db);
db = 0;
endif;
b = b + db;
iter = iter + 1;
endo;
if iter >= 30;
itermax=1;
else;
itermax=0;
endif;
retp(b,ll,itermax);
endp; /* ============================================================= */
proc gradient_fct_tilt(struct PV p0, struct DS d0);
local m,col,e,d,x,p,b,ptilt,q,i,ftilt,n,eqn;
m = d0.dataMatrix;
n = rows(m);
col = cols(m);
d = m[.,1];
x = m[.,2:col-2];
p = m[.,col-1];
i = m[1,col];
b = pvUnpack(p0,"coefficients");
ptilt = cdfn(x*b);
ftilt = pdfn(x*b);
q = meanc(1-p);
e = zeros(cols(x),cols(x));
if i==1;
eqn = sumr((x.*(1-p)/(sumc(1-p)/rows(x)).*(d.*(-pdfn(x*b).*x)./cdfn(x*b)^2))');
else;
eqn = sumr((x.*(1-p)/(sumc(1-p)/rows(x)).*((1-d).*(pdfn(x*b).*x)./(1-cdfn(x*b))^2))');
endif;
e = diagrv(e,eqn);
retp(e);
endp;
2 Answers
0
I would need access to the data to be able to run problem and be able to make suggestions. Put the data and the code into a .zip file and attach it, and I'll take a look.
The Constrained Maximum Likelihood Application has a lot more bells and whistles than does Sqpsolvemt which is a very basic optimization program.
0
First, I checked your gradient procedure. Gradmt produced this result
-95.197663 -4299.192290 -11.443621
and Sqpsolvemt this
79.5385 3588.03237 9.74258
The signs are wrong. But changing the sign of the return from your gradient procedure doesn't really solve the problem. The gradient values aren't really close enough. This indicates that the gradient procedure needs some work.
Then I ran the problem without the gradient procedure and it produced the following results
-------------------------------------------------------------------
iter 20 newton function 0.00000
log(cond(hessian)) 9.525 step length 1.00000
-------------------------------------------------------------------
Parameters Estimates Direction Gradient
coefficients[1,1] -1.29341 0.00000 0.00000
coefficients[2,1] -0.00544 0.00001 0.00000
coefficients[3,1] 1.03111 0.00000 0.00000
If optimisation procedure works correctly (converges), these weighted covariates should be identical: full sample, treated, controls
43.304976 0.184187
43.304985 0.184217
43.304975 0.184182
If optimisation procedure works correctly (converges), these weights should add up to one: full sample, treated, controls
1.000000 0.999593 1.000053
I'm not sure how to explain they're all being 1. According to the code they are sum of w, ws, and wa, computed in ipt_atent. The code can be rewritten
w = (1-p)/q;
ws = w .* (d ./ ps);
wa = w .* ((1 - d)./(1 - pa));
d is a vector of zeros and ones, with 525 of the elements set to 0ne.
If I change the code to this
ws = w.*d;
wa = w.*(1-d);
I get
1.000000 0.108750 0.891250
which suggests that 10.8% of the sample is treated, and 89.1% controls which adds to 1.0 for the full sample.
But I might have what you're trying to do here completely wrong.
__________________________________________________
Duration of programme: 2.04 seconds
Programme executed on: 21 .1 .2016 , 10 h 53 min
__________________________________________________
Your Answer
2 Answers
I would need access to the data to be able to run problem and be able to make suggestions. Put the data and the code into a .zip file and attach it, and I'll take a look.
The Constrained Maximum Likelihood Application has a lot more bells and whistles than does Sqpsolvemt which is a very basic optimization program.
First, I checked your gradient procedure. Gradmt produced this result
-95.197663 -4299.192290 -11.443621
and Sqpsolvemt this
79.5385 3588.03237 9.74258
The signs are wrong. But changing the sign of the return from your gradient procedure doesn't really solve the problem. The gradient values aren't really close enough. This indicates that the gradient procedure needs some work.
Then I ran the problem without the gradient procedure and it produced the following results
-------------------------------------------------------------------
iter 20 newton function 0.00000
log(cond(hessian)) 9.525 step length 1.00000
-------------------------------------------------------------------
Parameters Estimates Direction Gradient
coefficients[1,1] -1.29341 0.00000 0.00000
coefficients[2,1] -0.00544 0.00001 0.00000
coefficients[3,1] 1.03111 0.00000 0.00000
If optimisation procedure works correctly (converges), these weighted covariates should be identical: full sample, treated, controls
43.304976 0.184187
43.304985 0.184217
43.304975 0.184182
If optimisation procedure works correctly (converges), these weights should add up to one: full sample, treated, controls
1.000000 0.999593 1.000053
I'm not sure how to explain they're all being 1. According to the code they are sum of w, ws, and wa, computed in ipt_atent. The code can be rewritten
w = (1-p)/q;
ws = w .* (d ./ ps);
wa = w .* ((1 - d)./(1 - pa));
d is a vector of zeros and ones, with 525 of the elements set to 0ne.
If I change the code to this
ws = w.*d;
wa = w.*(1-d);
I get
1.000000 0.108750 0.891250
which suggests that 10.8% of the sample is treated, and 89.1% controls which adds to 1.0 for the full sample.
But I might have what you're trying to do here completely wrong.
__________________________________________________
Duration of programme: 2.04 seconds
Programme executed on: 21 .1 .2016 , 10 h 53 min
__________________________________________________