fft benchmark

Stephen Weeks MLton@sourcelight.com
Thu, 3 Aug 2000 20:32:16 -0700 (PDT)


I was looking at Tarditi's thesis and I noticed that for the fft benchmark they
were beating SML/NJ by a factor of 10.  We're only beating SML/NJ by a factor
of 1.5.  Granted, there are lots of differences (SML/NJ version, architecture),
but I think there must be more to it than that.  I profiled the MLton-compiled
version of fft and found that half of the time is spent in the following basic
block (represented as C code).  Does anything look particularly egregious to you?

	/*
	 * Hot basic block from the MLton-compiled fft benchmark.
	 *
	 * NOTE(review): the macros are not defined in this listing.  Presumably
	 * RI(n)/RD(n) are integer/double temporaries, SI(k)/SD(k)/SP(k) are
	 * integer/double/pointer slots at offset k, and XD(a, i) subscripts a
	 * double array a at index i -- confirm against MLton's generated-C headers.
	 * The overall shape looks like a radix-4 FFT butterfly over two arrays
	 * (presumably real and imaginary parts) -- TODO confirm.
	 */
	/* Index setup: with i0 = SI(308) and step s = SI(252),
	 * RI(8) = i0 + s, RI(9) = i0 + 2s, RI(10) = i0 + 3s. */
	RI(8) = Int_add(SI(308), SI(252));
	RI(9) = Int_add(RI(8), SI(252));
	RI(10) = Int_add(RI(9), SI(252));
	/* Array SP(256), elements i0 and RI(9): difference kept in RD(11), sum
	 * written back to element i0.  Both elements are loaded twice (RD(9)/RD(12)
	 * and RD(10)/RD(13)) with no store in between. */
	RD(9) = XD(SP(256), SI(308));
	RD(10) = XD(SP(256), RI(9));
	RD(11) = Real_sub(RD(9), RD(10));
	RD(12) = XD(SP(256), SI(308));
	RD(13) = XD(SP(256), RI(9));
	RD(14) = Real_add(RD(12), RD(13));
	XD(SP(256), SI(308)) = RD(14);
	/* Array SP(256), elements RI(8) and RI(10): difference kept in RD(17), sum
	 * written back to element RI(8).  Again each element is loaded twice. */
	RD(15) = XD(SP(256), RI(8));
	RD(16) = XD(SP(256), RI(10));
	RD(17) = Real_sub(RD(15), RD(16));
	RD(18) = XD(SP(256), RI(8));
	RD(19) = XD(SP(256), RI(10));
	RD(20) = Real_add(RD(18), RD(19));
	XD(SP(256), RI(8)) = RD(20);
	/* Array SP(260), elements i0 and RI(9): difference kept in RD(23), sum
	 * written back to element i0.  Same duplicated-load pattern. */
	RD(21) = XD(SP(260), SI(308));
	RD(22) = XD(SP(260), RI(9));
	RD(23) = Real_sub(RD(21), RD(22));
	RD(24) = XD(SP(260), SI(308));
	RD(25) = XD(SP(260), RI(9));
	RD(26) = Real_add(RD(24), RD(25));
	XD(SP(260), SI(308)) = RD(26);
	/* Array SP(260), elements RI(8) and RI(10): difference kept in RD(29), sum
	 * written back to element RI(8).  Same duplicated-load pattern. */
	RD(27) = XD(SP(260), RI(8));
	RD(28) = XD(SP(260), RI(10));
	RD(29) = Real_sub(RD(27), RD(28));
	RD(30) = XD(SP(260), RI(8));
	RD(31) = XD(SP(260), RI(10));
	RD(32) = Real_add(RD(30), RD(31));
	XD(SP(260), RI(8)) = RD(32);
	/* Cross-combine the four saved differences. */
	RD(33) = Real_sub(RD(11), RD(29));
	RD(34) = Real_add(RD(11), RD(29));
	RD(35) = Real_sub(RD(17), RD(23));
	RD(36) = Real_add(RD(17), RD(23));
	/* SP(256)[RI(9)] = RD(34)*SD(264) - RD(35)*SD(272).
	 * NOTE(review): SD(264)..SD(288) are presumably twiddle factors -- confirm. */
	RD(37) = Real_mul(RD(34), SD(264));
	RD(38) = Real_mul(RD(35), SD(272));
	RD(39) = Real_sub(RD(37), RD(38));
	XD(SP(256), RI(9)) = RD(39);
	/* SP(260)[RI(9)] = (-RD(35))*SD(264) - RD(34)*SD(272). */
	RD(40) = Real_neg(RD(35));
	RD(41) = Real_mul(RD(40), SD(264));
	RD(42) = Real_mul(RD(34), SD(272));
	RD(43) = Real_sub(RD(41), RD(42));
	XD(SP(260), RI(9)) = RD(43);
	/* SP(256)[RI(10)] = RD(33)*SD(280) + RD(36)*SD(288). */
	RD(44) = Real_mul(RD(33), SD(280));
	RD(45) = Real_mul(RD(36), SD(288));
	RD(46) = Real_add(RD(44), RD(45));
	XD(SP(256), RI(10)) = RD(46);
	/* SP(260)[RI(10)] = RD(36)*SD(280) - RD(33)*SD(288). */
	RD(47) = Real_mul(RD(36), SD(280));
	RD(48) = Real_mul(RD(33), SD(288));
	RD(49) = Real_sub(RD(47), RD(48));
	XD(SP(260), RI(10)) = RD(49);
	/* Advance the base index: SI(308) += SI(240). */
	RI(11) = Int_add(SI(308), SI(240));
	SI(308) = RI(11);