# flops.b # ported from flops.c by Robert Bohrer, July 25, 2001 #/*****************************/ #/* FLOPS.c */ #/* Version 2.0, 18 Dec 1992 */ #/* Al Aburto */ #/* aburto@nosc.mil */ #/*****************************/ #/* # Flops.c is a 'c' program which attempts to estimate your systems # floating-point 'MFLOPS' rating for the FADD, FSUB, FMUL, and FDIV # operations based on specific 'instruction mixes' (discussed below). # The program provides an estimate of PEAK MFLOPS performance by making # maximal use of register variables with minimal interaction with main # memory. The execution loops are all small so that they will fit in # any cache. Flops.c can be used along with Linpack and the Livermore # kernels (which exersize memory much more extensively) to gain further # insight into the limits of system performance. The flops.c execution # modules also include various percent weightings of FDIV's (from 0% to # 25% FDIV's) so that the range of performance can be obtained when # using FDIV's. FDIV's, being computationally more intensive than # FADD's or FMUL's, can impact performance considerably on some systems. # # Flops.c consists of 8 independent modules (routines) which, except for # module 2, conduct numerical integration of various functions. Module # 2, estimates the value of pi based upon the Maclaurin series expansion # of atan(1). MFLOPS ratings are provided for each module, but the # programs overall results are summerized by the MFLOPS(1), MFLOPS(2), # MFLOPS(3), and MFLOPS(4) outputs. # # The MFLOPS(1) result is identical to the result provided by all # previous versions of flops.c. It is based only upon the results from # modules 2 and 3. Two problems surfaced in using MFLOPS(1). First, it # was difficult to completely 'vectorize' the result due to the # recurrence of the 's' variable in module 2. This problem is addressed # in the MFLOPS(2) result which does not use module 2, but maintains # nearly the same weighting of FDIV's (9.2%) as in MFLOPS(1) (9.6%). # The second problem with MFLOPS(1) centers around the percentage of # FDIV's (9.6%) which was viewed as too high for an important class of # problems. This concern is addressed in the MFLOPS(3) result where NO # FDIV's are conducted at all. # # The number of floating-point instructions per iteration (loop) is # given below for each module executed: # # MODULE FADD FSUB FMUL FDIV TOTAL Comment # 1 7 0 6 1 14 7.1% FDIV's # 2 3 2 1 1 7 difficult to vectorize. # 3 6 2 9 0 17 0.0% FDIV's # 4 7 0 8 0 15 0.0% FDIV's # 5 13 0 15 1 29 3.4% FDIV's # 6 13 0 16 0 29 0.0% FDIV's # 7 3 3 3 3 12 25.0% FDIV's # 8 13 0 17 0 30 0.0% FDIV's # # A*2+3 21 12 14 5 52 A=5, MFLOPS(1), Same as # 40.4% 23.1% 26.9% 9.6% previous versions of the # flops.c program. Includes # only Modules 2 and 3, does # 9.6% FDIV's, and is not # easily vectorizable. # # 1+3+4 58 14 66 14 152 A=4, MFLOPS(2), New output # +5+6+ 38.2% 9.2% 43.4% 9.2% does not include Module 2, # A*7 but does 9.2% FDIV's. # # 1+3+4 62 5 74 5 146 A=0, MFLOPS(3), New output # +5+6+ 42.9% 3.4% 50.7% 3.4% does not include Module 2, # 7+8 but does 3.4% FDIV's. # # 3+4+6 39 2 50 0 91 A=0, MFLOPS(4), New output # +8 42.9% 2.2% 54.9% 0.0% does not include Module 2, # and does NO FDIV's. # # NOTE: Various timer routines are included as indicated below. The # timer routines, with some comments, are attached at the end # of the main program. # # NOTE: Please do not remove any of the printouts. # # EXAMPLE COMPILATION: # UNIX based systems # cc -DUNIX -O flops20.c -o flops # cc -DUNIX -DROPT flops20.c -o flops # cc -DUNIX -fast -O4 flops20.c -o flops # . # . # . # etc. # # Al Aburto # aburto@marlin.nosc.mil #*/ implement Flops; include "sys.m"; sys: Sys; print: import sys; include "draw.m"; Context: import Draw; include "math.m"; math: Math; Flops: module { init: fn(ctxt: ref Context, argv: list of string); }; init(nil: ref Context, nil: list of string) { # Variables needed for 'dtime()'. nulltime: real; Time: real; # Threshold to determine Number of loops to run. Fixed at 15.0 seconds. TLimit: real; # Global Array used to hold timing results and other information. T := array[36] of real; sa,sb,sc,sd,one,two,three: real; four,five,piref,piprg: real; scale,pierr: real; A0 := 1.0; A1 := -0.1666666666671334; A2 := 0.833333333809067E-2; A3 := 0.198412715551283E-3; A4 := 0.27557589750762E-5; A5 := 0.2507059876207E-7; A6 := 0.164105986683E-9; B0 := 1.0; B1 := -0.4999999999982; B2 := 0.4166666664651E-1; B3 := -0.1388888805755E-2; B4 := 0.24801428034E-4; B5 := -0.2754213324E-6; B6 := 0.20189405E-8; C0 := 1.0; C1 := 0.99999999668; C2 := 0.49999995173; C3 := 0.16666704243; C4 := 0.4166685027E-1; C5 := 0.832672635E-2; C6 := 0.140836136E-2; C7 := 0.17358267E-3; C8 := 0.3931683E-4; D1 := 0.3999999946405E-1; D2 := 0.96E-3; D3 := 0.1233153E-5; E2 := 0.48E-3; E3 := 0.411051E-6; s,u,v,w,x: real; loops, NLimit: int; i, m, n: int; sys = load Sys Sys->PATH; math = load Math Math->PATH; print("\n"); print(" FLOPS Limbo Program, V2.0 18 Dec 1992\n\n"); #/****************************/ loops = 15625; #/* Initial number of loops. */ #/* DO NOT CHANGE! */ #/****************************/ #/****************************************************/ #/* Set Variable Values. */ #/* T[1] references all timing results relative to */ #/* one million loops. */ #/* */ #/* The program will execute from 31250 to 512000000 */ #/* loops based on a runtime of Module 1 of at least */ #/* TLimit = 15.0 seconds. That is, a runtime of 15 */ #/* seconds for Module 1 is used to determine the */ #/* number of loops to execute. */ #/* */ #/* No more than NLimit = 512000000 loops are allowed*/ #/****************************************************/ T[1] = 1.0E+06 / real loops; TLimit = 15.0; NLimit = 512000000; piref = 3.14159265358979324; one = 1.0; two = 2.0; three = 3.0; four = 4.0; five = 5.0; scale = one; print(" Module Error RunTime MFLOPS\n"); print(" (usec)\n"); #/*************************/ #/* Initialize the timer. */ #/*************************/ dtime(); dtime(); #/*******************************************************/ #/* Module 1. Calculate integral of df(x)/f(x) defined */ #/* below. Result is ln(f(1)). There are 14 */ #/* double precision operations per loop */ #/* ( 7 +, 0 -, 6 *, 1 / ) that are included */ #/* in the timing. */ #/* 50.0% +, 00.0% -, 42.9% *, and 07.1% / */ #/*******************************************************/ n = loops; sa = 0.0; while ( sa < TLimit ) { n = 2 * n; x = one / real n; #/*********************/ s = 0.0; #/* Loop 1. */ v = 0.0; #/*********************/ w = one; dtime(); for( i = 1 ; i <= n-1 ; i++ ) { v = v + w; u = v * x; s = s + (D1+u*(D2+u*D3))/(w+u*(D1+u*(E2+u*E3))); } sa = dtime(); if ( n == NLimit ) break; #/* print(" %10d %12.5f\n", n, sa); */ } scale = 1.0E+06 / real n; T[1] = scale; #/****************************************/ #/* Estimate nulltime ('for' loop time). */ #/****************************************/ dtime(); for( i = 1 ; i <= n-1 ; i++ ) { } Time = dtime(); nulltime = T[1] * Time; if ( nulltime < 0.0 ) nulltime = 0.0; T[2] = T[1] * sa - nulltime; sa = (D1+D2+D3)/(one+D1+E2+E3); sb = D1; T[3] = T[2] / 14.0; #/*********************/ sa = x * ( sa + sb + two * s ) / two; #/* Module 1 Results. */ sb = one / sa; #/*********************/ n = int ( real ( 40000 * int sb ) / scale ); sc = sb - 25.2; T[4] = one / T[3]; #/********************/ #/* DO NOT REMOVE */ #/* THIS PRINTOUT! */ #/********************/ print(" 1 %13.4e %10.4f %10.4f\n", sc, T[2], T[4]); m = n; #/*******************************************************/ #/* Module 2. Calculate value of PI from Taylor Series */ #/* expansion of atan(1.0). There are 7 */ #/* double precision operations per loop */ #/* ( 3 +, 2 -, 1 *, 1 / ) that are included */ #/* in the timing. */ #/* 42.9% +, 28.6% -, 14.3% *, and 14.3% / */ #/*******************************************************/ s = -five; #/********************/ sa = -one; #/* Loop 2. */ #/********************/ dtime(); for ( i = 1 ; i <= m ; i++ ) { s = -s; sa = sa + s; } Time = dtime(); T[5] = T[1] * Time; if ( T[5] < 0.0 ) T[5] = 0.0; sc = real m; u = sa; #/*********************/ v = 0.0; #/* Loop 3. */ w = 0.0; #/*********************/ x = 0.0; dtime(); for ( i = 1 ; i <= m ; i++) { s = -s; sa = sa + s; u = u + two; x = x +(s - u); v = v - s * u; w = w + s / u; } Time = dtime(); T[6] = T[1] * Time; T[7] = ( T[6] - T[5] ) / 7.0; #/*********************/ m = int ( sa * x / sc ); #/* PI Results */ sa = four * w / five; #/*********************/ sb = sa + five / v; sc = 31.25; piprg = sb - sc / (v * v * v); pierr = piprg - piref; T[8] = one / T[7]; #/*********************/ #/* DO NOT REMOVE */ #/* THIS PRINTOUT! */ #/*********************/ print(" 2 %13.4e %10.4f %10.4f\n", pierr, T[6]-T[5], T[8]); #/*******************************************************/ #/* Module 3. Calculate integral of sin(x) from 0.0 to */ #/* PI/3.0 using Trapazoidal Method. Result */ #/* is 0.5. There are 17 double precision */ #/* operations per loop (6 +, 2 -, 9 *, 0 /) */ #/* included in the timing. */ #/* 35.3% +, 11.8% -, 52.9% *, and 00.0% / */ #/*******************************************************/ x = piref / ( three * real m ); #/*********************/ s = 0.0; #/* Loop 4. */ v = 0.0; #/*********************/ dtime(); for( i = 1 ; i <= m-1 ; i++ ) { v = v + one; u = v * x; w = u * u; s = s + u * ((((((A6*w-A5)*w+A4)*w-A3)*w+A2)*w+A1)*w+one); } Time = dtime(); T[9] = T[1] * Time - nulltime; u = piref / three; w = u * u; sa = u * ((((((A6*w-A5)*w+A4)*w-A3)*w+A2)*w+A1)*w+one); T[10] = T[9] / 17.0; #/*********************/ sa = x * ( sa + two * s ) / two; #/* sin(x) Results. */ sb = 0.5; #/*********************/ sc = sa - sb; T[11] = one / T[10]; #/*********************/ #/* DO NOT REMOVE */ #/* THIS PRINTOUT! */ #/*********************/ print(" 3 %13.4e %10.4f %10.4f\n", sc, T[9], T[11]); #/************************************************************/ #/* Module 4. Calculate Integral of cos(x) from 0.0 to PI/3 */ #/* using the Trapazoidal Method. Result is */ #/* sin(PI/3). There are 15 double precision */ #/* operations per loop (7 +, 0 -, 8 *, and 0 / ) */ #/* included in the timing. */ #/* 50.0% +, 00.0% -, 50.0% *, 00.0% / */ #/************************************************************/ A3 = -A3; A5 = -A5; x = piref / ( three * real m ); #/*********************/ s = 0.0; #/* Loop 5. */ v = 0.0; #/*********************/ dtime(); for( i = 1 ; i <= m-1 ; i++ ) { u = real i * x; w = u * u; s = s + w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one; } Time = dtime(); T[12] = T[1] * Time - nulltime; u = piref / three; w = u * u; sa = w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one; T[13] = T[12] / 15.0; #/*******************/ sa = x * ( sa + one + two * s ) / two; #/* Module 4 Result */ u = piref / three; #/*******************/ w = u * u; sb = u * ((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+A0); sc = sa - sb; T[14] = one / T[13]; #/*********************/ #/* DO NOT REMOVE */ #/* THIS PRINTOUT! */ #/*********************/ print(" 4 %13.4e %10.4f %10.4f\n", sc, T[12], T[14]); #/************************************************************/ #/* Module 5. Calculate Integral of tan(x) from 0.0 to PI/3 */ #/* using the Trapazoidal Method. Result is */ #/* ln(cos(PI/3)). There are 29 double precision */ #/* operations per loop (13 +, 0 -, 15 *, and 1 /)*/ #/* included in the timing. */ #/* 46.7% +, 00.0% -, 50.0% *, and 03.3% / */ #/************************************************************/ x = piref / ( three * real m ); #/*********************/ s = 0.0; #/* Loop 6. */ v = 0.0; #/*********************/ dtime(); for( i = 1 ; i <= m-1 ; i++ ) { u = real i * x; w = u * u; v = u * ((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one); s = s + v / (w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one); } Time = dtime(); T[15] = T[1] * Time - nulltime; u = piref / three; w = u * u; sa = u*((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one); sb = w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one; sa = sa / sb; T[16] = T[15] / 29.0; #/*******************/ sa = x * ( sa + two * s ) / two; #/* Module 5 Result */ sb = 0.6931471805599453; #/*******************/ sc = sa - sb; T[17] = one / T[16]; #/*********************/ #/* DO NOT REMOVE */ #/* THIS PRINTOUT! */ #/*********************/ print(" 5 %13.4e %10.4f %10.4f\n", sc, T[15], T[17]); #/************************************************************/ #/* Module 6. Calculate Integral of sin(x)*cos(x) from 0.0 */ #/* to PI/4 using the Trapazoidal Method. Result */ #/* is sin(PI/4)^2. There are 29 double precision */ #/* operations per loop (13 +, 0 -, 16 *, and 0 /)*/ #/* included in the timing. */ #/* 46.7% +, 00.0% -, 53.3% *, and 00.0% / */ #/************************************************************/ x = piref / ( four * real m ); #/*********************/ s = 0.0; #/* Loop 7. */ v = 0.0; #/*********************/ dtime(); for( i = 1 ; i <= m-1 ; i++ ) { u = real i * x; w = u * u; v = u * ((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one); s = s + v*(w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one); } Time = dtime(); T[18] = T[1] * Time - nulltime; u = piref / four; w = u * u; sa = u*((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one); sb = w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one; sa = sa * sb; T[19] = T[18] / 29.0; #/*******************/ sa = x * ( sa + two * s ) / two; #/* Module 6 Result */ sb = 0.25; #/*******************/ sc = sa - sb; T[20] = one / T[19]; #/*********************/ #/* DO NOT REMOVE */ #/* THIS PRINTOUT! */ #/*********************/ print(" 6 %13.4e %10.4f %10.4f\n", sc, T[18], T[20]); #/*******************************************************/ #/* Module 7. Calculate value of the definite integral */ #/* from 0 to sa of 1/(x+1), x/(x*x+1), and */ #/* x*x/(x*x*x+1) using the Trapizoidal Rule.*/ #/* There are 12 double precision operations */ #/* per loop ( 3 +, 3 -, 3 *, and 3 / ) that */ #/* are included in the timing. */ #/* 25.0% +, 25.0% -, 25.0% *, and 25.0% / */ #/*******************************************************/ #/*********************/ s = 0.0; #/* Loop 8. */ w = one; #/*********************/ sa = 102.3321513995275; v = sa / real m; dtime(); for ( i = 1 ; i <= m-1 ; i++) { x = real i * v; u = x * x; s = s - w / ( x + w ) - x / ( u + w ) - u / ( x * u + w ); } Time = dtime(); T[21] = T[1] * Time - nulltime; #/*********************/ #/* Module 7 Results */ #/*********************/ T[22] = T[21] / 12.0; x = sa; u = x * x; sa = -w - w / ( x + w ) - x / ( u + w ) - u / ( x * u + w ); sa = 18.0 * v * (sa + two * s ); m = -2000 * int sa; m = int ( real m / scale ); sc = sa + 500.2; T[23] = one / T[22]; #/********************/ #/* DO NOT REMOVE */ #/* THIS PRINTOUT! */ #/********************/ print(" 7 %13.4e %10.4f %10.4f\n", sc, T[21], T[23]); #/************************************************************/ #/* Module 8. Calculate Integral of sin(x)*cos(x)*cos(x) */ #/* from 0 to PI/3 using the Trapazoidal Method. */ #/* Result is (1-cos(PI/3)^3)/3. There are 30 */ #/* double precision operations per loop included */ #/* in the timing: */ #/* 13 +, 0 -, 17 * 0 / */ #/* 46.7% +, 00.0% -, 53.3% *, and 00.0% / */ #/************************************************************/ x = piref / ( three * real m ); #/*********************/ s = 0.0; #/* Loop 9. */ v = 0.0; #/*********************/ dtime(); for( i = 1 ; i <= m-1 ; i++ ) { u = real i * x; w = u * u; v = w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one; s = s + v*v*u*((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one); } Time = dtime(); T[24] = T[1] * Time - nulltime; u = piref / three; w = u * u; sa = u*((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one); sb = w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one; sa = sa * sb * sb; T[25] = T[24] / 30.0; #/*******************/ sa = x * ( sa + two * s ) / two; #/* Module 8 Result */ sb = 0.29166666666666667; #/*******************/ sc = sa - sb; T[26] = one / T[25]; #/*********************/ #/* DO NOT REMOVE */ #/* THIS PRINTOUT! */ #/*********************/ print(" 8 %13.4e %10.4f %10.4f\n", sc, T[24], T[26]); #/**************************************************/ #/* MFLOPS(1) output. This is the same weighting */ #/* used for all previous versions of the flops.c */ #/* program. Includes Modules 2 and 3 only. */ #/**************************************************/ T[27] = ( five * (T[6] - T[5]) + T[9] ) / 52.0; T[28] = one / T[27]; #/**************************************************/ #/* MFLOPS(2) output. This output does not include */ #/* Module 2, but it still does 9.2% FDIV's. */ #/**************************************************/ T[29] = T[2] + T[9] + T[12] + T[15] + T[18]; T[29] = (T[29] + four * T[21]) / 152.0; T[30] = one / T[29]; #/**************************************************/ #/* MFLOPS(3) output. This output does not include */ #/* Module 2, but it still does 3.4% FDIV's. */ #/**************************************************/ T[31] = T[2] + T[9] + T[12] + T[15] + T[18]; T[31] = (T[31] + T[21] + T[24]) / 146.0; T[32] = one / T[31]; #/**************************************************/ #/* MFLOPS(4) output. This output does not include */ #/* Module 2, and it does NO FDIV's. */ #/**************************************************/ T[33] = (T[9] + T[12] + T[18] + T[24]) / 91.0; T[34] = one / T[33]; print("\n"); print(" Iterations = %10d\n", m); print(" NullTime (usec) = %10.4f\n", nulltime); print(" MFLOPS(1) = %10.4f\n", T[28]); print(" MFLOPS(2) = %10.4f\n", T[30]); print(" MFLOPS(3) = %10.4f\n", T[32]); print(" MFLOPS(4) = %10.4f\n\n", T[34]); } #/*****************************************************/ #/* Various timer routines. */ #/* Al Aburto, aburto@marlin.nosc.mil, 16 Dec 1995 */ #/* */ #/* dtime(p) outputs the elapsed time seconds in p[1] */ #/* from a call of dtime(p) to the next call of */ #/* dtime(p). Use CAUTION as some of these routines */ #/* will mess up when timing across the hour mark!!! */ #/* */ #/* For timing I use the 'user' time whenever */ #/* possible. Using 'user+sys' time is a separate */ #/* issue. */ #/* */ #/*****************************************************/ LastTime := 0.0; dtime(): real { last := LastTime; LastTime = real sys->millisec() * 1.0e-03; return LastTime - last; }