[project @ 2000-08-04 23:28:04 by lewie]
[ghc.git] / ghc / rts / gmp / SPEED
1 ==============================================================================
2 Cycle counts and throughput for low-level routines in GNU MP as currently
3 implemented.
4
5 A range means that the timing is data-dependent.  The slower number of such
6 an interval is usually the best performance estimate.
7
8 The throughput value, measured in Gb/s (gigabits per second), is meaningful
9 only for comparison between CPUs.
10
11 A star before a line means that all values on that line are estimates.  A
12 star before a number means that that number is an estimate.  A `p' before a
13 number means that the code is not complete, but the timing is believed to be
14 accurate.
15
16             |   mpn_lshift      mpn_add_n       mpn_mul_1       mpn_addmul_1
17             |   mpn_rshift      mpn_sub_n                       mpn_submul_1
18 ------------+-----------------------------------------------------------------
19 DEC/Alpha   |
20 EV4         |   4.75 cycles/64b 7.75 cycles/64b 42 cycles/64b   42 cycles/64b
21   200MHz    |   2.7 Gb/s        1.65 Gb/s       20 Gb/s         20 Gb/s
22 EV5 old code|   4.0 cycles/64b  5.5 cycles/64b  18 cycles/64b   18 cycles/64b
23   267MHz    |   4.27 Gb/s       3.10 Gb/s       61 Gb/s         61 Gb/s
24   417MHz    |   6.67 Gb/s       4.85 Gb/s       95 Gb/s         95 Gb/s
25 EV5 tuned   |   3.25 cycles/64b 4.75 cycles/64b
26   267MHz    |   5.25 Gb/s       3.59 Gb/s               as above
27   417MHz    |   8.21 Gb/s       5.61 Gb/s
28 ------------+-----------------------------------------------------------------
29 Sun/SPARC   |
30 SPARC v7    |   14.0 cycles/32b 8.5 cycles/32b  37-54 cycl/32b  37-54 cycl/32b
31 SuperSPARC  |   3 cycles/32b    2.5 cycles/32b  8.2 cycles/32b  10.8 cycles/32b
32   50MHz     |   0.53 Gb/s       0.64 Gb/s       6.2 Gb/s        4.7 Gb/s
33 **SuperSPARC|           tuned addmul and submul will take:      9.25 cycles/32b
34 MicroSPARC2 |   ?               6.65 cycles/32b 30 cycles/32b   31.5 cycles/32b
35   110MHz    |   ?               0.53 Gb/s       3.75 Gb/s       3.58 Gb/s
36 SuperSPARC2 |   ?               ?               ?               ?
37 Ultra/32 (4)|   2.5 cycles/32b  6.5 cycles/32b  13-27 cyc/32b   16-30 cyc/32b
38   182MHz    |   2.33 Gb/s       0.896 Gb/s      14.3-6.9 Gb/s
39 Ultra/64 (5)|   2.5 cycles/64b  10 cycles/64b   40-70 cyc/64b   46-76 cyc/64b
40   182MHz    |   4.66 Gb/s       1.16 Gb/s       18.6-11 Gb/s
41 HalSPARC64  |   ?               ?               ?               ?
42 ------------+-----------------------------------------------------------------
43 SGI/MIPS    |
44 R3000       |   6 cycles/32b    9.25 cycles/32b 16 cycles/32b   16 cycles/32b
45   40MHz     |   0.21 Gb/s       0.14 Gb/s       2.56 Gb/s       2.56 Gb/s
46 R4400/32    |   8.6 cycles/32b  10 cycles/32b   16-18 cyc/32b   19-21 cyc/32b
47   200MHz    |   0.74 Gb/s       0.64 Gb/s       13-11 Gb/s      11-9.6 Gb/s
48 *R4400/64   |   8.6 cycles/64b  10 cycles/64b   22 cycles/64b   22 cycles/64b
49   *200MHz   |   1.48 Gb/s       1.28 Gb/s       37 Gb/s         37 Gb/s
50 R4600/32    |   6 cycles/32b    9.25 cycles/32b 15 cycles/32b   19 cycles/32b
51   134MHz    |   0.71 Gb/s       0.46 Gb/s       9.1 Gb/s        7.2 Gb/s
52 R4600/64    |   6 cycles/64b    9.25 cycles/64b ?               ?
53   134MHz    |   1.4 Gb/s        0.93 Gb/s       ?               ?
54 R8000/64    |   3 cycles/64b    4.6 cycles/64b  8 cycles/64b    8 cycles/64b
55   75MHz     |   1.6 Gb/s        1.0 Gb/s        38 Gb/s         38 Gb/s
56 *R10000/64  |   2 cycles/64b    3 cycles/64b    11 cycles/64b   11 cycles/64b
57   *200MHz   |   6.4 Gb/s        4.27 Gb/s       74 Gb/s         74 Gb/s
58   *250MHz   |   8.0 Gb/s        5.33 Gb/s       93 Gb/s         93 Gb/s
59 ------------+-----------------------------------------------------------------
60 Motorola    |
61 MC68020     |   ?               24 cycles/32b   62 cycles/32b   70 cycles/32b
62 MC68040     |   ?               6 cycles/32b    24 cycles/32b   25 cycles/32b
63 MC88100     |   >5 cycles/32b   4.6 cycles/32b  16/21 cyc/32b   p 18/23 cyc/32b
64 MC88110  wt |   ?               3.75 cycles/32b 6 cycles/32b    8.5 cyc/32b
65 *MC88110 wb |   ?               2.25 cycles/32b 4 cycles/32b    5 cycles/32b
66 ------------+-----------------------------------------------------------------
67 HP/PA-RISC  |
68 PA7000      |   4 cycles/32b    5 cycles/32b    9 cycles/32b    11 cycles/32b
69   67MHz     |   0.53 Gb/s       0.43 Gb/s       7.6 Gb/s        6.2 Gb/s
70 PA7100      |   3.25 cycles/32b 4.25 cycles/32b 7 cycles/32b    8 cycles/32b
71   99MHz     |   0.97 Gb/s       0.75 Gb/s       14 Gb/s         12.8 Gb/s
72 PA7100LC    |   ?               ?               ?               ?
73 PA7200  (3) |   3 cycles/32b    4 cycles/32b    7 cycles/32b    6.5 cycles/32b
74   100MHz    |   1.07 Gb/s       0.80 Gb/s       14 Gb/s         15.8 Gb/s
75 PA7300LC    |   ?               ?               ?               ?
76 *PA8000     |   3 cycles/64b    4 cycles/64b    7 cycles/64b    6.5 cycles/64b
77   180MHz    |   3.84 Gb/s       2.88 Gb/s       105 Gb/s        113 Gb/s
78 ------------+-----------------------------------------------------------------
79 Intel/x86   |
80 386DX       |   20 cycles/32b   17 cycles/32b   41-70 cycl/32b  50-79 cycl/32b
81   16.7MHz   |   0.027 Gb/s      0.031 Gb/s      0.42-0.24 Gb/s  0.34-0.22 Gb/s
82 486DX       |   ?               ?               ?               ?
83 486DX4      |   9.5 cycles/32b  9.25 cycles/32b 17-23 cycl/32b  20-26 cycl/32b
84   100MHz    |   0.34 Gb/s       0.35 Gb/s       6.0-4.5 Gb/s    5.1-3.9 Gb/s
85 Pentium     |   2/6 cycles/32b  2.5 cycles/32b  13 cycles/32b   14 cycles/32b
86   167MHz    |   2.7/0.89 Gb/s   2.1 Gb/s        13.1 Gb/s       12.2 Gb/s
87 Pentium Pro |   2.5 cycles/32b  3.5 cycles/32b  6 cycles/32b    9 cycles/32b
88   200MHz    |   2.6 Gb/s        1.8 Gb/s        34 Gb/s         23 Gb/s
89 ------------+-----------------------------------------------------------------
90 IBM/POWER   |
91 RIOS 1      |   3 cycles/32b    4 cycles/32b    11.5-12.5 c/32b 14.5/15.5 c/32b
92 RIOS 2      |   2 cycles/32b    2 cycles/32b    7 cycles/32b    8.5 cycles/32b
93 ------------+-----------------------------------------------------------------
94 PowerPC     |
95 PPC601  (1) |   3 cycles/32b    6 cycles/32b    11-16 cycl/32b  14-19 cycl/32b
96 PPC601  (2) |   5 cycles/32b    6 cycles/32b    13-22 cycl/32b  16-25 cycl/32b
97   67MHz (2) |   0.43 Gb/s       0.36 Gb/s       5.3-3.0 Gb/s    4.3-2.7 Gb/s
98 PPC603      |   ?               ?               ?               ?
99 *PPC604     |   2 cycles/32b    3 cycles/32b    2 cycles/32b    3 cycles/32b
100   *167MHz   |                                                   57 Gb/s
101 PPC620      |   ?               ?               ?               ?
102 ------------+-----------------------------------------------------------------
103 Tege        |
104 Model 1     |   2 cycles/64b    3 cycles/64b    2 cycles/64b    3 cycles/64b
105   250MHz    |   8 Gb/s          5.3 Gb/s        500 Gb/s        340 Gb/s
106   500MHz    |   16 Gb/s         11 Gb/s         1000 Gb/s       680 Gb/s
107 ____________|_________________________________________________________________
108 (1) Using POWER and PowerPC instructions
109 (2) Using only PowerPC instructions
110 (3) Actual timing for shift/add/sub depends on code alignment.  PA7000 code
111     is smaller and therefore often faster on this CPU.
112 (4) Multiplication routines modified for bogus UltraSPARC early-out
113     optimization.  Smaller operand is put in rs1, not rs2 as it should
114     according to the SPARC architecture manuals.
115 (5) Preliminary timings, since there is no stable 64-bit environment.
116 (6) Use mulu.d at least for mpn_lshift.  With mak/extu/or, we can only get
117     to 2 cycles/32b.
118
119 =============================================================================
120 Estimated theoretical asymptotic cycle counts for low-level routines:
121
122             |   mpn_lshift      mpn_add_n       mpn_mul_1       mpn_addmul_1
123             |   mpn_rshift      mpn_sub_n                       mpn_submul_1
124 ------------+-----------------------------------------------------------------
125 DEC/Alpha   |
126 EV4         |   3 cycles/64b    5 cycles/64b    42 cycles/64b   42 cycles/64b
127 EV5         |   3 cycles/64b    4 cycles/64b    18 cycles/64b   18 cycles/64b
128 ------------+-----------------------------------------------------------------
129 Sun/SPARC   |
130 SuperSPARC  |   2.5 cycles/32b  2 cycles/32b    8 cycles/32b    9 cycles/32b
131 ------------+-----------------------------------------------------------------
132 SGI/MIPS    |
133 R4400/32    |   5 cycles/32b    8 cycles/32b    16 cycles/32b   16 cycles/32b
134 R4400/64    |   5 cycles/64b    8 cycles/64b    22 cycles/64b   22 cycles/64b
135 R4600       |
136 ------------+-----------------------------------------------------------------
137 HP/PA-RISC  |
138 PA7100      |   3 cycles/32b    4 cycles/32b    6.5 cycles/32b  7.5 cycles/32b
139 PA7100LC    |
140 ------------+-----------------------------------------------------------------
141 Motorola    |
142 MC88110     |   1.5 cyc/32b (6) 1.5 cycles/32b  1.5 cycles/32b  2.25 cycles/32b
143 ------------+-----------------------------------------------------------------
144 Intel/x86   |
145 486DX4      |
146 Pentium P5x |   5 cycles/32b    2 cycles/32b    11.5 cycles/32b 13 cycles/32b
147 Pentium Pro |   2 cycles/32b    3 cycles/32b    4 cycles/32b    6 cycles/32b
148 ------------+-----------------------------------------------------------------
149 IBM/POWER   |
150 RIOS 1      |   3 cycles/32b    4 cycles/32b
151 RIOS 2      |   1.5 cycles/32b  2 cycles/32b    4.5 cycles/32b  5.5 cycles/32b
152 ------------+-----------------------------------------------------------------
153 PowerPC     |
154 PPC601  (1) |   3 cycles/32b    ?4 cycles/32b
155 PPC601  (2) |   4 cycles/32b    ?4 cycles/32b
156 ____________|_________________________________________________________________