From 33bfc6a700eaab9bc06974d6f71a80e61d9177c9 Mon Sep 17 00:00:00 2001
From: Geoffrey Mainland
Date: Wed, 31 Oct 2012 15:42:01 +0000
Subject: [PATCH] Add support for passing SSE vectors in registers.

This patch adds support for 6 XMM registers on x86-64 which overlap with the
F and D registers and may hold 128-bit wide SIMD vectors. Because there is
not a good way to attach type information to STG registers, we aggressively
bitcast in the LLVM back-end.
---
 compiler/cmm/CmmCallConv.hs              |   4 +-
 compiler/cmm/CmmExpr.hs                  |  10 +++
 compiler/cmm/CmmMachOp.hs                |  46 +++++++-----
 compiler/cmm/PprC.hs                     |   9 +++
 compiler/cmm/PprCmmExpr.hs               |   1 +
 compiler/codeGen/CgUtils.hs              |   7 ++
 compiler/codeGen/StgCmmPrim.hs           |  18 +++--
 compiler/llvmGen/LlvmCodeGen/Base.hs     |  11 +--
 compiler/llvmGen/LlvmCodeGen/CodeGen.hs  | 121 +++++++++++++++++++++++--------
 compiler/llvmGen/LlvmCodeGen/Regs.hs     |   7 ++
 compiler/nativeGen/X86/CodeGen.hs        |  88 +++++++++++-----------
 includes/CodeGen.Platform.hs             |  26 +++++++
 includes/stg/MachRegs.h                  |  24 +++---
 includes/stg/Regs.h                      |  42 +++++++++++
 includes/stg/Types.h                     |   2 +
 utils/deriveConstants/DeriveConstants.hs |   6 ++
 16 files changed, 308 insertions(+), 114 deletions(-)

diff --git a/compiler/cmm/CmmCallConv.hs b/compiler/cmm/CmmCallConv.hs
index dd4d6a6..913f15d 100644
--- a/compiler/cmm/CmmCallConv.hs
+++ b/compiler/cmm/CmmCallConv.hs
@@ -70,7 +70,9 @@ assignArgumentsPos dflags off conv arg_ty reps = (stk_off, assignments)
       assign_regs assts (r:rs) regs | isVecType ty   = vec
                                     | isFloatType ty = float
                                     | otherwise      = int
-        where vec = (assts, (r:rs))
+        where vec = case (w, regs) of
+                      (W128, (vs, fs, ds, ls, s:ss)) -> k (RegisterParam (XmmReg s), (vs, fs, ds, ls, ss))
+                      _ -> (assts, (r:rs))
              float = case (w, regs) of
                        (W32, (vs, fs, ds, ls, s:ss)) -> k (RegisterParam (FloatReg s), (vs, fs, ds, ls, ss))
                        (W32, (vs, f:fs, ds, ls, ss))
diff --git a/compiler/cmm/CmmExpr.hs b/compiler/cmm/CmmExpr.hs
index dce9624..1df8e84 100644
--- a/compiler/cmm/CmmExpr.hs
+++ b/compiler/cmm/CmmExpr.hs
@@ -336,6 +336,9 @@ data GlobalReg
   | LongReg            -- long int registers (64-bit, really)
        {-# UNPACK #-} !Int  -- its number

+  | XmmReg             -- 128-bit SIMD vector register
+       {-# UNPACK #-} !Int  -- its number
+
   -- STG registers
   | Sp  -- Stack ptr; points to last occupied stack location.
   | SpLim               -- Stack limit
@@ -371,6 +374,7 @@ instance Eq GlobalReg where
   FloatReg i == FloatReg j = i==j
   DoubleReg i == DoubleReg j = i==j
   LongReg i == LongReg j = i==j
+  XmmReg i == XmmReg j = i==j
   Sp == Sp = True
   SpLim == SpLim = True
   Hp == Hp = True
@@ -392,6 +396,7 @@ instance Ord GlobalReg where
   compare (FloatReg i) (FloatReg j) = compare i j
   compare (DoubleReg i) (DoubleReg j) = compare i j
   compare (LongReg i) (LongReg j) = compare i j
+  compare (XmmReg i) (XmmReg j) = compare i j
   compare Sp Sp = EQ
   compare SpLim SpLim = EQ
   compare Hp Hp = EQ
@@ -413,6 +418,8 @@ instance Ord GlobalReg where
   compare _ (DoubleReg _) = GT
   compare (LongReg _) _ = LT
   compare _ (LongReg _) = GT
+  compare (XmmReg _) _ = LT
+  compare _ (XmmReg _) = GT
   compare Sp _ = LT
   compare _ Sp = GT
   compare SpLim _ = LT
@@ -455,6 +462,8 @@ globalRegType dflags (VanillaReg _ VNonGcPtr) = bWord dflags
 globalRegType _      (FloatReg _)  = cmmFloat W32
 globalRegType _      (DoubleReg _) = cmmFloat W64
 globalRegType _      (LongReg _)   = cmmBits W64
+globalRegType _      (XmmReg _)    = cmmVec 4 (cmmBits W32)
+
 globalRegType dflags Hp            = gcWord dflags
                                      -- The initialiser for all
                                      -- dynamically allocated closures
@@ -465,4 +474,5 @@ isArgReg (VanillaReg {}) = True
 isArgReg (FloatReg {})  = True
 isArgReg (DoubleReg {}) = True
 isArgReg (LongReg {})   = True
+isArgReg (XmmReg {})    = True
 isArgReg _              = False
diff --git a/compiler/cmm/CmmMachOp.hs b/compiler/cmm/CmmMachOp.hs
index 4e38cd4..0f18029 100644
--- a/compiler/cmm/CmmMachOp.hs
+++ b/compiler/cmm/CmmMachOp.hs
@@ -118,6 +118,10 @@ data MachOp
   | MO_VS_Rem Length Width
   | MO_VS_Neg Length Width

+  -- Floating point vector element insertion and extraction operations
+  | MO_VF_Insert Length Width   -- Insert scalar into vector
+  | MO_VF_Extract Length Width  -- Extract scalar from vector
+
   -- Floating point vector operations
   | MO_VF_Add Length Width
   | MO_VF_Sub Length Width
@@ -360,22 +364,25 @@ machOpResultType dflags mop tys =
     MO_SF_Conv _ to -> cmmFloat to
     MO_FF_Conv _ to -> cmmFloat to

-    MO_V_Insert {}  -> ty1
-    MO_V_Extract {} -> vecElemType ty1
-
-    MO_V_Add {} -> ty1
-    MO_V_Sub {} -> ty1
-    MO_V_Mul {} -> ty1
-
-    MO_VS_Quot {} -> ty1
-    MO_VS_Rem {}  -> ty1
-    MO_VS_Neg {}  -> ty1
-
-    MO_VF_Add {}  -> ty1
-    MO_VF_Sub {}  -> ty1
-    MO_VF_Mul {}  -> ty1
-    MO_VF_Quot {} -> ty1
-    MO_VF_Neg {}  -> ty1
+    MO_V_Insert l w  -> cmmVec l (cmmBits w)
+    MO_V_Extract _ w -> cmmBits w
+
+    MO_V_Add l w -> cmmVec l (cmmBits w)
+    MO_V_Sub l w -> cmmVec l (cmmBits w)
+    MO_V_Mul l w -> cmmVec l (cmmBits w)
+
+    MO_VS_Quot l w -> cmmVec l (cmmBits w)
+    MO_VS_Rem l w  -> cmmVec l (cmmBits w)
+    MO_VS_Neg l w  -> cmmVec l (cmmBits w)
+
+    MO_VF_Insert l w  -> cmmVec l (cmmFloat w)
+    MO_VF_Extract _ w -> cmmFloat w
+
+    MO_VF_Add l w  -> cmmVec l (cmmFloat w)
+    MO_VF_Sub l w  -> cmmVec l (cmmFloat w)
+    MO_VF_Mul l w  -> cmmVec l (cmmFloat w)
+    MO_VF_Quot l w -> cmmVec l (cmmFloat w)
+    MO_VF_Neg l w  -> cmmVec l (cmmFloat w)
   where
     (ty1:_) = tys
@@ -443,8 +450,8 @@ machOpArgReps dflags op =
     MO_FS_Conv from _ -> [from]
     MO_FF_Conv from _ -> [from]

-    MO_V_Insert  l r -> [typeWidth (vec l (cmmFloat r)),r,wordWidth dflags]
-    MO_V_Extract l r -> [typeWidth (vec l (cmmFloat r)),wordWidth dflags]
+    MO_V_Insert  l r -> [typeWidth (vec l (cmmBits r)),r,wordWidth dflags]
+    MO_V_Extract l r -> [typeWidth (vec l (cmmBits r)),wordWidth dflags]
     MO_V_Add _ r -> [r,r]
     MO_V_Sub _ r -> [r,r]
@@ -454,6 +461,9 @@ machOpArgReps dflags op =
     MO_VS_Rem _ r -> [r,r]
     MO_VS_Neg _ r -> [r]

+    MO_VF_Insert  l r -> [typeWidth (vec l (cmmFloat r)),r,wordWidth dflags]
+    MO_VF_Extract l r -> [typeWidth (vec l (cmmFloat r)),wordWidth dflags]
+
     MO_VF_Add _ r -> [r,r]
     MO_VF_Sub _ r -> [r,r]
     MO_VF_Mul _ r -> [r,r]
diff --git a/compiler/cmm/PprC.hs b/compiler/cmm/PprC.hs
index 8712d5f..cda68ef 100644
--- a/compiler/cmm/PprC.hs
+++ b/compiler/cmm/PprC.hs
@@ -661,6 +661,15 @@ pprMachOp_for_C mop = case mop of
                         (panic $ "PprC.pprMachOp_for_C: MO_VS_Neg"
                               ++ " should have been handled earlier!")

+        MO_VF_Insert {} -> pprTrace "offending mop:"
+                                (ptext $ sLit "MO_VF_Insert")
+                                (panic $ "PprC.pprMachOp_for_C: MO_VF_Insert"
+                                      ++ " should have been handled earlier!")
+        MO_VF_Extract {} -> pprTrace "offending mop:"
+                                (ptext $ sLit "MO_VF_Extract")
+                                (panic $ "PprC.pprMachOp_for_C: MO_VF_Extract"
+                                      ++ " should have been handled earlier!")
+
         MO_VF_Add {} -> pprTrace "offending mop:"
                                 (ptext $ sLit "MO_VF_Add")
                                 (panic $ "PprC.pprMachOp_for_C: MO_VF_Add"
diff --git a/compiler/cmm/PprCmmExpr.hs b/compiler/cmm/PprCmmExpr.hs
index 3c9fa06..d1128b0 100644
--- a/compiler/cmm/PprCmmExpr.hs
+++ b/compiler/cmm/PprCmmExpr.hs
@@ -255,6 +255,7 @@ pprGlobalReg gr
     FloatReg n  -> char 'F' <> int n
     DoubleReg n -> char 'D' <> int n
     LongReg n   -> char 'L' <> int n
+    XmmReg n    -> ptext (sLit "XMM") <> int n
     Sp    -> ptext (sLit "Sp")
     SpLim -> ptext (sLit "SpLim")
     Hp    -> ptext (sLit "Hp")
diff --git a/compiler/codeGen/CgUtils.hs b/compiler/codeGen/CgUtils.hs
index bdb7f69..c06dd60 100644
--- a/compiler/codeGen/CgUtils.hs
+++ b/compiler/codeGen/CgUtils.hs
@@ -49,6 +49,13 @@ baseRegOffset dflags (DoubleReg 4) = oFFSET_StgRegTable_rD4 dflags
 baseRegOffset dflags (DoubleReg 5) = oFFSET_StgRegTable_rD5 dflags
 baseRegOffset dflags (DoubleReg 6) = oFFSET_StgRegTable_rD6 dflags
 baseRegOffset _      (DoubleReg n) = panic ("Registers above D6 are not supported (tried to use D" ++ show n ++ ")")
+baseRegOffset dflags (XmmReg 1)    = oFFSET_StgRegTable_rXMM1 dflags
+baseRegOffset dflags (XmmReg 2)    = oFFSET_StgRegTable_rXMM2 dflags
+baseRegOffset dflags (XmmReg 3)    = oFFSET_StgRegTable_rXMM3 dflags
+baseRegOffset dflags (XmmReg 4)    = oFFSET_StgRegTable_rXMM4 dflags
+baseRegOffset dflags (XmmReg 5)    = oFFSET_StgRegTable_rXMM5 dflags
+baseRegOffset dflags (XmmReg 6)    = oFFSET_StgRegTable_rXMM6 dflags
+baseRegOffset _      (XmmReg n)    = panic ("Registers above XMM6 are not supported (tried to use XMM" ++ show n ++ ")")
 baseRegOffset dflags Sp            = oFFSET_StgRegTable_rSp dflags
 baseRegOffset dflags SpLim         = oFFSET_StgRegTable_rSpLim dflags
 baseRegOffset dflags (LongReg 1)   = oFFSET_StgRegTable_rL1 dflags
diff --git a/compiler/codeGen/StgCmmPrim.hs b/compiler/codeGen/StgCmmPrim.hs
index 10a514b..4e0d773 100644
--- a/compiler/codeGen/StgCmmPrim.hs
+++ b/compiler/codeGen/StgCmmPrim.hs
@@ -1183,8 +1183,11 @@ doVecPackOp maybe_pre_write_cast ty z es res = do
     vecPack src (e : es) i = do
         dst <- newTemp ty
-        emitAssign (CmmLocal dst) (CmmMachOp (MO_V_Insert len wid)
-                                             [CmmReg (CmmLocal src), cast e, iLit])
+        if isFloatType (vecElemType ty)
+          then emitAssign (CmmLocal dst) (CmmMachOp (MO_VF_Insert len wid)
+                                                    [CmmReg (CmmLocal src), cast e, iLit])
+          else emitAssign (CmmLocal dst) (CmmMachOp (MO_V_Insert len wid)
+                                                    [CmmReg (CmmLocal src), cast e, iLit])
         vecPack dst es (i + 1)
       where
        -- vector indices are always 32-bits
@@ -1214,8 +1217,11 @@ doVecUnpackOp maybe_post_read_cast ty e res =
       return ()

     vecUnpack (r : rs) i = do
-        emitAssign (CmmLocal r) (cast (CmmMachOp (MO_V_Extract len wid)
-                                                 [e, iLit]))
+        if isFloatType (vecElemType ty)
+          then emitAssign (CmmLocal r) (cast (CmmMachOp (MO_VF_Extract len wid)
+                                                        [e, iLit]))
+          else emitAssign (CmmLocal r) (cast (CmmMachOp (MO_V_Extract len wid)
+                                                        [e, iLit]))
         vecUnpack rs (i + 1)
       where
        -- vector indices are always 32-bits
@@ -1244,7 +1250,9 @@ doVecInsertOp maybe_pre_write_cast ty src e idx res = do
     -- vector indices are always 32-bits
     let idx' :: CmmExpr
         idx' = CmmMachOp (MO_SS_Conv (wordWidth dflags) W32) [idx]
-    emitAssign (CmmLocal res) (CmmMachOp (MO_V_Insert len wid) [src, cast e, idx'])
+    if isFloatType (vecElemType ty)
+      then emitAssign (CmmLocal res) (CmmMachOp (MO_VF_Insert len wid) [src, cast e, idx'])
+      else emitAssign (CmmLocal res) (CmmMachOp (MO_V_Insert len wid) [src, cast e, idx'])
   where
     cast :: CmmExpr -> CmmExpr
     cast val = case maybe_pre_write_cast of
diff --git a/compiler/llvmGen/LlvmCodeGen/Base.hs b/compiler/llvmGen/LlvmCodeGen/Base.hs
index 1457efe..bcfce34 100644
--- a/compiler/llvmGen/LlvmCodeGen/Base.hs
+++ b/compiler/llvmGen/LlvmCodeGen/Base.hs
@@ -131,11 +131,12 @@ llvmFunArgs :: DynFlags -> LiveGlobalRegs -> [LlvmVar]
 llvmFunArgs dflags live =
     map (lmGlobalRegArg dflags) (filter isPassed (activeStgRegs platform))
   where platform = targetPlatform dflags
-        isLive r = not (isFloat r) || r `elem` alwaysLive || r `elem` live
-        isPassed r = not (isFloat r) || isLive r
-        isFloat (FloatReg _)  = True
-        isFloat (DoubleReg _) = True
-        isFloat _             = False
+        isLive r = not (isSSE r) || r `elem` alwaysLive || r `elem` live
+        isPassed r = not (isSSE r) || isLive r
+        isSSE (FloatReg _)  = True
+        isSSE (DoubleReg _) = True
+        isSSE (XmmReg _)    = True
+        isSSE _             = False

 -- | Llvm standard fun attributes
 llvmStdFunAttrs :: [LlvmFuncAttr]
diff --git a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
index efa7e9a..969bca8 100644
--- a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
+++ b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
@@ -470,6 +470,7 @@ castVar dflags v t
             (vt, _) | isInt vt && isPointer t     -> LM_Inttoptr
             (vt, _) | isPointer vt && isInt t     -> LM_Ptrtoint
             (vt, _) | isPointer vt && isPointer t -> LM_Bitcast
+            (vt, _) | isVector vt && isVector t   -> LM_Bitcast

             (vt, _) -> panic $ "castVars: Can't cast this type ("
                         ++ show vt ++ ") to (" ++ show t ++ ")"
@@ -582,16 +583,21 @@ genAssign env reg val = do
     let stmts = stmts1 `appOL` stmts2

     let ty = (pLower . getVarType) vreg
-    case isPointer ty && getVarType vval == llvmWord dflags of
-         -- Some registers are pointer types, so need to cast value to pointer
-         True -> do
-             (v, s1) <- doExpr ty $ Cast LM_Inttoptr vval ty
-             let s2 = Store v vreg
-             return (env2, stmts `snocOL` s1 `snocOL` s2, top1 ++ top2)
+    case ty of
+      -- Some registers are pointer types, so need to cast value to pointer
+      LMPointer _ | getVarType vval == llvmWord dflags -> do
+           (v, s1) <- doExpr ty $ Cast LM_Inttoptr vval ty
+           let s2 = Store v vreg
+           return (env2, stmts `snocOL` s1 `snocOL` s2, top1 ++ top2)

-         False -> do
-             let s1 = Store vval vreg
-             return (env2, stmts `snocOL` s1, top1 ++ top2)
+      LMVector _ _ -> do
+           (v, s1) <- doExpr ty $ Cast LM_Bitcast vval ty
+           let s2 = Store v vreg
+           return (env2, stmts `snocOL` s1 `snocOL` s2, top1 ++ top2)
+
+      _ -> do
+           let s1 = Store vval vreg
+           return (env2, stmts `snocOL` s1, top1 ++ top2)


 -- | CmmStore operation
@@ -884,14 +890,14 @@ genMachOp env _ op [x] = case op of
                 vecty = LMVector len ty
                 all0  = LMIntLit (-0) ty
                 all0s = LMLitVar $ LMVectorLit (replicate len all0)
-            in negate vecty all0s LM_MO_Sub
+            in negateVec vecty all0s LM_MO_Sub

         MO_VF_Neg len w ->
             let ty    = widthToLlvmFloat w
                 vecty = LMVector len ty
                 all0  = LMFloatLit (-0) ty
                 all0s = LMLitVar $ LMVectorLit (replicate len all0)
-            in negate vecty all0s LM_MO_FSub
+            in negateVec vecty all0s LM_MO_FSub

         -- Handle unsupported cases explicitly so we get a warning
         -- of missing case when new MachOps added
@@ -943,6 +949,9 @@ genMachOp env _ op [x] = case op of
         MO_VS_Quot _ _ -> panicOp
         MO_VS_Rem  _ _ -> panicOp

+        MO_VF_Insert  _ _ -> panicOp
+        MO_VF_Extract _ _ -> panicOp
+
         MO_VF_Add _ _ -> panicOp
         MO_VF_Sub _ _ -> panicOp
@@ -957,6 +966,12 @@ genMachOp env _ op [x] = case op of
            (v1, s1) <- doExpr ty $ LlvmOp negOp v2 vx
            return (env', v1, stmts `snocOL` s1, top)

+        negateVec ty v2 negOp = do
+           (env', vx, stmts1, top) <- exprToVar env x
+           ([vx'], stmts2) <- castVars dflags [(vx, ty)]
+           (v1, s1) <- doExpr ty $ LlvmOp negOp v2 vx'
+           return (env', v1, stmts1 `appOL` stmts2 `snocOL` s1, top)
+
         fiConv ty convOp = do
            (env', vx, stmts, top) <- exprToVar env x
            (v1, s1) <- doExpr ty $ Cast convOp vx ty
            return (env', v1, stmts `snocOL` s1, top)
@@ -1014,22 +1029,50 @@ genMachOp_fast env opt op r n e
 genMachOp_slow :: LlvmEnv -> EOption -> MachOp -> [CmmExpr] -> UniqSM ExprData

 -- Element extraction
-genMachOp_slow env _ (MO_V_Extract {}) [val, idx] = do
+genMachOp_slow env _ (MO_V_Extract l w) [val, idx] = do
+    (env1, vval, stmts1, top1) <- exprToVar env val
+    (env2, vidx, stmts2, top2) <- exprToVar env1 idx
+    ([vval'], stmts3) <- castVars dflags [(vval, LMVector l ty)]
+    (v1, s1) <- doExpr ty $ Extract vval' vidx
+    return (env2, v1, stmts1 `appOL` stmts2 `appOL` stmts3 `snocOL` s1, top1 ++ top2)
+  where
+    dflags = getDflags env
+    ty = widthToLlvmInt w
+
+genMachOp_slow env _ (MO_VF_Extract l w) [val, idx] = do
     (env1, vval, stmts1, top1) <- exprToVar env val
     (env2, vidx, stmts2, top2) <- exprToVar env1 idx
-    let (LMVector _ ty) = getVarType vval
-    (v1, s1) <- doExpr ty $ Extract vval vidx
-    return (env2, v1, stmts1 `appOL` stmts2 `snocOL` s1, top1 ++ top2)
+    ([vval'], stmts3) <- castVars dflags [(vval, LMVector l ty)]
+    (v1, s1) <- doExpr ty $ Extract vval' vidx
+    return (env2, v1, stmts1 `appOL` stmts2 `appOL` stmts3 `snocOL` s1, top1 ++ top2)
+  where
+    dflags = getDflags env
+    ty = widthToLlvmFloat w

 -- Element insertion
-genMachOp_slow env _ (MO_V_Insert {}) [val, elt, idx] = do
+genMachOp_slow env _ (MO_V_Insert l w) [val, elt, idx] = do
+    (env1, vval, stmts1, top1) <- exprToVar env val
+    (env2, velt, stmts2, top2) <- exprToVar env1 elt
+    (env3, vidx, stmts3, top3) <- exprToVar env2 idx
+    ([vval'], stmts4) <- castVars dflags [(vval, ty)]
+    (v1, s1) <- doExpr ty $ Insert vval' velt vidx
+    return (env3, v1, stmts1 `appOL` stmts2 `appOL` stmts3 `appOL` stmts4 `snocOL` s1,
+            top1 ++ top2 ++ top3)
+  where
+    dflags = getDflags env
+    ty = LMVector l (widthToLlvmInt w)
+
+genMachOp_slow env _ (MO_VF_Insert l w) [val, elt, idx] = do
     (env1, vval, stmts1, top1) <- exprToVar env val
     (env2, velt, stmts2, top2) <- exprToVar env1 elt
     (env3, vidx, stmts3, top3) <- exprToVar env2 idx
-    let ty = getVarType vval
-    (v1, s1) <- doExpr ty $ Insert vval velt vidx
-    return (env3, v1, stmts1 `appOL` stmts2 `appOL` stmts3 `snocOL` s1,
+    ([vval'], stmts4) <- castVars dflags [(vval, ty)]
+    (v1, s1) <- doExpr ty $ Insert vval' velt vidx
+    return (env3, v1, stmts1 `appOL` stmts2 `appOL` stmts3 `appOL` stmts4 `snocOL` s1,
             top1 ++ top2 ++ top3)
+  where
+    dflags = getDflags env
+    ty = LMVector l (widthToLlvmFloat w)

 -- Binary MachOp
 genMachOp_slow env opt op [x, y] = case op of
@@ -1080,17 +1123,17 @@ genMachOp_slow env opt op [x, y] = case op of
     MO_U_Shr _ -> genBinMach LM_MO_LShr
     MO_S_Shr _ -> genBinMach LM_MO_AShr

-    MO_V_Add _ _   -> genBinMach LM_MO_Add
-    MO_V_Sub _ _   -> genBinMach LM_MO_Sub
-    MO_V_Mul _ _   -> genBinMach LM_MO_Mul
+    MO_V_Add l w   -> genCastBinMach (LMVector l (widthToLlvmInt w)) LM_MO_Add
+    MO_V_Sub l w   -> genCastBinMach (LMVector l (widthToLlvmInt w)) LM_MO_Sub
+    MO_V_Mul l w   -> genCastBinMach (LMVector l (widthToLlvmInt w)) LM_MO_Mul

-    MO_VS_Quot _ _ -> genBinMach LM_MO_SDiv
-    MO_VS_Rem _ _  -> genBinMach LM_MO_SRem
+    MO_VS_Quot l w -> genCastBinMach (LMVector l (widthToLlvmInt w)) LM_MO_SDiv
+    MO_VS_Rem l w  -> genCastBinMach (LMVector l (widthToLlvmInt w)) LM_MO_SRem

-    MO_VF_Add _ _  -> genBinMach LM_MO_FAdd
-    MO_VF_Sub _ _  -> genBinMach LM_MO_FSub
-    MO_VF_Mul _ _  -> genBinMach LM_MO_FMul
-    MO_VF_Quot _ _ -> genBinMach LM_MO_FDiv
+    MO_VF_Add l w  -> genCastBinMach (LMVector l (widthToLlvmFloat w)) LM_MO_FAdd
+    MO_VF_Sub l w  -> genCastBinMach (LMVector l (widthToLlvmFloat w)) LM_MO_FSub
+    MO_VF_Mul l w  -> genCastBinMach (LMVector l (widthToLlvmFloat w)) LM_MO_FMul
+    MO_VF_Quot l w -> genCastBinMach (LMVector l (widthToLlvmFloat w)) LM_MO_FDiv

     MO_Not _   -> panicOp
     MO_S_Neg _ -> panicOp
@@ -1107,6 +1150,9 @@ genMachOp_slow env opt op [x, y] = case op of

     MO_VS_Neg {} -> panicOp

+    MO_VF_Insert {}  -> panicOp
+    MO_VF_Extract {} -> panicOp
+
     MO_VF_Neg {} -> panicOp

     where
@@ -1134,6 +1180,14 @@ genMachOp_slow env opt op [x, y] = case op of
                        `snocOL` dy `snocOL` s1
            return (env2, v1, allStmts, top1 ++ top2)

+        binCastLlvmOp ty binOp = do
+            (env1, vx, stmts1, top1) <- exprToVar env x
+            (env2, vy, stmts2, top2) <- exprToVar env1 y
+            ([vx', vy'], stmts3) <- castVars dflags [(vx, ty), (vy, ty)]
+            (v1, s1) <- doExpr ty $ binOp vx' vy'
+            return (env2, v1, stmts1 `appOL` stmts2 `appOL` stmts3 `snocOL` s1,
+                    top1 ++ top2)
+
         -- | Need to use EOption here as Cmm expects word size results from
         -- comparisons while LLVM return i1. Need to extend to llvmWord type
         -- if expected. See Note [Literals and branch conditions].
@@ -1152,6 +1206,8 @@ genMachOp_slow env opt op [x, y] = case op of

         genBinMach op = binLlvmOp getVarType (LlvmOp op)

+        genCastBinMach ty op = binCastLlvmOp ty (LlvmOp op)
+
         -- | Detect if overflow will occur in signed multiply of the two
         -- CmmExpr's. This is the LLVM assembly equivalent of the NCG
         -- implementation. Its much longer due to type information/safety.
@@ -1427,10 +1483,11 @@ funEpilogue env live = do
     dflags = getDflags env
     platform = targetPlatform dflags
     isLive r = r `elem` alwaysLive || r `elem` live
-    isPassed r = not (isFloat r) || isLive r
-    isFloat (FloatReg _)  = True
-    isFloat (DoubleReg _) = True
-    isFloat _             = False
+    isPassed r = not (isSSE r) || isLive r
+    isSSE (FloatReg _)  = True
+    isSSE (DoubleReg _) = True
+    isSSE (XmmReg _)    = True
+    isSSE _             = False
     loadExpr r | isLive r = do
       let reg = lmGlobalRegVar dflags r
       (v,s) <- doExpr (pLower $ getVarType reg) $ Load reg
diff --git a/compiler/llvmGen/LlvmCodeGen/Regs.hs b/compiler/llvmGen/LlvmCodeGen/Regs.hs
index e6cfcb2..7271c2f 100644
--- a/compiler/llvmGen/LlvmCodeGen/Regs.hs
+++ b/compiler/llvmGen/LlvmCodeGen/Regs.hs
@@ -55,6 +55,12 @@ lmGlobalReg dflags suf reg
        DoubleReg 4    -> doubleGlobal $ "D4" ++ suf
        DoubleReg 5    -> doubleGlobal $ "D5" ++ suf
        DoubleReg 6    -> doubleGlobal $ "D6" ++ suf
+       XmmReg 1       -> xmmGlobal $ "XMM1" ++ suf
+       XmmReg 2       -> xmmGlobal $ "XMM2" ++ suf
+       XmmReg 3       -> xmmGlobal $ "XMM3" ++ suf
+       XmmReg 4       -> xmmGlobal $ "XMM4" ++ suf
+       XmmReg 5       -> xmmGlobal $ "XMM5" ++ suf
+       XmmReg 6       -> xmmGlobal $ "XMM6" ++ suf
        _other         -> panic $ "LlvmCodeGen.Reg: GlobalReg (" ++ (show reg)
                                ++ ") not supported!"
        -- LongReg, HpLim, CCSS, CurrentTSO, CurrentNusery, HpAlloc
@@ -64,6 +70,7 @@ lmGlobalReg dflags suf reg
        ptrGlobal    name = LMNLocalVar (fsLit name) (llvmWordPtr dflags)
        floatGlobal  name = LMNLocalVar (fsLit name) LMFloat
        doubleGlobal name = LMNLocalVar (fsLit name) LMDouble
+       xmmGlobal    name = LMNLocalVar (fsLit name) (LMVector 4 (LMInt 32))

 -- | A list of STG Registers that should always be considered alive
 alwaysLive :: [GlobalReg]
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index 0df95a2..4177cad 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -602,19 +602,21 @@ getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
       MO_FS_Conv from to -> coerceFP2Int from to x
       MO_SF_Conv from to -> coerceInt2FP from to x

-      MO_V_Insert {}   -> needLlvm
-      MO_V_Extract {}  -> needLlvm
-      MO_V_Add {}      -> needLlvm
-      MO_V_Sub {}      -> needLlvm
-      MO_V_Mul {}      -> needLlvm
-      MO_VS_Quot {}    -> needLlvm
-      MO_VS_Rem {}     -> needLlvm
-      MO_VS_Neg {}     -> needLlvm
-      MO_VF_Add {}     -> needLlvm
-      MO_VF_Sub {}     -> needLlvm
-      MO_VF_Mul {}     -> needLlvm
-      MO_VF_Quot {}    -> needLlvm
-      MO_VF_Neg {}     -> needLlvm
+      MO_V_Insert {}   -> needLlvm
+      MO_V_Extract {}  -> needLlvm
+      MO_V_Add {}      -> needLlvm
+      MO_V_Sub {}      -> needLlvm
+      MO_V_Mul {}      -> needLlvm
+      MO_VS_Quot {}    -> needLlvm
+      MO_VS_Rem {}     -> needLlvm
+      MO_VS_Neg {}     -> needLlvm
+      MO_VF_Insert {}  -> needLlvm
+      MO_VF_Extract {} -> needLlvm
+      MO_VF_Add {}     -> needLlvm
+      MO_VF_Sub {}     -> needLlvm
+      MO_VF_Mul {}     -> needLlvm
+      MO_VF_Quot {}    -> needLlvm
+      MO_VF_Neg {}     -> needLlvm

       _other -> pprPanic "getRegister" (pprMachOp mop)
   where
@@ -708,19 +710,21 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       MO_U_Shr rep -> shift_code rep SHR x y {-False-}
       MO_S_Shr rep -> shift_code rep SAR x y {-False-}

-      MO_V_Insert {}   -> needLlvm
-      MO_V_Extract {}  -> needLlvm
-      MO_V_Add {}      -> needLlvm
-      MO_V_Sub {}      -> needLlvm
-      MO_V_Mul {}      -> needLlvm
-      MO_VS_Quot {}    -> needLlvm
-      MO_VS_Rem {}     -> needLlvm
-      MO_VS_Neg {}     -> needLlvm
-      MO_VF_Add {}     -> needLlvm
-      MO_VF_Sub {}     -> needLlvm
-      MO_VF_Mul {}     -> needLlvm
-      MO_VF_Quot {}    -> needLlvm
-      MO_VF_Neg {}     -> needLlvm
+      MO_V_Insert {}   -> needLlvm
+      MO_V_Extract {}  -> needLlvm
+      MO_V_Add {}      -> needLlvm
+      MO_V_Sub {}      -> needLlvm
+      MO_V_Mul {}      -> needLlvm
+      MO_VS_Quot {}    -> needLlvm
+      MO_VS_Rem {}     -> needLlvm
+      MO_VS_Neg {}     -> needLlvm
+      MO_VF_Insert {}  -> needLlvm
+      MO_VF_Extract {} -> needLlvm
+      MO_VF_Add {}     -> needLlvm
+      MO_VF_Sub {}     -> needLlvm
+      MO_VF_Mul {}     -> needLlvm
+      MO_VF_Quot {}    -> needLlvm
+      MO_VF_Neg {}     -> needLlvm

       _other -> pprPanic "getRegister(x86) - binary CmmMachOp (1)" (pprMachOp mop)
   where
@@ -2722,21 +2726,23 @@ sse2NegCode w x = do
     return (Any sz code)

 isVecExpr :: CmmExpr -> Bool
-isVecExpr (CmmMachOp (MO_V_Insert {}) _)   = True
-isVecExpr (CmmMachOp (MO_V_Extract {}) _)  = True
-isVecExpr (CmmMachOp (MO_V_Add {}) _)      = True
-isVecExpr (CmmMachOp (MO_V_Sub {}) _)      = True
-isVecExpr (CmmMachOp (MO_V_Mul {}) _)      = True
-isVecExpr (CmmMachOp (MO_VS_Quot {}) _)    = True
-isVecExpr (CmmMachOp (MO_VS_Rem {}) _)     = True
-isVecExpr (CmmMachOp (MO_VS_Neg {}) _)     = True
-isVecExpr (CmmMachOp (MO_VF_Add {}) _)     = True
-isVecExpr (CmmMachOp (MO_VF_Sub {}) _)     = True
-isVecExpr (CmmMachOp (MO_VF_Mul {}) _)     = True
-isVecExpr (CmmMachOp (MO_VF_Quot {}) _)    = True
-isVecExpr (CmmMachOp (MO_VF_Neg {}) _)     = True
-isVecExpr (CmmMachOp _ [e])                = isVecExpr e
-isVecExpr _                                = False
+isVecExpr (CmmMachOp (MO_V_Insert {}) _)   = True
+isVecExpr (CmmMachOp (MO_V_Extract {}) _)  = True
+isVecExpr (CmmMachOp (MO_V_Add {}) _)      = True
+isVecExpr (CmmMachOp (MO_V_Sub {}) _)      = True
+isVecExpr (CmmMachOp (MO_V_Mul {}) _)      = True
+isVecExpr (CmmMachOp (MO_VS_Quot {}) _)    = True
+isVecExpr (CmmMachOp (MO_VS_Rem {}) _)     = True
+isVecExpr (CmmMachOp (MO_VS_Neg {}) _)     = True
+isVecExpr (CmmMachOp (MO_VF_Insert {}) _)  = True
+isVecExpr (CmmMachOp (MO_VF_Extract {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Add {}) _)     = True
+isVecExpr (CmmMachOp (MO_VF_Sub {}) _)     = True
+isVecExpr (CmmMachOp (MO_VF_Mul {}) _)     = True
+isVecExpr (CmmMachOp (MO_VF_Quot {}) _)    = True
+isVecExpr (CmmMachOp (MO_VF_Neg {}) _)     = True
+isVecExpr (CmmMachOp _ [e])                = isVecExpr e
+isVecExpr _                                = False

 needLlvm :: NatM a
 needLlvm =
diff --git a/includes/CodeGen.Platform.hs b/includes/CodeGen.Platform.hs
index 14642bd..beff196 100644
--- a/includes/CodeGen.Platform.hs
+++ b/includes/CodeGen.Platform.hs
@@ -388,36 +388,54 @@ activeStgRegs = [
 #ifdef REG_D1
     ,DoubleReg 1
 #endif
+#ifdef REG_XMM1
+    ,XmmReg 1
+#endif
 #ifdef REG_F2
     ,FloatReg 2
 #endif
 #ifdef REG_D2
     ,DoubleReg 2
 #endif
+#ifdef REG_XMM2
+    ,XmmReg 2
+#endif
 #ifdef REG_F3
     ,FloatReg 3
 #endif
 #ifdef REG_D3
     ,DoubleReg 3
 #endif
+#ifdef REG_XMM3
+    ,XmmReg 3
+#endif
 #ifdef REG_F4
     ,FloatReg 4
 #endif
 #ifdef REG_D4
     ,DoubleReg 4
 #endif
+#ifdef REG_XMM4
+    ,XmmReg 4
+#endif
 #ifdef REG_F5
     ,FloatReg 5
 #endif
 #ifdef REG_D5
     ,DoubleReg 5
 #endif
+#ifdef REG_XMM5
+    ,XmmReg 5
+#endif
 #ifdef REG_F6
     ,FloatReg 6
 #endif
 #ifdef REG_D6
     ,DoubleReg 6
 #endif
+#ifdef REG_XMM6
+    ,XmmReg 6
+#endif
 #else /* MAX_REAL_SSE_REG == 0 */
 #ifdef REG_F1
     ,FloatReg 1
@@ -569,6 +587,14 @@ globalRegMaybe (DoubleReg 6)    = Just (RealRegSingle REG_D6)
 # endif
 # endif
+#if MAX_REAL_SSE_REG != 0
+globalRegMaybe (XmmReg 1)       = Just (RealRegSingle REG_XMM1)
+globalRegMaybe (XmmReg 2)       = Just (RealRegSingle REG_XMM2)
+globalRegMaybe (XmmReg 3)       = Just (RealRegSingle REG_XMM3)
+globalRegMaybe (XmmReg 4)       = Just (RealRegSingle REG_XMM4)
+globalRegMaybe (XmmReg 5)       = Just (RealRegSingle REG_XMM5)
+globalRegMaybe (XmmReg 6)       = Just (RealRegSingle REG_XMM6)
+# endif
 # ifdef REG_Sp
 globalRegMaybe Sp               = Just (RealRegSingle REG_Sp)
 # endif
diff --git a/includes/stg/MachRegs.h b/includes/stg/MachRegs.h
index 6dc81f6..76bdb1f 100644
--- a/includes/stg/MachRegs.h
+++ b/includes/stg/MachRegs.h
@@ -167,12 +167,12 @@
 #define REG_D5    xmm5
 #define REG_D6    xmm6

-#define REG_SSE1    xmm1
-#define REG_SSE2    xmm2
-#define REG_SSE3    xmm3
-#define REG_SSE4    xmm4
-#define REG_SSE5    xmm5
-#define REG_SSE6    xmm6
+#define REG_XMM1    xmm1
+#define REG_XMM2    xmm2
+#define REG_XMM3    xmm3
+#define REG_XMM4    xmm4
+#define REG_XMM5    xmm5
+#define REG_XMM6    xmm6

 #if !defined(mingw32_HOST_OS)
 #define CALLER_SAVES_R3
@@ -199,13 +199,13 @@
 #define CALLER_SAVES_D6
 #endif

-#define CALLER_SAVES_SSE1
-#define CALLER_SAVES_SSE2
-#define CALLER_SAVES_SSE3
-#define CALLER_SAVES_SSE4
-#define CALLER_SAVES_SSE5
+#define CALLER_SAVES_XMM1
+#define CALLER_SAVES_XMM2
+#define CALLER_SAVES_XMM3
+#define CALLER_SAVES_XMM4
+#define CALLER_SAVES_XMM5
 #if !defined(mingw32_HOST_OS)
-#define CALLER_SAVES_SSE6
+#define CALLER_SAVES_XMM6
 #endif

 #define MAX_REAL_VANILLA_REG 6
diff --git a/includes/stg/Regs.h b/includes/stg/Regs.h
index fd1577e..10ae285 100644
--- a/includes/stg/Regs.h
+++ b/includes/stg/Regs.h
@@ -81,6 +81,12 @@ typedef struct {
   StgDouble rD4;
   StgDouble rD5;
   StgDouble rD6;
+  StgWord128 rXMM1;
+  StgWord128 rXMM2;
+  StgWord128 rXMM3;
+  StgWord128 rXMM4;
+  StgWord128 rXMM5;
+  StgWord128 rXMM6;
   StgWord64 rL1;
   StgPtr rSp;
   StgPtr rSpLim;
@@ -270,6 +276,42 @@ GLOBAL_REG_DECL(StgDouble,D6,REG_D6)
 #define D6 (BaseReg->rD6)
 #endif

+#if defined(REG_XMM1) && !defined(NO_GLOBAL_REG_DECLS)
+GLOBAL_REG_DECL(StgWord128,XMM1,REG_XMM1)
+#else
+#define XMM1 (BaseReg->rXMM1)
+#endif
+
+#if defined(REG_XMM2) && !defined(NO_GLOBAL_REG_DECLS)
+GLOBAL_REG_DECL(StgWord128,XMM2,REG_XMM2)
+#else
+#define XMM2 (BaseReg->rXMM2)
+#endif
+
+#if defined(REG_XMM3) && !defined(NO_GLOBAL_REG_DECLS)
+GLOBAL_REG_DECL(StgWord128,XMM3,REG_XMM3)
+#else
+#define XMM3 (BaseReg->rXMM3)
+#endif
+
+#if defined(REG_XMM4) && !defined(NO_GLOBAL_REG_DECLS)
+GLOBAL_REG_DECL(StgWord128,XMM4,REG_XMM4)
+#else
+#define XMM4 (BaseReg->rXMM4)
+#endif
+
+#if defined(REG_XMM5) && !defined(NO_GLOBAL_REG_DECLS)
+GLOBAL_REG_DECL(StgWord128,XMM5,REG_XMM5)
+#else
+#define XMM5 (BaseReg->rXMM5)
+#endif
+
+#if defined(REG_XMM6) && !defined(NO_GLOBAL_REG_DECLS)
+GLOBAL_REG_DECL(StgWord128,XMM6,REG_XMM6)
+#else
+#define XMM6 (BaseReg->rXMM6)
+#endif
+
 #if defined(REG_L1) && !defined(NO_GLOBAL_REG_DECLS)
 GLOBAL_REG_DECL(StgWord64,L1,REG_L1)
 #else
diff --git a/includes/stg/Types.h b/includes/stg/Types.h
index d6bdc90..ccc06a1 100644
--- a/includes/stg/Types.h
+++ b/includes/stg/Types.h
@@ -83,6 +83,8 @@ typedef unsigned long long int StgWord64;
 #error cannot find a way to define StgInt64
 #endif

+typedef struct { StgWord64 h; StgWord64 l; } StgWord128;
+
 /*
  * Define the standard word size we'll use on this machine: make it
  * big enough to hold a pointer.
diff --git a/utils/deriveConstants/DeriveConstants.hs b/utils/deriveConstants/DeriveConstants.hs
index 66c1f0e..e726bf7 100644
--- a/utils/deriveConstants/DeriveConstants.hs
+++ b/utils/deriveConstants/DeriveConstants.hs
@@ -307,6 +307,12 @@ wanteds = concat
           ,fieldOffset Both "StgRegTable" "rD4"
           ,fieldOffset Both "StgRegTable" "rD5"
           ,fieldOffset Both "StgRegTable" "rD6"
+          ,fieldOffset Both "StgRegTable" "rXMM1"
+          ,fieldOffset Both "StgRegTable" "rXMM2"
+          ,fieldOffset Both "StgRegTable" "rXMM3"
+          ,fieldOffset Both "StgRegTable" "rXMM4"
+          ,fieldOffset Both "StgRegTable" "rXMM5"
+          ,fieldOffset Both "StgRegTable" "rXMM6"
           ,fieldOffset Both "StgRegTable" "rL1"
           ,fieldOffset Both "StgRegTable" "rSp"
           ,fieldOffset Both "StgRegTable" "rSpLim"
-- 
1.9.1
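
Editor's note (not part of the patch): the "aggressively bitcast" strategy from the
commit message can be illustrated with a small, self-contained Haskell sketch. The
types and functions below (ElemTy, VecTy, Operand, bitcastTo, vfAdd) are simplified
stand-ins invented for illustration, not the compiler's real LlvmType/LlvmVar
machinery; the point is only that an XMM STG register is always exposed at a single
fixed type (here <4 x i32>) and a floating-point vector operation first bitcasts its
operands to the element type it needs, mirroring what castVars and genCastBinMach do
in the patched LlvmCodeGen/CodeGen.hs.

    -- Illustrative sketch only, with hypothetical simplified types.
    module Main where

    data ElemTy = I32 | F32 deriving (Eq, Show)
    data VecTy  = VecTy Int ElemTy deriving (Eq, Show)

    data Operand = Operand { opName :: String, opTy :: VecTy } deriving Show

    -- Every 128-bit XMM STG register is handed to generated code at one fixed
    -- type, <4 x i32>, because STG registers carry no element-type information.
    xmmRegTy :: VecTy
    xmmRegTy = VecTy 4 I32

    -- Emit a bitcast only when the operand's current type differs from the type
    -- the operation needs (the role played by castVars/LM_Bitcast).
    bitcastTo :: VecTy -> Operand -> ([String], Operand)
    bitcastTo ty op
      | opTy op == ty = ([], op)
      | otherwise     = ( [ opName op ++ ".cast = bitcast " ++ show (opTy op)
                            ++ " to " ++ show ty ]
                        , op { opName = opName op ++ ".cast", opTy = ty } )

    -- A float-vector add: both operands are coerced to <4 x float> first, the
    -- way genCastBinMach handles MO_VF_Add.
    vfAdd :: Operand -> Operand -> ([String], Operand)
    vfAdd x y =
      let fty      = VecTy 4 F32
          (sx, x') = bitcastTo fty x
          (sy, y') = bitcastTo fty y
      in (sx ++ sy ++ ["r = fadd " ++ opName x' ++ ", " ++ opName y'], Operand "r" fty)

    main :: IO ()
    main = mapM_ putStrLn
             (fst (vfAdd (Operand "XMM1" xmmRegTy) (Operand "XMM2" xmmRegTy)))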