Changeset 214187 in webkit

- Timestamp: Mar 20, 2017 11:58:59 AM
- Location: trunk/Source/JavaScriptCore
- Files: 9 edited
trunk/Source/JavaScriptCore/ChangeLog
r214145 → r214187

+2017-03-20  Filip Pizlo  <fpizlo@apple.com>
+
+        Graph coloring should use coalescable moves when spilling
+        https://bugs.webkit.org/show_bug.cgi?id=169820
+
+        Reviewed by Michael Saboff.
+
+        This makes our graph coloring register allocator use a new family of move instructions when
+        spilling both operands of the move. It's a three-operand move:
+
+            Move (src), (dst), %scratch
+
+        Previously, if both operands got spilled, we would emit a new instruction to load or store that
+        spill slot. But this made it hard for allocateStack to see that the two spill locations are
+        coalescable. This new kind of instruction makes it obvious that it's a coalescable move.
+
+        This change implements the coalescing of spill slots inside allocateStack.
+
+        This is an outrageous speed-up on the tsf_ir_speed benchmark from http://filpizlo.com/tsf/. This
+        is an interesting benchmark because it has a super ugly interpreter loop with ~20 live variables
+        carried around the loop back edge. This change makes that interpreter run 5x faster.
+
+        This isn't a speed-up on any other benchmark, and it doesn't regress anything. Compile time is
+        neither progressed nor regressed, since the coalescing is super cheap and this does not add any
+        significant new machinery to the register allocator (it's just a small change to spill codegen).
+        Overall on our wasm benchmarks, this is a 16% throughput progression.
+
+        * assembler/MacroAssembler.h:
+        (JSC::MacroAssembler::move):
+        (JSC::MacroAssembler::move32):
+        (JSC::MacroAssembler::moveFloat):
+        (JSC::MacroAssembler::moveDouble):
+        * b3/air/AirAllocateRegistersByGraphColoring.cpp:
+        (JSC::B3::Air::allocateRegistersByGraphColoring):
+        * b3/air/AirAllocateStack.cpp:
+        (JSC::B3::Air::allocateStack):
+        * b3/air/AirInst.cpp:
+        (JSC::B3::Air::Inst::hasEarlyDef):
+        (JSC::B3::Air::Inst::hasLateUseOrDef):
+        (JSC::B3::Air::Inst::needsPadding):
+        * b3/air/AirInst.h:
+        * b3/air/AirOpcode.opcodes:
+        * b3/air/AirPadInterference.cpp:
+        (JSC::B3::Air::padInterference):
+        * runtime/Options.h:
+
 2017-03-19  Chris Dumez  <cdumez@apple.com>
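To make the new instruction concrete: a minimal, self-contained sketch (with hypothetical stand-in types, not the real MacroAssembler API; the actual helpers appear in the MacroAssembler.h hunk below) of what a three-operand spill-to-spill move boils down to:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Stand-ins for the assembler's Address and a general-purpose register.
    struct Address {
        uint8_t* base;
        ptrdiff_t offset;
    };

    // Move (src), (dst), %scratch: load into the scratch register, then store.
    // Keeping this as a single instruction lets allocateStack see that src and
    // dst are candidates for sharing one spill slot.
    void moveThroughScratch(Address src, Address dst, uint64_t& scratch)
    {
        std::memcpy(&scratch, src.base + src.offset, sizeof(scratch)); // loadPtr(src, scratch)
        std::memcpy(dst.base + dst.offset, &scratch, sizeof(scratch)); // storePtr(scratch, dst)
    }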
trunk/Source/JavaScriptCore/assembler/MacroAssembler.h
r213753 → r214187

     using MacroAssemblerBase::compare32;
     using MacroAssemblerBase::move;
+    using MacroAssemblerBase::moveDouble;
     using MacroAssemblerBase::add32;
     using MacroAssemblerBase::mul32;
…
     }

+    void move(Address src, Address dest, RegisterID scratch)
+    {
+        loadPtr(src, scratch);
+        storePtr(scratch, dest);
+    }
+
+    void move32(Address src, Address dest, RegisterID scratch)
+    {
+        load32(src, scratch);
+        store32(scratch, dest);
+    }
+
+    void moveFloat(Address src, Address dest, FPRegisterID scratch)
+    {
+        loadFloat(src, scratch);
+        storeFloat(scratch, dest);
+    }
+
+    void moveDouble(Address src, Address dest, FPRegisterID scratch)
+    {
+        loadDouble(src, scratch);
+        storeDouble(scratch, dest);
+    }
+
     // Ptr methods
     // On 32-bit platforms (i.e. x86), these methods directly map onto their 32-bit equivalents.
…
     move(Imm32(imm.asTrustedImmPtr()), dest);
 }
-    
+
 void comparePtr(RelationalCondition cond, RegisterID left, TrustedImm32 right, RegisterID dest)
 {
trunk/Source/JavaScriptCore/b3/air/AirAllocateRegistersByGraphColoring.cpp
r214109 → r214187

     }

-    ASSERT_WITH_MESSAGE(inst.args.size() == 2, "We assume coalecable moves only have two arguments in a few places.");
+    // Avoid the three-argument coalescable spill moves.
+    if (inst.args.size() != 2)
+        return false;

     if (!inst.args[0].isTmp() || !inst.args[1].isTmp())
…
     bool canUseMove32IfDidSpill = false;
     bool didSpill = false;
+    bool needScratch = false;
     if (bank == GP && inst.kind.opcode == Move) {
         if ((inst.args[0].isTmp() && m_tmpWidth.width(inst.args[0].tmp()) <= Width32)
…
     if (stackSlotEntry == stackSlots.end())
         return;
-    if (!inst.admitsStack(arg))
-        return;
+    bool needScratchIfSpilledInPlace = false;
+    if (!inst.admitsStack(arg)) {
+        if (traceDebug)
+            dataLog("Have an inst that won't admit stack: ", inst, "\n");
+        switch (inst.kind.opcode) {
+        case Move:
+        case MoveDouble:
+        case MoveFloat:
+        case Move32: {
+            unsigned argIndex = &arg - &inst.args[0];
+            unsigned otherArgIndex = argIndex ^ 1;
+            Arg otherArg = inst.args[otherArgIndex];
+            if (inst.args.size() == 2
+                && otherArg.isStack()
+                && otherArg.stackSlot()->isSpill()) {
+                needScratchIfSpilledInPlace = true;
+                break;
+            }
+            return;
+        }
+        default:
+            return;
+        }
+    }

     // If the Tmp holds a constant then we want to rematerialize its
…
     Width spillWidth = m_tmpWidth.requiredWidth(arg.tmp());
-    if (Arg::isAnyDef(role) && width < spillWidth)
+    if (Arg::isAnyDef(role) && width < spillWidth) {
+        // Either there are users of this tmp who will use more than width,
+        // or there are producers who will produce more than width non-zero
+        // bits.
+        // FIXME: It's not clear why we should have to return here. We have
+        // a ZDef fixup in allocateStack. And if this isn't a ZDef, then it
+        // doesn't seem like it matters what happens to the high bits. Note
+        // that this isn't the case where we're storing more than what the
+        // spill slot can hold - we already got that covered because we
+        // stretch the spill slot on demand. One possibility is that it's ZDefs of
+        // smaller width than 32-bit.
+        // https://bugs.webkit.org/show_bug.cgi?id=169823
         return;
+    }
     ASSERT(inst.kind.opcode == Move || !(Arg::isAnyUse(role) && width > spillWidth));
…
     arg = Arg::stack(stackSlotEntry->value);
     didSpill = true;
+    if (needScratchIfSpilledInPlace)
+        needScratch = true;
 });

 if (didSpill && canUseMove32IfDidSpill)
     inst.kind.opcode = Move32;

+if (needScratch) {
+    Bank instBank;
+    switch (inst.kind.opcode) {
+    case Move:
+    case Move32:
+        instBank = GP;
+        break;
+    case MoveDouble:
+    case MoveFloat:
+        instBank = FP;
+        break;
+    default:
+        RELEASE_ASSERT_NOT_REACHED();
+        instBank = GP;
+        break;
+    }
+
+    RELEASE_ASSERT(instBank == bank);
+
+    Tmp tmp = m_code.newTmp(bank);
+    unspillableTmps.add(AbsoluteTmpMapper<bank>::absoluteIndex(tmp));
+    inst.args.append(tmp);
+    RELEASE_ASSERT(inst.args.size() == 3);
+
+    // Without this, a chain of spill moves would need two registers, not one, because
+    // the scratch registers of successive moves would appear to interfere with each
+    // other. As well, we need this if the previous instruction had any late effects,
+    // since otherwise the scratch would appear to interfere with those. On the other
+    // hand, the late use added at the end of this spill move (previously it was just a
+    // late def) doesn't change the padding situation: the late def would have already
+    // caused it to report hasLateUseOrDef in Inst::needsPadding.
+    insertionSet.insert(instIndex, Nop, inst.origin);
+    continue;
+}
+
 // For every other case, add Load/Store as needed.
 inst.forEachTmp([&] (Tmp& tmp, Arg::Role role, Bank argBank, Width) {
…
 {
     PhaseScope phaseScope(code, "Air::allocateRegistersByGraphColoring");
+
+    if (false)
+        dataLog("Code before graph coloring:\n", code);

     UseCounts<Tmp> useCounts(code);
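A small aside on an idiom in the hunk above: for a two-argument move, the index of the "other" argument is computed as argIndex ^ 1, since XOR with 1 maps 0 to 1 and 1 to 0. A standalone check of that identity:

    #include <cassert>

    int main()
    {
        // For a two-argument instruction, XOR with 1 flips between the
        // indices of the two arguments.
        for (unsigned argIndex = 0; argIndex < 2; ++argIndex)
            assert((argIndex ^ 1) == 1 - argIndex);
        return 0;
    }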
trunk/Source/JavaScriptCore/b3/air/AirAllocateStack.cpp
r212970 → r214187

 }

+struct CoalescableMove {
+    CoalescableMove()
+    {
+    }
+
+    CoalescableMove(StackSlot* src, StackSlot* dst, double frequency)
+        : src(src)
+        , dst(dst)
+        , frequency(frequency)
+    {
+    }
+
+    bool operator==(const CoalescableMove& other) const
+    {
+        return src == other.src
+            && dst == other.dst
+            && frequency == other.frequency;
+    }
+
+    bool operator!=(const CoalescableMove& other) const
+    {
+        return !(*this == other);
+    }
+
+    explicit operator bool() const
+    {
+        return *this != CoalescableMove();
+    }
+
+    void dump(PrintStream& out) const
+    {
+        out.print(pointerDump(src), "->", pointerDump(dst), "(", frequency, ")");
+    }
+
+    StackSlot* src { nullptr };
+    StackSlot* dst { nullptr };
+    double frequency { PNaN };
+};
+
 } // anonymous namespace
…
     Vector<StackSlot*> slots;

+    // We will perform some spill coalescing. To make that effective, we need to be able to identify
+    // coalescable moves and handle them specially in interference analysis.
+    auto isCoalescableMove = [&] (Inst& inst) -> bool {
+        Width width;
+        switch (inst.kind.opcode) {
+        case Move:
+            width = pointerWidth();
+            break;
+        case Move32:
+        case MoveFloat:
+            width = Width32;
+            break;
+        case MoveDouble:
+            width = Width64;
+            break;
+        default:
+            return false;
+        }
+
+        if (!Options::coalesceSpillSlots())
+            return false;
+
+        if (inst.args.size() != 3)
+            return false;
+
+        for (unsigned i = 0; i < 2; ++i) {
+            Arg arg = inst.args[i];
+            if (!arg.isStack())
+                return false;
+            StackSlot* slot = arg.stackSlot();
+            if (slot->kind() != StackSlotKind::Spill)
+                return false;
+            if (slot->byteSize() != bytes(width))
+                return false;
+        }
+
+        return true;
+    };
+
+    auto isUselessMove = [&] (Inst& inst) -> bool {
+        return isCoalescableMove(inst) && inst.args[0] == inst.args[1];
+    };
+
+    auto addEdge = [&] (StackSlot* a, StackSlot* b) {
+        interference[a].add(b);
+        interference[b].add(a);
+    };
+
+    Vector<CoalescableMove> coalescableMoves;
+
     for (BasicBlock* block : code) {
         StackSlotLiveness::LocalCalc localCalc(liveness, block);
…
             dataLog("Interfering: ", WTF::pointerListDump(localCalc.live()), "\n");

+        Inst* prevInst = block->get(instIndex);
+        Inst* nextInst = block->get(instIndex + 1);
+        if (prevInst && isCoalescableMove(*prevInst)) {
+            CoalescableMove move(prevInst->args[0].stackSlot(), prevInst->args[1].stackSlot(), block->frequency());
+
+            coalescableMoves.append(move);
+
+            for (StackSlot* otherSlot : localCalc.live()) {
+                if (otherSlot != move.src)
+                    addEdge(move.dst, otherSlot);
+            }
+
+            prevInst = nullptr;
+        }
         Inst::forEachDef<Arg>(
-            block->get(instIndex), block->get(instIndex + 1),
+            prevInst, nextInst,
             [&] (Arg& arg, Arg::Role, Bank, Width) {
                 if (!arg.isStack())
…
                     return;

-                for (StackSlot* otherSlot : localCalc.live()) {
-                    interference[slot].add(otherSlot);
-                    interference[otherSlot].add(slot);
-                }
+                for (StackSlot* otherSlot : localCalc.live())
+                    addEdge(slot, otherSlot);
             });
     };
…
             dataLog("Interference of ", pointerDump(slot), ": ", pointerListDump(interference[slot]), "\n");
     }

+    // Now try to coalesce some moves.
+    std::sort(
+        coalescableMoves.begin(), coalescableMoves.end(),
+        [&] (CoalescableMove& a, CoalescableMove& b) -> bool {
+            return a.frequency > b.frequency;
+        });
+
+    IndexMap<StackSlot, StackSlot*> remappedStackSlots(code.stackSlots().size());
+    auto remap = [&] (StackSlot* slot) -> StackSlot* {
+        if (!slot)
+            return nullptr;
+        for (;;) {
+            StackSlot* remappedSlot = remappedStackSlots[slot];
+            if (!remappedSlot)
+                return slot;
+            slot = remappedSlot;
+        }
+    };
+
+    for (CoalescableMove& move : coalescableMoves) {
+        move.src = remap(move.src);
+        move.dst = remap(move.dst);
+        if (move.src == move.dst)
+            continue;
+        if (interference[move.src].contains(move.dst))
+            continue;
+
+        StackSlot* slotToKill = move.src;
+        StackSlot* slotToKeep = move.dst;
+
+        remappedStackSlots[slotToKill] = slotToKeep;
+        for (StackSlot* interferingSlot : interference[slotToKill]) {
+            if (interferingSlot == slotToKill)
+                continue;
+            interference[interferingSlot].remove(slotToKill);
+            interference[interferingSlot].add(slotToKeep);
+        }
+        interference[slotToKeep].add(interference[slotToKill].begin(), interference[slotToKill].end());
+        interference[slotToKill].clear();
+    }
+
+    for (BasicBlock* block : code) {
+        for (Inst& inst : *block) {
+            for (Arg& arg : inst.args) {
+                if (arg.isStack())
+                    arg = Arg::stack(remap(arg.stackSlot()), arg.offset());
+            }
+            if (isUselessMove(inst))
+                inst = Inst();
+        }
+    }
+
     // Now we assign stack locations. At its heart this algorithm is just first-fit. For each
     // StackSlot we just want to find the offsetFromFP that is closest to zero while ensuring no
…
     Vector<StackSlot*> otherSlots = assignedEscapedStackSlots;
     for (StackSlot* slot : code.stackSlots()) {
+        if (remappedStackSlots[slot])
+            continue;
+
         if (slot->offsetFromFP()) {
             // Already assigned an offset.
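The remap scheme in the hunk above is essentially a miniature union-find without path compression: each coalescing step records a kill → keep edge, and remap follows chains so that a slot coalesced into a slot that was itself later coalesced still resolves to the final survivor. A self-contained sketch, with integer slot IDs standing in for StackSlot*:

    #include <cassert>
    #include <unordered_map>

    using Slot = int;

    // kill -> keep edges recorded as moves are coalesced.
    std::unordered_map<Slot, Slot> remappedSlots;

    Slot remap(Slot slot)
    {
        // Follow the chain until we reach a slot that was never remapped.
        for (;;) {
            auto it = remappedSlots.find(slot);
            if (it == remappedSlots.end())
                return slot;
            slot = it->second;
        }
    }

    int main()
    {
        remappedSlots[3] = 2;  // coalesce slot 3 into slot 2
        remappedSlots[2] = 1;  // later, coalesce slot 2 into slot 1
        assert(remap(3) == 1); // chains resolve to the final survivor
        assert(remap(1) == 1); // unmapped slots map to themselves
        return 0;
    }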
trunk/Source/JavaScriptCore/b3/air/AirInst.cpp
r212970 → r214187

 namespace JSC { namespace B3 { namespace Air {
+
+bool Inst::hasEarlyDef()
+{
+    if (kind.opcode == Patch && !extraEarlyClobberedRegs().isEmpty())
+        return true;
+    bool result = false;
+    forEachArg(
+        [&] (Arg&, Arg::Role role, Bank, Width) {
+            result |= Arg::isEarlyDef(role);
+        });
+    return result;
+}
+
+bool Inst::hasLateUseOrDef()
+{
+    if (kind.opcode == Patch && !extraClobberedRegs().isEmpty())
+        return true;
+    bool result = false;
+    forEachArg(
+        [&] (Arg&, Arg::Role role, Bank, Width) {
+            result |= Arg::isLateUse(role) || Arg::isLateDef(role);
+        });
+    return result;
+}
+
+bool Inst::needsPadding(Inst* prevInst, Inst* nextInst)
+{
+    return prevInst && nextInst && prevInst->hasLateUseOrDef() && nextInst->hasEarlyDef();
+}

 bool Inst::hasArgEffects()
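The rule these helpers encode can be modeled in isolation: a padding Nop is needed exactly when the earlier instruction does something late and the later one defines something early, because adjacent late and early effects would otherwise look simultaneous to the register allocator. A toy model (a hypothetical struct carrying just the two summary bits, not the real Inst):

    #include <cassert>

    // Hypothetical stand-in for Inst: only the two summaries that matter here.
    struct InstModel {
        bool hasLateUseOrDef;
        bool hasEarlyDef;
    };

    // Mirrors the shape of Inst::needsPadding: a Nop goes between prev and
    // next only when prev's late effects could collide with next's early defs.
    bool needsPadding(const InstModel* prev, const InstModel* next)
    {
        return prev && next && prev->hasLateUseOrDef && next->hasEarlyDef;
    }

    int main()
    {
        InstModel lateWriter { true, false };
        InstModel earlyDefiner { false, true };
        assert(needsPadding(&lateWriter, &earlyDefiner));  // needs a Nop
        assert(!needsPadding(&earlyDefiner, &lateWriter)); // no collision
        return 0;
    }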
trunk/Source/JavaScriptCore/b3/air/AirInst.h
r212970 → r214187

     template<typename Thing, typename Functor>
     static void forEachDefWithExtraClobberedRegs(Inst* prevInst, Inst* nextInst, const Functor&);

+    // Some summaries about all arguments. These are useful for needsPadding().
+    bool hasEarlyDef();
+    bool hasLateUseOrDef();
+
+    // Check if there needs to be a padding Nop between these two instructions.
+    static bool needsPadding(Inst* prevInst, Inst* nextInst);
+
     // Use this to report which registers are live. This should be done just before codegen. Note
     // that for efficiency, reportUsedRegisters() only works for the Patch opcode.
trunk/Source/JavaScriptCore/b3/air/AirOpcode.opcodes
r213714 → r214187

     x86: Imm, Addr as storePtr

+# This is for moving between spill slots.
+Move U:G:Ptr, D:G:Ptr, S:G:Ptr
+    Addr, Addr, Tmp
+
 x86: Swap32 UD:G:32, UD:G:32
     Tmp, Tmp
…
     x86: Imm, Index as store32

+# This is for moving between spill slots.
+Move32 U:G:32, ZD:G:32, S:G:32
+    Addr, Addr, Tmp
+
 StoreZero32 U:G:32
     Addr
…
     Tmp, Index as storeFloat

+MoveFloat U:F:32, D:F:32, S:F:32
+    Addr, Addr, Tmp
+
 MoveDouble U:F:64, D:F:64
     Tmp, Tmp
…
     Tmp, Addr as storeDouble
     Tmp, Index as storeDouble
+
+MoveDouble U:F:64, D:F:64, S:F:64
+    Addr, Addr, Tmp

 MoveZeroToDouble D:F:64
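Read by the conventions used elsewhere in AirOpcode.opcodes, a form like Move U:G:Ptr, D:G:Ptr, S:G:Ptr declares a pointer-width move in the general-purpose (G) bank whose first argument is a Use, whose second is a Def (ZD marking a zero-extending Def), and whose third is a Scratch; the indented Addr, Addr, Tmp line lists the admitted argument kinds. Each of these new forms is therefore exactly the spill-slot-to-spill-slot move with a scratch Tmp described in the ChangeLog.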
trunk/Source/JavaScriptCore/b3/air/AirPadInterference.cpp
r213714 → r214187

     InsertionSet insertionSet(code);
     for (BasicBlock* block : code) {
-        bool prevHadLate = false;
-        for (unsigned instIndex = 0; instIndex < block->size(); ++instIndex) {
+        for (unsigned instIndex = 1; instIndex < block->size(); ++instIndex) {
             Inst& inst = block->at(instIndex);
-
-            bool hasEarlyDef = false;
-            bool hasLate = false;
-            inst.forEachArg(
-                [&] (Arg&, Arg::Role role, Bank, Width) {
-                    switch (role) {
-                    case Arg::EarlyDef:
-                    case Arg::EarlyZDef:
-                        hasEarlyDef = true;
-                        break;
-                    case Arg::LateUse:
-                    case Arg::Def:
-                    case Arg::ZDef:
-                    case Arg::LateColdUse:
-                    case Arg::UseDef:
-                    case Arg::UseZDef:
-                        hasLate = true;
-                        break;
-                    case Arg::Scratch:
-                        hasEarlyDef = true;
-                        hasLate = true;
-                        break;
-                    case Arg::Use:
-                    case Arg::ColdUse:
-                    case Arg::UseAddr:
-                        break;
-                    }
-                });
-            if (inst.kind.opcode == Patch) {
-                hasEarlyDef |= !inst.extraEarlyClobberedRegs().isEmpty();
-                hasLate |= !inst.extraClobberedRegs().isEmpty();
-            }
-
-            if (hasEarlyDef && prevHadLate)
+            if (Inst::needsPadding(&block->at(instIndex - 1), &inst))
                 insertionSet.insert(instIndex, Nop, inst.origin);
-
-            prevHadLate = hasLate;
         }
         insertionSet.execute(block);
trunk/Source/JavaScriptCore/runtime/Options.h
r213714 → r214187

     v(bool, airForceBriggsAllocator, false, Normal, nullptr) \
     v(bool, airForceIRCAllocator, false, Normal, nullptr) \
+    v(bool, coalesceSpillSlots, true, Normal, nullptr) \
     v(bool, logAirRegisterPressure, false, Normal, nullptr) \
     v(unsigned, maxB3TailDupBlockSize, 3, Normal, nullptr) \