fix EBADF unqueueing in select backend (Trac #10590)
authorSergei Trofimovich <slyfox@gentoo.org>
Tue, 7 Jul 2015 15:00:23 +0000 (17:00 +0200)
committerBen Gamari <ben@smart-cactus.org>
Tue, 7 Jul 2015 17:17:43 +0000 (19:17 +0200)
Alexander found a interesting case:
1. We have a queue of two waiters in a blocked_queue
2. first file descriptor changes state to RUNNABLE,
   second changes to INVALID
3. awaitEvent function dequeued RUNNABLE thread to a
   run queue and attempted to dequeue INVALID descriptor
   to a run queue.

Unqueueing INVALID fails thusly:
        #3  0x000000000045cf1c in barf (s=0x4c1cb0 "removeThreadFromDeQueue: not found")
                               at rts/RtsMessages.c:42
        #4  0x000000000046848b in removeThreadFromDeQueue (...) at rts/Threads.c:249
        #5  0x000000000049a120 in removeFromQueues (...) at rts/RaiseAsync.c:719
        #6  0x0000000000499502 in throwToSingleThreaded__ (...) at rts/RaiseAsync.c:67
        #7  0x0000000000499555 in throwToSingleThreaded (..) at rts/RaiseAsync.c:75
        #8  0x000000000047c27d in awaitEvent (wait=rtsFalse) at rts/posix/Select.c:415

The problem here is a throwToSingleThreaded function that tries
to unqueue a TSO from blocked_queue, but awaitEvent function
leaves blocked_queue in a inconsistent state while traverses
over blocked_queue:

      case RTS_FD_IS_READY:
          IF_DEBUG(scheduler,
              debugBelch("Waking up blocked thread %lu\n",
                         (unsigned long)tso->id));
          tso->why_blocked = NotBlocked;
          tso->_link = END_TSO_QUEUE;              // Here we break the queue head
          pushOnRunQueue(&MainCapability,tso);
          break;

Signed-off-by: Sergei Trofimovich <siarheit@google.com>
Test Plan: tested on a sample from T10590

Reviewers: austin, bgamari, simonmar

Reviewed By: bgamari, simonmar

Subscribers: qnikst, thomie, bgamari

Differential Revision: https://phabricator.haskell.org/D1024

GHC Trac Issues: #10590, #4934

rts/RaiseAsync.c
rts/RaiseAsync.h
rts/posix/Select.c
testsuite/tests/rts/T10590.hs [new file with mode: 0644]
testsuite/tests/rts/all.T

index 3b206ff..267707c 100644 (file)
 #include "win32/IOManager.h"
 #endif
 
-static StgTSO* raiseAsync (Capability *cap,
-                           StgTSO *tso,
-                           StgClosure *exception,
-                           rtsBool stop_at_atomically,
-                           StgUpdateFrame *stop_here);
-
 static void removeFromQueues(Capability *cap, StgTSO *tso);
 
 static void removeFromMVarBlockedQueue (StgTSO *tso);
@@ -777,7 +771,7 @@ removeFromQueues(Capability *cap, StgTSO *tso)
  *
  * -------------------------------------------------------------------------- */
 
-static StgTSO *
+StgTSO *
 raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception,
            rtsBool stop_at_atomically, StgUpdateFrame *stop_here)
 {
index 6bfed8d..1f939d4 100644 (file)
 void blockedThrowTo (Capability *cap,
                      StgTSO *target, MessageThrowTo *msg);
 
+StgTSO* raiseAsync (Capability *cap,
+                    StgTSO *tso,
+                    StgClosure *exception,
+                    rtsBool stop_at_atomically,
+                    StgUpdateFrame *stop_here);
+
 void throwToSingleThreaded (Capability *cap,
                             StgTSO *tso,
                             StgClosure *exception);
index 4b19235..d5c9b55 100644 (file)
@@ -375,6 +375,12 @@ awaitEvent(rtsBool wait)
 
       prev = NULL;
       {
+          /*
+           * The queue is being rebuilt in this loop:
+           * 'blocked_queue_hd' will contain already
+           * traversed blocked TSOs. As a result you
+           * can't use functions accessing 'blocked_queue_hd'.
+           */
           for(tso = blocked_queue_hd; tso != END_TSO_QUEUE; tso = next) {
               next = tso->_link;
               int fd;
@@ -412,8 +418,8 @@ awaitEvent(rtsBool wait)
                   IF_DEBUG(scheduler,
                       debugBelch("Killing blocked thread %lu on bad fd=%i\n",
                                  (unsigned long)tso->id, fd));
-                  throwToSingleThreaded(&MainCapability, tso,
-                                        (StgClosure *)blockedOnBadFD_closure);
+                  raiseAsync(&MainCapability, tso,
+                      (StgClosure *)blockedOnBadFD_closure, rtsFalse, NULL);
                   break;
               case RTS_FD_IS_READY:
                   IF_DEBUG(scheduler,
diff --git a/testsuite/tests/rts/T10590.hs b/testsuite/tests/rts/T10590.hs
new file mode 100644 (file)
index 0000000..24198ab
--- /dev/null
@@ -0,0 +1,37 @@
+import Foreign.C
+import Foreign.Marshal.Array
+import Foreign.Storable
+import Control.Concurrent
+
+-- The test works only on UNIX like.
+-- unportable bits:
+import qualified System.Posix.Internals as SPI
+import qualified System.Posix.Types as SPT
+
+pipe :: IO (CInt, CInt)
+pipe = allocaArray 2 $ \fds -> do
+    throwErrnoIfMinus1_ "pipe" $ SPI.c_pipe fds
+    rd <- peekElemOff fds 0
+    wr <- peekElemOff fds 1
+    return (rd, wr)
+
+main :: IO ()
+main = do
+    (r1, w1)  <- pipe
+    (r2, _w2) <- pipe
+    _ <- forkIO $ do -- thread A
+                     threadWaitRead (SPT.Fd r1)
+    _ <- forkIO $ do -- thread B
+                     threadWaitRead (SPT.Fd r2)
+    yield -- switch to A, then B
+          -- now both are blocked
+    _ <- SPI.c_close w1 -- unblocking thread A fd
+    _ <- SPI.c_close r2 -- breaking   thread B fd
+    yield -- kick RTS IO manager
+
+{-
+ Trac #10590 exposed a bug as:
+   T10590: internal error: removeThreadFromDeQueue: not found
+    (GHC version 7.11.20150702 for x86_64_unknown_linux)
+    Please report this as a GHC bug:  http://www.haskell.org/ghc/reportabug
+ -}
index 8f6137f..0e89249 100644 (file)
@@ -318,3 +318,8 @@ test('T9839_05', [ only_ways(prof_ways), ignore_output, exit_code(1), extra_run_
 test('T9839_06', [ only_ways(prof_ways), ignore_output, exit_code(1), extra_run_opts('+RTS -xtx')],
                 compile_and_run,
                 [''])
+
+# ignore_output as RTS reports slightly different error messages
+# in 'epoll' and 'select' backends on reading from EBADF
+# mingw32 skip as UNIX pipe and close(fd) is used to exercise the problem
+test('T10590', [ignore_output, when(opsys('mingw32'),skip)], compile_and_run, [''])