<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">
From: Suparna Bhattacharya &lt;suparna@in.ibm.com&gt;

Fixes the following remaining issues with the DIO code:

1. During DIO file extends, intermediate writes could extend i_size
   exposing unwritten blocks to intermediate reads (Soln: Don't drop i_sem
   for file extends)

2. AIO-DIO file extends may update i_size before I/O completes,
   exposing unwritten blocks to intermediate reads.  (Soln: Force AIO-DIO
   file extends to be synchronous)

3. AIO-DIO writes to holes call aio_complete() before falling back to
   buffered I/O!  (Soln: Avoid calling aio_complete() if -ENOTBLK)

4. AIO-DIO writes to an allocated region followed by a hole fall back
   to buffered i/o without waiting for already submitted i/o to complete;
   control might return to user-space, which could then overwrite the
   buffer contents while they are still being written out by the kernel
   (Soln: Always wait for submitted i/o to complete before falling back
   to buffered i/o)

DESC
dio-aio fix fix
EDESC

Avoid a possible busywait.


 fs/direct-io.c |   59 +++++++++++++++++++++++++++++++++++++++++++++++----------
 1 files changed, 49 insertions(+), 10 deletions(-)

diff -puN fs/direct-io.c~dio-aio-fixes fs/direct-io.c
--- 25/fs/direct-io.c~dio-aio-fixes	2004-01-01 23:33:28.000000000 -0800
+++ 25-akpm/fs/direct-io.c	2004-01-01 23:35:40.000000000 -0800
@@ -209,7 +209,7 @@ static struct page *dio_get_page(struct 
  */
 static void dio_complete(struct dio *dio, loff_t offset, ssize_t bytes)
 {
-	if (dio-&gt;end_io)
+	if (dio-&gt;end_io &amp;&amp; dio-&gt;result)
 		dio-&gt;end_io(dio-&gt;inode, offset, bytes, dio-&gt;map_bh.b_private);
 	if (dio-&gt;needs_locking)
 		up_read(&amp;dio-&gt;inode-&gt;i_alloc_sem);
@@ -225,8 +225,14 @@ static void finished_one_bio(struct dio 
 		if (dio-&gt;is_async) {
 			dio_complete(dio, dio-&gt;block_in_file &lt;&lt; dio-&gt;blkbits,
 					dio-&gt;result);
-			aio_complete(dio-&gt;iocb, dio-&gt;result, 0);
-			kfree(dio);
+			/* Complete AIO later if falling back to buffered i/o */
+			if (dio-&gt;result != -ENOTBLK) {
+				aio_complete(dio-&gt;iocb, dio-&gt;result, 0);
+				kfree(dio);
+			} else {
+				if (dio-&gt;waiter)
+					wake_up_process(dio-&gt;waiter);
+			}
 		}
 	}
 }
@@ -877,8 +883,6 @@ direct_io_worker(int rw, struct kiocb *i
 	int ret2;
 	size_t bytes;
 
-	dio-&gt;is_async = !is_sync_kiocb(iocb);
-
 	dio-&gt;bio = NULL;
 	dio-&gt;inode = inode;
 	dio-&gt;rw = rw;
@@ -975,10 +979,11 @@ direct_io_worker(int rw, struct kiocb *i
 	dio_cleanup(dio);
 
 	/*
-	 * All new block allocations have been performed.  We can let i_sem
-	 * go now.
+	 * All block lookups have been performed. For READ requests
+	 * we can let i_sem go now that its achieved its purpose
+	 * of protecting us from looking up uninitialized blocks.
 	 */
-	if (dio-&gt;needs_locking)
+	if ((rw == READ) &amp;&amp; dio-&gt;needs_locking)
 		up(&amp;dio-&gt;inode-&gt;i_sem);
 
 	/*
@@ -988,8 +993,31 @@ direct_io_worker(int rw, struct kiocb *i
 	if (dio-&gt;is_async) {
 		if (ret == 0)
 			ret = dio-&gt;result;	/* Bytes written */
+		if (ret == -ENOTBLK) {
+			/*
+			 * The request will be reissued via buffered I/O
+			 * when we return; Any I/O already issued
+			 * effectively becomes redundant.
+			 */
+			dio-&gt;result = ret;
+			dio-&gt;waiter = current;
+		}
 		finished_one_bio(dio);		/* This can free the dio */
 		blk_run_queues();
+		if (ret == -ENOTBLK) {
+			/*
+			 * Wait for already issued I/O to drain out and
+			 * release its references to user-space pages
+			 * before returning to fallback on buffered I/O
+			 */
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			while (atomic_read(&amp;dio-&gt;bio_count)) {
+				io_schedule();
+				set_current_state(TASK_UNINTERRUPTIBLE);
+			}
+			set_current_state(TASK_RUNNING);
+			dio-&gt;waiter = NULL;
+		}
 	} else {
 		finished_one_bio(dio);
 		ret2 = dio_await_completion(dio);
@@ -1009,6 +1037,9 @@ direct_io_worker(int rw, struct kiocb *i
 				ret = i_size - offset;
 		}
 		dio_complete(dio, offset, ret);
+		/* We could have also come here on an AIO file extend */
+		if (!is_sync_kiocb(iocb) &amp;&amp; (ret != -ENOTBLK))
+			aio_complete(iocb, ret, 0);
 		kfree(dio);
 	}
 	return ret;
@@ -1035,6 +1066,7 @@ __blockdev_direct_IO(int rw, struct kioc
 	unsigned bdev_blkbits = 0;
 	unsigned blocksize_mask = (1 &lt;&lt; blkbits) - 1;
 	ssize_t retval = -EINVAL;
+	loff_t end = offset;
 	struct dio *dio;
 	int needs_locking;
 
@@ -1053,6 +1085,7 @@ __blockdev_direct_IO(int rw, struct kioc
 	for (seg = 0; seg &lt; nr_segs; seg++) {
 		addr = (unsigned long)iov[seg].iov_base;
 		size = iov[seg].iov_len;
+		end += size;
 		if ((addr &amp; blocksize_mask) || (size &amp; blocksize_mask))  {
 			if (bdev)
 				 blkbits = bdev_blkbits;
@@ -1087,11 +1120,17 @@ __blockdev_direct_IO(int rw, struct kioc
 		down_read(&amp;inode-&gt;i_alloc_sem);
 	}
 	dio-&gt;needs_locking = needs_locking;
+	/*
+	 * For file extending writes updating i_size before data
+	 * writeouts complete can expose uninitialized blocks. So
+	 * even for AIO, we need to wait for i/o to complete before
+	 * returning in this case.
+	 */
+	dio-&gt;is_async = !is_sync_kiocb(iocb) &amp;&amp; !((rw == WRITE) &amp;&amp;
+		(end &gt; i_size_read(inode)));
 
 	retval = direct_io_worker(rw, iocb, inode, iov, offset,
 				nr_segs, blkbits, get_blocks, end_io, dio);
-	if (needs_locking &amp;&amp; rw == WRITE)
-		down(&amp;inode-&gt;i_sem);
 out:
 	return retval;
 }

_
</pre></body></html>