diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 019e8af449ab..6ea5cd0a595f 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -48,6 +48,9 @@ struct btrfs_worker_thread {
 	/* number of things on the pending list */
 	atomic_t num_pending;
 
+	/* reference counter for this struct */
+	atomic_t refs;
+
 	unsigned long sequence;
 
 	/* protects the pending list. */
@@ -93,17 +96,40 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
 	}
 }
 
+static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
+{
+	struct btrfs_workers *workers = worker->workers;
+	unsigned long flags;
+
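+	/*
+	 * atomic_start_pending is set by find_worker from contexts that
+	 * cannot start threads themselves; the barrier makes sure this
+	 * unlocked peek sees a recent value
+	 */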
+	rmb();
+	if (!workers->atomic_start_pending)
+		return;
+
+	spin_lock_irqsave(&workers->lock, flags);
+	if (!workers->atomic_start_pending)
+		goto out;
+
+	workers->atomic_start_pending = 0;
+	if (workers->num_workers >= workers->max_workers)
+		goto out;
+
+	spin_unlock_irqrestore(&workers->lock, flags);
+	btrfs_start_workers(workers, 1);
+	return;
+
+out:
+	spin_unlock_irqrestore(&workers->lock, flags);
+}
+
 static noinline int run_ordered_completions(struct btrfs_workers *workers,
 					    struct btrfs_work *work)
 {
-	unsigned long flags;
-
 	if (!workers->ordered)
 		return 0;
 
 	set_bit(WORK_DONE_BIT, &work->flags);
 
-	spin_lock_irqsave(&workers->lock, flags);
+	spin_lock(&workers->order_lock);
 
 	while (1) {
 		if (!list_empty(&workers->prio_order_list)) {
@@ -126,45 +152,117 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
 		if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
 			break;
 
-		spin_unlock_irqrestore(&workers->lock, flags);
+		spin_unlock(&workers->order_lock);
 
 		work->ordered_func(work);
 
 		/* now take the lock again and call the freeing code */
-		spin_lock_irqsave(&workers->lock, flags);
+		spin_lock(&workers->order_lock);
 		list_del(&work->order_list);
 		work->ordered_free(work);
 	}
 
-	spin_unlock_irqrestore(&workers->lock, flags);
+	spin_unlock(&workers->order_lock);
 	return 0;
 }
 
+static void put_worker(struct btrfs_worker_thread *worker)
+{
+	if (atomic_dec_and_test(&worker->refs))
+		kfree(worker);
+}
+
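+/*
+ * a worker that has been idle for a long time may exit.  We only let it
+ * go if at least one other thread remains in the pool and both of its
+ * pending lists are empty.
+ */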
+static int try_worker_shutdown(struct btrfs_worker_thread *worker)
+{
+	int freeit = 0;
+
+	spin_lock_irq(&worker->lock);
+	/* irqs are already off via worker->lock */
+	spin_lock(&worker->workers->lock);
+	if (worker->workers->num_workers > 1 &&
+	    worker->idle &&
+	    !worker->working &&
+	    !list_empty(&worker->worker_list) &&
+	    list_empty(&worker->prio_pending) &&
+	    list_empty(&worker->pending)) {
+		freeit = 1;
+		list_del_init(&worker->worker_list);
+		worker->workers->num_workers--;
+	}
+	spin_unlock(&worker->workers->lock);
+	spin_unlock_irq(&worker->lock);
+
+	if (freeit)
+		put_worker(worker);
+	return freeit;
+}
+
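+/*
+ * grab the next work item, refilling the private prio_head/head lists
+ * from the shared per-worker lists only when they run dry.  Most loop
+ * iterations never need to touch worker->lock this way.
+ */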
+static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker,
+					struct list_head *prio_head,
+					struct list_head *head)
+{
+	struct btrfs_work *work = NULL;
+	struct list_head *cur = NULL;
+
+	if (!list_empty(prio_head))
+		cur = prio_head->next;
+
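+	/*
+	 * high priority work queued on the shared list must run before
+	 * anything left on our private regular list, so peek at it
+	 * before trusting our cached entries
+	 */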
+	smp_mb();
+	if (!list_empty(&worker->prio_pending))
+		goto refill;
+
+	if (!list_empty(head))
+		cur = head->next;
+
+	if (cur)
+		goto out;
+
+refill:
+	spin_lock_irq(&worker->lock);
+	list_splice_tail_init(&worker->prio_pending, prio_head);
+	list_splice_tail_init(&worker->pending, head);
+
+	if (!list_empty(prio_head))
+		cur = prio_head->next;
+	else if (!list_empty(head))
+		cur = head->next;
+	spin_unlock_irq(&worker->lock);
+
+	if (!cur)
+		goto out_fail;
+
+out:
+	work = list_entry(cur, struct btrfs_work, list);
+
+out_fail:
+	return work;
+}
+
 /*
  * main loop for servicing work items
  */
 static int worker_loop(void *arg)
 {
 	struct btrfs_worker_thread *worker = arg;
-	struct list_head *cur;
+	struct list_head head;
+	struct list_head prio_head;
 	struct btrfs_work *work;
+
+	INIT_LIST_HEAD(&head);
+	INIT_LIST_HEAD(&prio_head);
+
 	do {
-		spin_lock_irq(&worker->lock);
-again_locked:
+again:
 		while (1) {
-			if (!list_empty(&worker->prio_pending))
-				cur = worker->prio_pending.next;
-			else if (!list_empty(&worker->pending))
-				cur = worker->pending.next;
-			else
+			work = get_next_work(worker, &prio_head, &head);
+			if (!work)
 				break;
 
-			work = list_entry(cur, struct btrfs_work, list);
 			list_del(&work->list);
 			clear_bit(WORK_QUEUED_BIT, &work->flags);
 
 			work->worker = worker;
-			spin_unlock_irq(&worker->lock);
 
 			work->func(work);
 
@@ -175,9 +273,13 @@ static int worker_loop(void *arg)
 			 */
 			run_ordered_completions(worker->workers, work);
 
-			spin_lock_irq(&worker->lock);
-			check_idle_worker(worker);
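+			/*
+			 * worker creation requests deferred from interrupt
+			 * time are handled here, where sleeping is safe
+			 */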
+			check_pending_worker_creates(worker);
 		}
+
+		spin_lock_irq(&worker->lock);
+		check_idle_worker(worker);
+
 		if (freezing(current)) {
 			worker->working = 0;
 			spin_unlock_irq(&worker->lock);
@@ -216,8 +318,10 @@ static int worker_loop(void *arg)
 				spin_lock_irq(&worker->lock);
 				set_current_state(TASK_INTERRUPTIBLE);
 				if (!list_empty(&worker->pending) ||
-				    !list_empty(&worker->prio_pending))
-					goto again_locked;
+				    !list_empty(&worker->prio_pending)) {
+					spin_unlock_irq(&worker->lock);
+					goto again;
+				}
 
 				/*
 				 * this makes sure we get a wakeup when someone
@@ -226,8 +330,13 @@ static int worker_loop(void *arg)
 				worker->working = 0;
 				spin_unlock_irq(&worker->lock);
 
-				if (!kthread_should_stop())
-					schedule();
+				if (!kthread_should_stop()) {
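+					/*
+					 * sleep for up to two minutes;
+					 * a worker that wakes up still
+					 * idle will try to exit
+					 */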
+					schedule_timeout(HZ * 120);
+					if (!worker->working &&
+					    try_worker_shutdown(worker)) {
+						return 0;
+					}
+				}
 			}
 			__set_current_state(TASK_RUNNING);
 		}
@@ -242,16 +351,30 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
 {
 	struct list_head *cur;
 	struct btrfs_worker_thread *worker;
+	int can_stop;
 
+	spin_lock_irq(&workers->lock);
 	list_splice_init(&workers->idle_list, &workers->worker_list);
 	while (!list_empty(&workers->worker_list)) {
 		cur = workers->worker_list.next;
 		worker = list_entry(cur, struct btrfs_worker_thread,
 				    worker_list);
-		kthread_stop(worker->task);
-		list_del(&worker->worker_list);
-		kfree(worker);
+
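+		/*
+		 * take a ref so the worker struct survives the unlocked
+		 * kthread_stop call below
+		 */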
+		atomic_inc(&worker->refs);
+		workers->num_workers -= 1;
+		if (!list_empty(&worker->worker_list)) {
+			list_del_init(&worker->worker_list);
+			put_worker(worker);
+			can_stop = 1;
+		} else
+			can_stop = 0;
+		spin_unlock_irq(&workers->lock);
+		if (can_stop)
+			kthread_stop(worker->task);
+		spin_lock_irq(&workers->lock);
+		put_worker(worker);
 	}
+	spin_unlock_irq(&workers->lock);
 	return 0;
 }
 
@@ -266,10 +389,13 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
 	INIT_LIST_HEAD(&workers->order_list);
 	INIT_LIST_HEAD(&workers->prio_order_list);
 	spin_lock_init(&workers->lock);
+	spin_lock_init(&workers->order_lock);
 	workers->max_workers = max;
 	workers->idle_thresh = 32;
 	workers->name = name;
 	workers->ordered = 0;
+	workers->atomic_start_pending = 0;
+	workers->atomic_worker_start = 0;
 }
 
 /*
@@ -293,7 +419,9 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
 		INIT_LIST_HEAD(&worker->prio_pending);
 		INIT_LIST_HEAD(&worker->worker_list);
 		spin_lock_init(&worker->lock);
+
 		atomic_set(&worker->num_pending, 0);
+		atomic_set(&worker->refs, 1);
 		worker->workers = workers;
 		worker->task = kthread_run(worker_loop, worker,
 					   "btrfs-%s-%d", workers->name,
@@ -303,7 +431,6 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
 			kfree(worker);
 			goto fail;
 		}
-
 		spin_lock_irq(&workers->lock);
 		list_add_tail(&worker->worker_list, &workers->idle_list);
 		worker->idle = 1;
@@ -367,28 +494,18 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
 {
 	struct btrfs_worker_thread *worker;
 	unsigned long flags;
+	struct list_head *fallback;
 
 again:
 	spin_lock_irqsave(&workers->lock, flags);
 	worker = next_worker(workers);
-	spin_unlock_irqrestore(&workers->lock, flags);
 
 	if (!worker) {
-		spin_lock_irqsave(&workers->lock, flags);
 		if (workers->num_workers >= workers->max_workers) {
-			struct list_head *fallback = NULL;
-			/*
-			 * we have failed to find any workers, just
-			 * return the force one
-			 */
-			if (!list_empty(&workers->worker_list))
-				fallback = workers->worker_list.next;
-			if (!list_empty(&workers->idle_list))
-				fallback = workers->idle_list.next;
-			BUG_ON(!fallback);
-			worker = list_entry(fallback,
-				  struct btrfs_worker_thread, worker_list);
-			spin_unlock_irqrestore(&workers->lock, flags);
+			goto fallback;
+		} else if (workers->atomic_worker_start) {
+			workers->atomic_start_pending = 1;
+			goto fallback;
 		} else {
 			spin_unlock_irqrestore(&workers->lock, flags);
 			/* we're below the limit, start another worker */
@@ -396,6 +513,23 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
 			goto again;
 		}
 	}
+	spin_unlock_irqrestore(&workers->lock, flags);
+	return worker;
+
+fallback:
+	fallback = NULL;
+	/*
+	 * we have failed to find a free worker, so fall back to any
+	 * existing thread, preferring an idle one
+	 */
+	if (!list_empty(&workers->worker_list))
+		fallback = workers->worker_list.next;
+	if (!list_empty(&workers->idle_list))
+		fallback = workers->idle_list.next;
+	BUG_ON(!fallback);
+	worker = list_entry(fallback,
+		  struct btrfs_worker_thread, worker_list);
+	spin_unlock_irqrestore(&workers->lock, flags);
 	return worker;
 }
 
@@ -435,9 +569,9 @@ int btrfs_requeue_work(struct btrfs_work *work)
 		worker->working = 1;
 	}
 
-	spin_unlock_irqrestore(&worker->lock, flags);
 	if (wake)
 		wake_up_process(worker->task);
+	spin_unlock_irqrestore(&worker->lock, flags);
 out:
 
 	return 0;
@@ -463,14 +597,18 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 
 	worker = find_worker(workers);
 	if (workers->ordered) {
-		spin_lock_irqsave(&workers->lock, flags);
+		/*
+		 * you're not allowed to do ordered queues from an
+		 * interrupt handler
+		 */
+		spin_lock(&workers->order_lock);
 		if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
 			list_add_tail(&work->order_list,
 				      &workers->prio_order_list);
 		} else {
 			list_add_tail(&work->order_list, &workers->order_list);
 		}
-		spin_unlock_irqrestore(&workers->lock, flags);
+		spin_unlock(&workers->order_lock);
 	} else {
 		INIT_LIST_HEAD(&work->order_list);
 	}
@@ -492,10 +630,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 		wake = 1;
 	worker->working = 1;
 
-	spin_unlock_irqrestore(&worker->lock, flags);
-
 	if (wake)
 		wake_up_process(worker->task);
+	spin_unlock_irqrestore(&worker->lock, flags);
+
 out:
 	return 0;
 }
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 1b511c109db6..fc089b95ec14 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -73,6 +73,15 @@ struct btrfs_workers {
 	/* force completions in the order they were queued */
 	int ordered;
 
+	/* more workers were requested from a context that cannot start them */
+	int atomic_start_pending;
+
+	/*
+	 * are we allowed to sleep while starting workers or are we required
+	 * to start them at a later time?
+	 */
+	int atomic_worker_start;
+
 	/* list with all the work threads.  The workers on the idle thread
 	 * may be actively servicing jobs, but they haven't yet hit the
 	 * idle thresh limit above.
@@ -90,6 +99,9 @@ struct btrfs_workers {
 	/* lock for finding the next worker thread to queue on */
 	spinlock_t lock;
 
+	/* lock for the ordered lists */
+	spinlock_t order_lock;
+
 	/* extra name for this worker, used for current->name */
 	char *name;
 };
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 9d8ba4d54a37..a11a32058b50 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -506,10 +506,10 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 		 */
 		set_page_extent_mapped(page);
 		lock_extent(tree, last_offset, end, GFP_NOFS);
-		spin_lock(&em_tree->lock);
+		read_lock(&em_tree->lock);
 		em = lookup_extent_mapping(em_tree, last_offset,
 					   PAGE_CACHE_SIZE);
-		spin_unlock(&em_tree->lock);
+		read_unlock(&em_tree->lock);
 
 		if (!em || last_offset < em->start ||
 		    (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
@@ -593,11 +593,11 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	em_tree = &BTRFS_I(inode)->extent_tree;
 
 	/* we need the actual starting offset of this extent in the file */
-	spin_lock(&em_tree->lock);
+	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree,
 				   page_offset(bio->bi_io_vec->bv_page),
 				   PAGE_CACHE_SIZE);
-	spin_unlock(&em_tree->lock);
+	read_unlock(&em_tree->lock);
 
 	compressed_len = em->block_len;
 	cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 837435ce84ca..732d5b884aa7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2290,7 +2290,7 @@ extern struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
 		       u64 start, u64 end, u64 locked_end,
-		       u64 inline_limit, u64 *hint_block);
+		       u64 inline_limit, u64 *hint_block, int drop_cache);
 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct inode *inode, u64 start, u64 end);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e83be2e4602c..253da7e01ab3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -123,15 +123,15 @@ static struct extent_map *btree_get_extent(struct inode *inode,
 	struct extent_map *em;
 	int ret;
 
-	spin_lock(&em_tree->lock);
+	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, start, len);
 	if (em) {
 		em->bdev =
 			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
-		spin_unlock(&em_tree->lock);
+		read_unlock(&em_tree->lock);
 		goto out;
 	}
-	spin_unlock(&em_tree->lock);
+	read_unlock(&em_tree->lock);
 
 	em = alloc_extent_map(GFP_NOFS);
 	if (!em) {
@@ -144,7 +144,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
 	em->block_start = 0;
 	em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
-	spin_lock(&em_tree->lock);
+	write_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
 	if (ret == -EEXIST) {
 		u64 failed_start = em->start;
@@ -163,7 +163,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
 		free_extent_map(em);
 		em = NULL;
 	}
-	spin_unlock(&em_tree->lock);
+	write_unlock(&em_tree->lock);
 
 	if (ret)
 		em = ERR_PTR(ret);
@@ -1325,9 +1325,9 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 	offset = page_offset(page);
 
 	em_tree = &BTRFS_I(inode)->extent_tree;
-	spin_lock(&em_tree->lock);
+	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
-	spin_unlock(&em_tree->lock);
+	read_unlock(&em_tree->lock);
 	if (!em) {
 		__unplug_io_fn(bdi, page);
 		return;
@@ -1698,7 +1698,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		err = -EINVAL;
 		goto fail_iput;
 	}
 
 	/*
 	 * we need to start all the end_io workers up front because the
 	 * queue work function gets called at interrupt time, and so it
@@ -1743,20 +1743,22 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->endio_workers.idle_thresh = 4;
 	fs_info->endio_meta_workers.idle_thresh = 4;
 
-	fs_info->endio_write_workers.idle_thresh = 64;
-	fs_info->endio_meta_write_workers.idle_thresh = 64;
+	fs_info->endio_write_workers.idle_thresh = 2;
+	fs_info->endio_meta_write_workers.idle_thresh = 2;
+
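+	/*
+	 * the end_io workers get queued from interrupt handlers, so any
+	 * extra threads for them have to be started from process context
+	 * after the fact
+	 */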
+	fs_info->endio_workers.atomic_worker_start = 1;
+	fs_info->endio_meta_workers.atomic_worker_start = 1;
+	fs_info->endio_write_workers.atomic_worker_start = 1;
+	fs_info->endio_meta_write_workers.atomic_worker_start = 1;
 
 	btrfs_start_workers(&fs_info->workers, 1);
 	btrfs_start_workers(&fs_info->submit_workers, 1);
 	btrfs_start_workers(&fs_info->delalloc_workers, 1);
 	btrfs_start_workers(&fs_info->fixup_workers, 1);
-	btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
-	btrfs_start_workers(&fs_info->endio_meta_workers,
-			    fs_info->thread_pool_size);
-	btrfs_start_workers(&fs_info->endio_meta_write_workers,
-			    fs_info->thread_pool_size);
-	btrfs_start_workers(&fs_info->endio_write_workers,
-			    fs_info->thread_pool_size);
+	btrfs_start_workers(&fs_info->endio_workers, 1);
+	btrfs_start_workers(&fs_info->endio_meta_workers, 1);
+	btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
+	btrfs_start_workers(&fs_info->endio_write_workers, 1);
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 72a2b9c28e9f..edd86ae9e149 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5396,9 +5396,9 @@ static noinline int relocate_data_extent(struct inode *reloc_inode,
 	lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
 	while (1) {
 		int ret;
-		spin_lock(&em_tree->lock);
+		write_lock(&em_tree->lock);
 		ret = add_extent_mapping(em_tree, em);
-		spin_unlock(&em_tree->lock);
+		write_unlock(&em_tree->lock);
 		if (ret != -EEXIST) {
 			free_extent_map(em);
 			break;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 68260180f587..a102422cd92e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -367,10 +367,10 @@ static int insert_state(struct extent_io_tree *tree,
 	}
 	if (bits & EXTENT_DIRTY)
 		tree->dirty_bytes += end - start + 1;
-	set_state_cb(tree, state, bits);
-	state->state |= bits;
 	state->start = start;
 	state->end = end;
+	set_state_cb(tree, state, bits);
+	state->state |= bits;
 	node = tree_insert(&tree->state, end, &state->rb_node);
 	if (node) {
 		struct extent_state *found;
@@ -471,10 +471,14 @@ static int clear_state_bit(struct extent_io_tree *tree,
  * bits were already set, or zero if none of the bits were already set.
  */
 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		     int bits, int wake, int delete, gfp_t mask)
+		     int bits, int wake, int delete,
+		     struct extent_state **cached_state,
+		     gfp_t mask)
 {
 	struct extent_state *state;
+	struct extent_state *cached;
 	struct extent_state *prealloc = NULL;
+	struct rb_node *next_node;
 	struct rb_node *node;
 	u64 last_end;
 	int err;
@@ -488,6 +492,17 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	}
 
 	spin_lock(&tree->lock);
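+	/*
+	 * if the caller cached a state from an earlier call and it still
+	 * matches, start from it and skip the tree search entirely
+	 */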
+	if (cached_state) {
+		cached = *cached_state;
+		*cached_state = NULL;
+		if (cached && cached->tree && cached->start == start) {
+			atomic_dec(&cached->refs);
+			state = cached;
+			last_end = state->end;
+			goto found;
+		}
+		free_extent_state(cached);
+	}
 	/*
 	 * this search will find the extents that end after
 	 * our range starts
@@ -496,6 +511,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	if (!node)
 		goto out;
 	state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
 	if (state->start > end)
 		goto out;
 	WARN_ON(state->end < start);
@@ -555,11 +571,21 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		prealloc = NULL;
 		goto out;
 	}
-
+found:
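+	/*
+	 * grab the next node now, clear_state_bit below may free 'state'
+	 * out from under us
+	 */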
+	if (state->end < end && prealloc && !need_resched())
+		next_node = rb_next(&state->rb_node);
+	else
+		next_node = NULL;
 	set |= clear_state_bit(tree, state, bits, wake, delete);
 	if (last_end == (u64)-1)
 		goto out;
 	start = last_end + 1;
+	if (start <= end && next_node) {
+		state = rb_entry(next_node, struct extent_state,
+				 rb_node);
+		if (state->start == start)
+			goto hit_next;
+	}
 	goto search_again;
 
 out:
@@ -653,26 +679,37 @@ static void set_state_bits(struct extent_io_tree *tree,
 	state->state |= bits;
 }
 
+static void cache_state(struct extent_state *state,
+			struct extent_state **cached_ptr)
+{
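+	/*
+	 * stash a referenced pointer to this state so the caller can come
+	 * straight back to it without searching the tree again
+	 */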
+	if (cached_ptr && !(*cached_ptr)) {
+		if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
+			*cached_ptr = state;
+			atomic_inc(&state->refs);
+		}
+	}
+}
+
 /*
- * set some bits on a range in the tree.  This may require allocations
- * or sleeping, so the gfp mask is used to indicate what is allowed.
+ * set some bits on a range in the tree.  This may require allocations or
+ * sleeping, so the gfp mask is used to indicate what is allowed.
  *
- * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
- * range already has the desired bits set.  The start of the existing
- * range is returned in failed_start in this case.
+ * If any of the exclusive bits are set, this will fail with -EEXIST if some
+ * part of the range already has those exclusive bits set.  The start of the
+ * existing range is returned in failed_start in this case.
  *
- * [start, end] is inclusive
- * This takes the tree lock.
+ * [start, end] is inclusive.  This takes the tree lock.
  */
 static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-			  int bits, int exclusive, u64 *failed_start,
+			  int bits, int exclusive_bits, u64 *failed_start,
+			  struct extent_state **cached_state,
 			  gfp_t mask)
 {
 	struct extent_state *state;
 	struct extent_state *prealloc = NULL;
 	struct rb_node *node;
 	int err = 0;
-	int set;
 	u64 last_start;
 	u64 last_end;
 again:
@@ -683,6 +720,13 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	}
 
 	spin_lock(&tree->lock);
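+	/*
+	 * if the caller cached a state from an earlier call and it still
+	 * matches the start of this range, begin there instead of
+	 * searching the tree
+	 */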
+	if (cached_state && *cached_state) {
+		state = *cached_state;
+		if (state->start == start && state->tree) {
+			node = &state->rb_node;
+			goto hit_next;
+		}
+	}
 	/*
 	 * this search will find all the extents that end after
 	 * our range starts.
@@ -694,8 +738,8 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		BUG_ON(err == -EEXIST);
 		goto out;
 	}
-
 	state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
 	last_start = state->start;
 	last_end = state->end;
 
@@ -706,17 +750,28 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	 * Just lock what we found and keep going
 	 */
 	if (state->start == start && state->end <= end) {
-		set = state->state & bits;
-		if (set && exclusive) {
+		struct rb_node *next_node;
+		if (state->state & exclusive_bits) {
 			*failed_start = state->start;
 			err = -EEXIST;
 			goto out;
 		}
 		set_state_bits(tree, state, bits);
+		cache_state(state, cached_state);
 		merge_state(tree, state);
 		if (last_end == (u64)-1)
 			goto out;
+
 		start = last_end + 1;
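+		/*
+		 * if the next state starts exactly where this one ended,
+		 * set its bits too without dropping the tree lock
+		 */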
+		if (start < end && prealloc && !need_resched()) {
+			next_node = rb_next(node);
+			if (next_node) {
+				state = rb_entry(next_node, struct extent_state,
+						 rb_node);
+				if (state->start == start)
+					goto hit_next;
+			}
+		}
 		goto search_again;
 	}
 
@@ -737,8 +792,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	 * desired bit on it.
 	 */
 	if (state->start < start) {
-		set = state->state & bits;
-		if (exclusive && set) {
+		if (state->state & exclusive_bits) {
 			*failed_start = start;
 			err = -EEXIST;
 			goto out;
@@ -750,6 +804,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 			goto out;
 		if (state->end <= end) {
 			set_state_bits(tree, state, bits);
+			cache_state(state, cached_state);
 			merge_state(tree, state);
 			if (last_end == (u64)-1)
 				goto out;
@@ -774,6 +829,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 			this_end = last_start - 1;
 		err = insert_state(tree, prealloc, start, this_end,
 				   bits);
+		cache_state(prealloc, cached_state);
 		prealloc = NULL;
 		BUG_ON(err == -EEXIST);
 		if (err)
@@ -788,8 +844,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	 * on the first half
 	 */
 	if (state->start <= end && state->end > end) {
-		set = state->state & bits;
-		if (exclusive && set) {
+		if (state->state & exclusive_bits) {
 			*failed_start = start;
 			err = -EEXIST;
 			goto out;
@@ -798,6 +853,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		BUG_ON(err == -EEXIST);
 
 		set_state_bits(tree, prealloc, bits);
+		cache_state(prealloc, cached_state);
 		merge_state(tree, prealloc);
 		prealloc = NULL;
 		goto out;
@@ -826,86 +882,64 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask)
 {
 	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
-			      mask);
-}
-
-int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
-		       gfp_t mask)
-{
-	return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
+			      NULL, mask);
 }
 
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		    int bits, gfp_t mask)
 {
 	return set_extent_bit(tree, start, end, bits, 0, NULL,
-			      mask);
+			      NULL, mask);
 }
 
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		      int bits, gfp_t mask)
 {
-	return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
+	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
 }
 
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask)
 {
 	return set_extent_bit(tree, start, end,
-			      EXTENT_DELALLOC | EXTENT_DIRTY,
-			      0, NULL, mask);
+			      EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
+			      0, NULL, NULL, mask);
 }
 
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end,
-				EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
-}
-
-int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
-			 gfp_t mask)
-{
-	return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
+				EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
+				NULL, mask);
 }
 
 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask)
 {
 	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
-			      mask);
+			      NULL, mask);
 }
 
 static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask)
 {
-	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
+	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0,
+				NULL, mask);
 }
 
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 			gfp_t mask)
 {
 	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
-			      mask);
+			      NULL, mask);
 }
 
 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
 				 u64 end, gfp_t mask)
 {
-	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
-}
-
-static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
-			 gfp_t mask)
-{
-	return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
-			      0, NULL, mask);
-}
-
-static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
-				  u64 end, gfp_t mask)
-{
-	return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
+	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
+				NULL, mask);
 }
 
 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
@@ -917,13 +951,15 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
  * either insert or lock state struct between start and end use mask to tell
  * us if waiting is desired.
  */
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+		     int bits, struct extent_state **cached_state, gfp_t mask)
 {
 	int err;
 	u64 failed_start;
 	while (1) {
-		err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
-				     &failed_start, mask);
+		err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
+				     EXTENT_LOCKED, &failed_start,
+				     cached_state, mask);
 		if (err == -EEXIST && (mask & __GFP_WAIT)) {
 			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
 			start = failed_start;
@@ -935,27 +971,40 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
 	return err;
 }
 
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+{
+	return lock_extent_bits(tree, start, end, 0, NULL, mask);
+}
+
 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
 		    gfp_t mask)
 {
 	int err;
 	u64 failed_start;
 
-	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
-			     &failed_start, mask);
+	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
+			     &failed_start, NULL, mask);
 	if (err == -EEXIST) {
 		if (failed_start > start)
 			clear_extent_bit(tree, start, failed_start - 1,
-					 EXTENT_LOCKED, 1, 0, mask);
+					 EXTENT_LOCKED, 1, 0, NULL, mask);
 		return 0;
 	}
 	return 1;
 }
 
+int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
+			 struct extent_state **cached, gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
+				mask);
+}
+
 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
 		  gfp_t mask)
 {
-	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
+	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
+				mask);
 }
 
 /*
@@ -974,7 +1023,6 @@ int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
 		page_cache_release(page);
 		index++;
 	}
-	set_extent_dirty(tree, start, end, GFP_NOFS);
 	return 0;
 }
 
@@ -994,7 +1042,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 		page_cache_release(page);
 		index++;
 	}
-	set_extent_writeback(tree, start, end, GFP_NOFS);
 	return 0;
 }
 
@@ -1232,6 +1279,7 @@ static noinline u64 find_lock_delalloc_range(struct inode *inode,
 	u64 delalloc_start;
 	u64 delalloc_end;
 	u64 found;
+	struct extent_state *cached_state = NULL;
 	int ret;
 	int loops = 0;
 
@@ -1269,6 +1317,7 @@ static noinline u64 find_lock_delalloc_range(struct inode *inode,
 		/* some of the pages are gone, lets avoid looping by
 		 * shortening the size of the delalloc range we're searching
 		 */
+		free_extent_state(cached_state);
 		if (!loops) {
 			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
 			max_bytes = PAGE_CACHE_SIZE - offset;
@@ -1282,18 +1331,21 @@ static noinline u64 find_lock_delalloc_range(struct inode *inode,
 	BUG_ON(ret);
 
 	/* step three, lock the state bits for the whole range */
-	lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+	lock_extent_bits(tree, delalloc_start, delalloc_end,
+			 0, &cached_state, GFP_NOFS);
 
 	/* then test to make sure it is all still delalloc */
 	ret = test_range_bit(tree, delalloc_start, delalloc_end,
-			     EXTENT_DELALLOC, 1);
+			     EXTENT_DELALLOC, 1, cached_state);
 	if (!ret) {
-		unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+		unlock_extent_cached(tree, delalloc_start, delalloc_end,
+				     &cached_state, GFP_NOFS);
 		__unlock_for_delalloc(inode, locked_page,
 			      delalloc_start, delalloc_end);
 		cond_resched();
 		goto again;
 	}
+	free_extent_state(cached_state);
 	*start = delalloc_start;
 	*end = delalloc_end;
 out_failed:
@@ -1307,7 +1359,8 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 				int clear_unlock,
 				int clear_delalloc, int clear_dirty,
 				int set_writeback,
-				int end_writeback)
+				int end_writeback,
+				int set_private2)
 {
 	int ret;
 	struct page *pages[16];
@@ -1325,8 +1378,9 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 	if (clear_delalloc)
 		clear_bits |= EXTENT_DELALLOC;
 
-	clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
-	if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
+	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
+	if (!(unlock_pages || clear_dirty || set_writeback || end_writeback ||
+	      set_private2))
 		return 0;
 
 	while (nr_pages > 0) {
@@ -1334,6 +1388,10 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 				     min_t(unsigned long,
 				     nr_pages, ARRAY_SIZE(pages)), pages);
 		for (i = 0; i < ret; i++) {
+
+			if (set_private2)
+				SetPagePrivate2(pages[i]);
+
 			if (pages[i] == locked_page) {
 				page_cache_release(pages[i]);
 				continue;
@@ -1476,14 +1534,17 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
  * range is found set.
  */
 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		   int bits, int filled)
+		   int bits, int filled, struct extent_state *cached)
 {
 	struct extent_state *state = NULL;
 	struct rb_node *node;
 	int bitset = 0;
 
 	spin_lock(&tree->lock);
-	node = tree_search(tree, start);
+	if (cached && cached->tree && cached->start == start)
+		node = &cached->rb_node;
+	else
+		node = tree_search(tree, start);
 	while (node && start <= end) {
 		state = rb_entry(node, struct extent_state, rb_node);
 
@@ -1526,7 +1587,7 @@ static int check_page_uptodate(struct extent_io_tree *tree,
 {
 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 end = start + PAGE_CACHE_SIZE - 1;
-	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
+	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
 		SetPageUptodate(page);
 	return 0;
 }
@@ -1540,7 +1601,7 @@ static int check_page_locked(struct extent_io_tree *tree,
 {
 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 end = start + PAGE_CACHE_SIZE - 1;
-	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
+	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
 		unlock_page(page);
 	return 0;
 }
@@ -1552,10 +1613,7 @@ static int check_page_locked(struct extent_io_tree *tree,
 static int check_page_writeback(struct extent_io_tree *tree,
 			     struct page *page)
 {
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
-	u64 end = start + PAGE_CACHE_SIZE - 1;
-	if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
-		end_page_writeback(page);
+	end_page_writeback(page);
 	return 0;
 }
 
@@ -1613,13 +1671,11 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
 		}
 
 		if (!uptodate) {
-			clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
+			clear_extent_uptodate(tree, start, end, GFP_NOFS);
 			ClearPageUptodate(page);
 			SetPageError(page);
 		}
 
-		clear_extent_writeback(tree, start, end, GFP_ATOMIC);
-
 		if (whole_page)
 			end_page_writeback(page);
 		else
@@ -1983,7 +2039,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 			continue;
 		}
 		/* the get_extent function already copied into the page */
-		if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
+		if (test_range_bit(tree, cur, cur_end,
+				   EXTENT_UPTODATE, 1, NULL)) {
 			check_page_uptodate(tree, page);
 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
 			cur = cur + iosize;
@@ -2078,6 +2135,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	u64 iosize;
 	u64 unlock_start;
 	sector_t sector;
+	struct extent_state *cached_state = NULL;
 	struct extent_map *em;
 	struct block_device *bdev;
 	int ret;
@@ -2124,6 +2182,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	delalloc_end = 0;
 	page_started = 0;
 	if (!epd->extent_locked) {
+		u64 delalloc_to_write;
 		/*
 		 * make sure the wbc mapping index is at least updated
 		 * to this page.
@@ -2143,6 +2202,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			tree->ops->fill_delalloc(inode, page, delalloc_start,
 						 delalloc_end, &page_started,
 						 &nr_written);
+			delalloc_to_write = (delalloc_end -
+					max_t(u64, page_offset(page),
+					      delalloc_start) + 1) >>
+					PAGE_CACHE_SHIFT;
+			if (wbc->nr_to_write < delalloc_to_write) {
+				wbc->nr_to_write = min_t(long, 8192,
+						 delalloc_to_write);
+			}
 			delalloc_start = delalloc_end + 1;
 		}
 
@@ -2160,15 +2227,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			goto done_unlocked;
 		}
 	}
-	lock_extent(tree, start, page_end, GFP_NOFS);
-
-	unlock_start = start;
-
 	if (tree->ops && tree->ops->writepage_start_hook) {
 		ret = tree->ops->writepage_start_hook(page, start,
 						      page_end);
 		if (ret == -EAGAIN) {
-			unlock_extent(tree, start, page_end, GFP_NOFS);
 			redirty_page_for_writepage(wbc, page);
 			update_nr_written(page, wbc, nr_written);
 			unlock_page(page);
@@ -2184,12 +2246,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	update_nr_written(page, wbc, nr_written + 1);
 
 	end = page_end;
-	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
-		printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");
-
 	if (last_byte <= start) {
-		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
-		unlock_extent(tree, start, page_end, GFP_NOFS);
 		if (tree->ops && tree->ops->writepage_end_io_hook)
 			tree->ops->writepage_end_io_hook(page, start,
 							 page_end, NULL, 1);
@@ -2197,13 +2254,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		goto done;
 	}
 
-	set_extent_uptodate(tree, start, page_end, GFP_NOFS);
 	blocksize = inode->i_sb->s_blocksize;
 
 	while (cur <= end) {
 		if (cur >= last_byte) {
-			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
-			unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
 			if (tree->ops && tree->ops->writepage_end_io_hook)
 				tree->ops->writepage_end_io_hook(page, cur,
 							 page_end, NULL, 1);
@@ -2235,12 +2289,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		 */
 		if (compressed || block_start == EXTENT_MAP_HOLE ||
 		    block_start == EXTENT_MAP_INLINE) {
-			clear_extent_dirty(tree, cur,
-					   cur + iosize - 1, GFP_NOFS);
-
-			unlock_extent(tree, unlock_start, cur + iosize - 1,
-				      GFP_NOFS);
-
 			/*
 			 * end_io notification does not happen here for
 			 * compressed extents
@@ -2265,13 +2313,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		}
 		/* leave this out until we have a page_mkwrite call */
 		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
-				   EXTENT_DIRTY, 0)) {
+				   EXTENT_DIRTY, 0, NULL)) {
 			cur = cur + iosize;
 			pg_offset += iosize;
 			continue;
 		}
 
-		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
 		if (tree->ops && tree->ops->writepage_io_hook) {
 			ret = tree->ops->writepage_io_hook(page, cur,
 						cur + iosize - 1);
@@ -2309,12 +2356,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		set_page_writeback(page);
 		end_page_writeback(page);
 	}
-	if (unlock_start <= page_end)
-		unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
 	unlock_page(page);
 
 done_unlocked:
 
+	/* drop our reference on any cached states */
+	free_extent_state(cached_state);
 	return 0;
 }
 
@@ -2339,7 +2386,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
 			     writepage_t writepage, void *data,
 			     void (*flush_fn)(void *))
 {
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	int ret = 0;
 	int done = 0;
 	struct pagevec pvec;
@@ -2414,10 +2460,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
 			}
 			if (ret || wbc->nr_to_write <= 0)
 				done = 1;
-			if (wbc->nonblocking && bdi_write_congested(bdi)) {
-				wbc->encountered_congestion = 1;
-				done = 1;
-			}
 		}
 		pagevec_release(&pvec);
 		cond_resched();
@@ -2604,10 +2646,10 @@ int extent_invalidatepage(struct extent_io_tree *tree,
 		return 0;
 
 	lock_extent(tree, start, end, GFP_NOFS);
-	wait_on_extent_writeback(tree, start, end);
+	wait_on_page_writeback(page);
 	clear_extent_bit(tree, start, end,
 			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
-			 1, 1, GFP_NOFS);
+			 1, 1, NULL, GFP_NOFS);
 	return 0;
 }
 
@@ -2687,7 +2729,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
 		    !isnew && !PageUptodate(page) &&
 		    (block_off_end > to || block_off_start < from) &&
 		    !test_range_bit(tree, block_start, cur_end,
-				    EXTENT_UPTODATE, 1)) {
+				    EXTENT_UPTODATE, 1, NULL)) {
 			u64 sector;
 			u64 extent_offset = block_start - em->start;
 			size_t iosize;
@@ -2701,7 +2743,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
 			 */
 			set_extent_bit(tree, block_start,
 				       block_start + iosize - 1,
-				       EXTENT_LOCKED, 0, NULL, GFP_NOFS);
+				       EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS);
 			ret = submit_extent_page(READ, tree, page,
 					 sector, iosize, page_offset, em->bdev,
 					 NULL, 1,
@@ -2742,13 +2784,13 @@ int try_release_extent_state(struct extent_map_tree *map,
 	int ret = 1;
 
 	if (test_range_bit(tree, start, end,
-			   EXTENT_IOBITS | EXTENT_ORDERED, 0))
+			   EXTENT_IOBITS, 0, NULL))
 		ret = 0;
 	else {
 		if ((mask & GFP_NOFS) == GFP_NOFS)
 			mask = GFP_NOFS;
 		clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
-				 1, 1, mask);
+				 1, 1, NULL, mask);
 	}
 	return ret;
 }
@@ -2771,29 +2813,28 @@ int try_release_extent_mapping(struct extent_map_tree *map,
 		u64 len;
 		while (start <= end) {
 			len = end - start + 1;
-			spin_lock(&map->lock);
+			write_lock(&map->lock);
 			em = lookup_extent_mapping(map, start, len);
 			if (!em || IS_ERR(em)) {
-				spin_unlock(&map->lock);
+				write_unlock(&map->lock);
 				break;
 			}
 			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
 			    em->start != start) {
-				spin_unlock(&map->lock);
+				write_unlock(&map->lock);
 				free_extent_map(em);
 				break;
 			}
 			if (!test_range_bit(tree, em->start,
 					    extent_map_end(em) - 1,
-					    EXTENT_LOCKED | EXTENT_WRITEBACK |
-					    EXTENT_ORDERED,
-					    0)) {
+					    EXTENT_LOCKED | EXTENT_WRITEBACK,
+					    0, NULL)) {
 				remove_extent_mapping(map, em);
 				/* once for the rb tree */
 				free_extent_map(em);
 			}
 			start = extent_map_end(em);
-			spin_unlock(&map->lock);
+			write_unlock(&map->lock);
 
 			/* once for us */
 			free_extent_map(em);
@@ -3203,7 +3244,7 @@ int extent_range_uptodate(struct extent_io_tree *tree,
 	int uptodate;
 	unsigned long index;
 
-	ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
+	ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
 	if (ret)
 		return 1;
 	while (start <= end) {
@@ -3233,7 +3274,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
 		return 1;
 
 	ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-			   EXTENT_UPTODATE, 1);
+			   EXTENT_UPTODATE, 1, NULL);
 	if (ret)
 		return ret;
 
@@ -3269,7 +3310,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 		return 0;
 
 	if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-			   EXTENT_UPTODATE, 1)) {
+			   EXTENT_UPTODATE, 1, NULL)) {
 		return 0;
 	}
 
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 5bc20abf3f3d..14ed16fd862d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -13,10 +13,8 @@
 #define EXTENT_DEFRAG (1 << 6)
 #define EXTENT_DEFRAG_DONE (1 << 7)
 #define EXTENT_BUFFER_FILLED (1 << 8)
-#define EXTENT_ORDERED (1 << 9)
-#define EXTENT_ORDERED_METADATA (1 << 10)
-#define EXTENT_BOUNDARY (1 << 11)
-#define EXTENT_NODATASUM (1 << 12)
+#define EXTENT_BOUNDARY (1 << 9)
+#define EXTENT_NODATASUM (1 << 10)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 
 /* flags for bio submission */
@@ -142,6 +140,8 @@ int try_release_extent_state(struct extent_map_tree *map,
 			     struct extent_io_tree *tree, struct page *page,
 			     gfp_t mask);
 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+		     int bits, struct extent_state **cached, gfp_t mask);
 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
 		    gfp_t mask);
@@ -155,11 +155,12 @@ u64 count_range_bits(struct extent_io_tree *tree,
 		     u64 max_bytes, unsigned long bits);
 
 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		   int bits, int filled);
+		   int bits, int filled, struct extent_state *cached_state);
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		      int bits, gfp_t mask);
 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		     int bits, int wake, int delete, gfp_t mask);
+		     int bits, int wake, int delete, struct extent_state **cached,
+		     gfp_t mask);
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		    int bits, gfp_t mask);
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
@@ -282,5 +283,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 				int clear_unlock,
 				int clear_delalloc, int clear_dirty,
 				int set_writeback,
-				int end_writeback);
+				int end_writeback,
+				int set_private2);
 #endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 30c9365861e6..5bc7a0d325e7 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -36,7 +36,7 @@ void extent_map_exit(void)
 void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
 {
 	tree->map.rb_node = NULL;
-	spin_lock_init(&tree->lock);
+	rwlock_init(&tree->lock);
 }
 
 /**
@@ -198,6 +198,56 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
 	return 0;
 }
 
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+{
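+	/*
+	 * called once the ordered extent is safely on disk and in the
+	 * btree: clear the pinned flag and try to merge this mapping
+	 * with its neighbors
+	 */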
+	int ret = 0;
+	struct extent_map *merge = NULL;
+	struct rb_node *rb;
+	struct extent_map *em;
+
+	write_lock(&tree->lock);
+	em = lookup_extent_mapping(tree, start, len);
+
+	WARN_ON(em->start != start || !em);
+
+	if (!em)
+		goto out;
+
+	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+	if (em->start != 0) {
+		rb = rb_prev(&em->rb_node);
+		if (rb)
+			merge = rb_entry(rb, struct extent_map, rb_node);
+		if (rb && mergable_maps(merge, em)) {
+			em->start = merge->start;
+			em->len += merge->len;
+			em->block_len += merge->block_len;
+			em->block_start = merge->block_start;
+			merge->in_tree = 0;
+			rb_erase(&merge->rb_node, &tree->map);
+			free_extent_map(merge);
+		}
+	}
+
+	rb = rb_next(&em->rb_node);
+	if (rb)
+		merge = rb_entry(rb, struct extent_map, rb_node);
+	if (rb && mergable_maps(em, merge)) {
+		em->len += merge->len;
+		em->block_len += merge->len;
+		rb_erase(&merge->rb_node, &tree->map);
+		merge->in_tree = 0;
+		free_extent_map(merge);
+	}
+
+	free_extent_map(em);
+out:
+	write_unlock(&tree->lock);
+	return ret;
+}
+
 /**
  * add_extent_mapping - add new extent map to the extent tree
  * @tree:	tree to insert new map in
@@ -222,7 +272,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
 		ret = -EEXIST;
 		goto out;
 	}
-	assert_spin_locked(&tree->lock);
 	rb = tree_insert(&tree->map, em->start, &em->rb_node);
 	if (rb) {
 		ret = -EEXIST;
@@ -285,7 +334,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 	struct rb_node *next = NULL;
 	u64 end = range_end(start, len);
 
-	assert_spin_locked(&tree->lock);
 	rb_node = __tree_search(&tree->map, start, &prev, &next);
 	if (!rb_node && prev) {
 		em = rb_entry(prev, struct extent_map, rb_node);
@@ -331,7 +379,6 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
 	int ret = 0;
 
 	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
-	assert_spin_locked(&tree->lock);
 	rb_erase(&em->rb_node, &tree->map);
 	em->in_tree = 0;
 	return ret;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index fb6eeef06bb0..d3d442f4bbbd 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -31,7 +31,7 @@ struct extent_map {
 
 struct extent_map_tree {
 	struct rb_root map;
-	spinlock_t lock;
+	rwlock_t lock;
 };
 
 static inline u64 extent_map_end(struct extent_map *em)
@@ -59,4 +59,5 @@ struct extent_map *alloc_extent_map(gfp_t mask);
 void free_extent_map(struct extent_map *em);
 int __init extent_map_init(void);
 void extent_map_exit(void);
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len);
 #endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4b833972273a..571ad3c13b47 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -112,8 +112,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	int err = 0;
 	int i;
 	struct inode *inode = fdentry(file)->d_inode;
-	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-	u64 hint_byte;
 	u64 num_bytes;
 	u64 start_pos;
 	u64 end_of_last_block;
@@ -125,22 +123,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
 
 	end_of_last_block = start_pos + num_bytes - 1;
-
-	lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
-	trans = btrfs_join_transaction(root, 1);
-	if (!trans) {
-		err = -ENOMEM;
-		goto out_unlock;
-	}
-	btrfs_set_trans_block_group(trans, inode);
-	hint_byte = 0;
-
-	set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
-
-	/* check for reserved extents on each page, we don't want
-	 * to reset the delalloc bit on things that already have
-	 * extents reserved.
-	 */
 	btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
 	for (i = 0; i < num_pages; i++) {
 		struct page *p = pages[i];
@@ -155,9 +137,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		 * at this time.
 		 */
 	}
-	err = btrfs_end_transaction(trans, root);
-out_unlock:
-	unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
 	return err;
 }
 
@@ -189,18 +168,18 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 		if (!split2)
 			split2 = alloc_extent_map(GFP_NOFS);
 
-		spin_lock(&em_tree->lock);
+		write_lock(&em_tree->lock);
 		em = lookup_extent_mapping(em_tree, start, len);
 		if (!em) {
-			spin_unlock(&em_tree->lock);
+			write_unlock(&em_tree->lock);
 			break;
 		}
 		flags = em->flags;
 		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
-			spin_unlock(&em_tree->lock);
 			if (em->start <= start &&
 			    (!testend || em->start + em->len >= start + len)) {
 				free_extent_map(em);
+				write_unlock(&em_tree->lock);
 				break;
 			}
 			if (start < em->start) {
@@ -210,6 +189,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 				start = em->start + em->len;
 			}
 			free_extent_map(em);
+			write_unlock(&em_tree->lock);
 			continue;
 		}
 		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
@@ -260,7 +240,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			free_extent_map(split);
 			split = NULL;
 		}
-		spin_unlock(&em_tree->lock);
+		write_unlock(&em_tree->lock);
 
 		/* once for us */
 		free_extent_map(em);
@@ -289,7 +269,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
 		       u64 start, u64 end, u64 locked_end,
-		       u64 inline_limit, u64 *hint_byte)
+		       u64 inline_limit, u64 *hint_byte, int drop_cache)
 {
 	u64 extent_end = 0;
 	u64 search_start = start;
@@ -314,7 +294,8 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	int ret;
 
 	inline_limit = 0;
-	btrfs_drop_extent_cache(inode, start, end - 1, 0);
+	if (drop_cache)
+		btrfs_drop_extent_cache(inode, start, end - 1, 0);
 
 	path = btrfs_alloc_path();
 	if (!path)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 59cba180fe83..941f1b71cd22 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -231,7 +231,8 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 	}
 
 	ret = btrfs_drop_extents(trans, root, inode, start,
-				 aligned_end, aligned_end, start, &hint_byte);
+				 aligned_end, aligned_end, start,
+				 &hint_byte, 1);
 	BUG_ON(ret);
 
 	if (isize > actual_end)
@@ -240,7 +241,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 				   inline_len, compressed_size,
 				   compressed_pages);
 	BUG_ON(ret);
-	btrfs_drop_extent_cache(inode, start, aligned_end, 0);
+	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
 	return 0;
 }
 
@@ -425,7 +426,7 @@ static noinline int compress_file_range(struct inode *inode,
 			extent_clear_unlock_delalloc(inode,
 						     &BTRFS_I(inode)->io_tree,
 						     start, end, NULL, 1, 0,
-						     0, 1, 1, 1);
+						     0, 1, 1, 1, 0);
 			ret = 0;
 			goto free_pages_out;
 		}
@@ -611,9 +612,9 @@ static noinline int submit_compressed_extents(struct inode *inode,
 		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 
 		while (1) {
-			spin_lock(&em_tree->lock);
+			write_lock(&em_tree->lock);
 			ret = add_extent_mapping(em_tree, em);
-			spin_unlock(&em_tree->lock);
+			write_unlock(&em_tree->lock);
 			if (ret != -EEXIST) {
 				free_extent_map(em);
 				break;
@@ -640,7 +641,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
 					     async_extent->start,
 					     async_extent->start +
 					     async_extent->ram_size - 1,
-					     NULL, 1, 1, 0, 1, 1, 0);
+					     NULL, 1, 1, 0, 1, 1, 0, 0);
 
 		ret = btrfs_submit_compressed_write(inode,
 				    async_extent->start,
@@ -713,7 +714,7 @@ static noinline int cow_file_range(struct inode *inode,
 			extent_clear_unlock_delalloc(inode,
 						     &BTRFS_I(inode)->io_tree,
 						     start, end, NULL, 1, 1,
-						     1, 1, 1, 1);
+						     1, 1, 1, 1, 0);
 			*nr_written = *nr_written +
 			     (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
 			*page_started = 1;
@@ -747,9 +748,9 @@ static noinline int cow_file_range(struct inode *inode,
 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
 
 		while (1) {
-			spin_lock(&em_tree->lock);
+			write_lock(&em_tree->lock);
 			ret = add_extent_mapping(em_tree, em);
-			spin_unlock(&em_tree->lock);
+			write_unlock(&em_tree->lock);
 			if (ret != -EEXIST) {
 				free_extent_map(em);
 				break;
@@ -776,11 +777,14 @@ static noinline int cow_file_range(struct inode *inode,
 		/* we're not doing compressed IO, don't unlock the first
 		 * page (which the caller expects to stay locked), don't
 		 * clear any dirty bits and don't set any writeback bits
+		 *
+		 * Do set the Private2 bit so we know this page was properly
+		 * setup for writepage
 		 */
 		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
 					     start, start + ram_size - 1,
 					     locked_page, unlock, 1,
-					     1, 0, 0, 0);
+					     1, 0, 0, 0, 1);
 		disk_num_bytes -= cur_alloc_size;
 		num_bytes -= cur_alloc_size;
 		alloc_hint = ins.objectid + ins.offset;
@@ -853,7 +857,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 	int limit = 10 * 1024 * 1042;
 
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
-			 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
+			 EXTENT_DELALLOC, 1, 0, NULL, GFP_NOFS);
 	while (start < end) {
 		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
 		async_cow->inode = inode;
@@ -1080,9 +1084,9 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 			em->bdev = root->fs_info->fs_devices->latest_bdev;
 			set_bit(EXTENT_FLAG_PINNED, &em->flags);
 			while (1) {
-				spin_lock(&em_tree->lock);
+				write_lock(&em_tree->lock);
 				ret = add_extent_mapping(em_tree, em);
-				spin_unlock(&em_tree->lock);
+				write_unlock(&em_tree->lock);
 				if (ret != -EEXIST) {
 					free_extent_map(em);
 					break;
@@ -1101,7 +1105,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 
 		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
 					cur_offset, cur_offset + num_bytes - 1,
-					locked_page, 1, 1, 1, 0, 0, 0);
+					locked_page, 1, 1, 1, 0, 0, 0, 1);
 		cur_offset = extent_end;
 		if (cur_offset > end)
 			break;
@@ -1374,10 +1378,8 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
 	lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
 
 	/* already ordered? We're done */
-	if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
-			     EXTENT_ORDERED, 0)) {
+	if (PagePrivate2(page))
 		goto out;
-	}
 
 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
 	if (ordered) {
@@ -1413,11 +1415,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
 	struct inode *inode = page->mapping->host;
 	struct btrfs_writepage_fixup *fixup;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	int ret;
 
-	ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
-			     EXTENT_ORDERED, 0);
-	if (ret)
+	/* this page is properly in the ordered list */
+	if (TestClearPagePrivate2(page))
 		return 0;
 
 	if (PageChecked(page))
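
TestClearPagePrivate2() replaces the old test_range_bit(..., EXTENT_ORDERED, ...) check here. Because the test and the clear are a single atomic operation, exactly one caller can observe the bit set, and that caller alone becomes responsible for the ordered accounting. A hedged userspace model of the handoff, using a GCC atomic builtin (the bit number is made up; the kernel does this with test_and_clear_bit() on page->flags):

#include <stdio.h>

#define PG_PRIVATE_2	2		/* illustrative bit number only */

static unsigned long page_flags = 1UL << PG_PRIVATE_2;

static int test_clear_private2(unsigned long *flags)
{
	/* atomically fetch the old word and clear the bit */
	unsigned long old = __atomic_fetch_and(flags, ~(1UL << PG_PRIVATE_2),
					       __ATOMIC_SEQ_CST);
	return (old >> PG_PRIVATE_2) & 1;
}

int main(void)
{
	if (test_clear_private2(&page_flags))
		printf("we won: this path does the ordered accounting\n");
	if (!test_clear_private2(&page_flags))
		printf("second caller loses: bit already claimed\n");
	return 0;
}

The same rule shows up again in btrfs_invalidatepage() below: whoever clears Private2 must call btrfs_finish_ordered_io().
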
@@ -1455,9 +1455,19 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	BUG_ON(!path);
 
 	path->leave_spinning = 1;
+
+	/*
+	 * we may be replacing one extent in the tree with another.
+	 * The new extent is pinned in the extent map, and we don't want
+	 * to drop it from the cache until it is completely in the btree.
+	 *
+	 * So, tell btrfs_drop_extents to leave this extent in the cache.
+	 * The caller is expected to unpin it and allow it to be merged
+	 * with the others.
+	 */
 	ret = btrfs_drop_extents(trans, root, inode, file_pos,
 				 file_pos + num_bytes, locked_end,
-				 file_pos, &hint);
+				 file_pos, &hint, 0);
 	BUG_ON(ret);
 
 	ins.objectid = inode->i_ino;
@@ -1485,7 +1495,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(leaf);
 
 	inode_add_bytes(inode, num_bytes);
-	btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
 
 	ins.objectid = disk_bytenr;
 	ins.offset = disk_num_bytes;
@@ -1596,6 +1605,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 						ordered_extent->len,
 						compressed, 0, 0,
 						BTRFS_FILE_EXTENT_REG);
+		unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+				   ordered_extent->file_offset,
+				   ordered_extent->len);
 		BUG_ON(ret);
 	}
 	unlock_extent(io_tree, ordered_extent->file_offset,
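
These two inode.c hunks work as a pair. insert_reserved_file_extent() passes the new trailing 0 so btrfs_drop_extents() leaves the pinned extent mapping in the cache while the file extent item is being inserted, and btrfs_finish_ordered_io() unpins it only once the item is safely in the btree, at which point it may merge with its neighbours. A toy model of that pin/unpin lifecycle (userspace sketch; field names are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

struct extent_map {
	unsigned long long start, len;
	bool pinned;			/* models EXTENT_FLAG_PINNED */
	bool in_cache;
};

/* models btrfs_drop_extents(..., drop_cache): pinned mappings survive */
static void drop_extents(struct extent_map *em, bool drop_cache)
{
	if (drop_cache && !em->pinned)
		em->in_cache = false;
}

/* models unpin_extent_cache(): called after the btree insert commits */
static void unpin_extent_cache(struct extent_map *em)
{
	em->pinned = false;		/* now eligible for merging */
}

int main(void)
{
	struct extent_map em = { 0, 4096, true, true };

	drop_extents(&em, false);	/* the new argument: keep the cache */
	printf("still cached while in flight: %d\n", em.in_cache);
	unpin_extent_cache(&em);	/* finish_ordered_io's side of the deal */
	return 0;
}
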
@@ -1623,6 +1635,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 				struct extent_state *state, int uptodate)
 {
+	ClearPagePrivate2(page);
 	return btrfs_finish_ordered_io(page->mapping->host, start, end);
 }
 
@@ -1669,13 +1682,13 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 		failrec->last_mirror = 0;
 		failrec->bio_flags = 0;
 
-		spin_lock(&em_tree->lock);
+		read_lock(&em_tree->lock);
 		em = lookup_extent_mapping(em_tree, start, failrec->len);
 		if (em->start > start || em->start + em->len < start) {
 			free_extent_map(em);
 			em = NULL;
 		}
-		spin_unlock(&em_tree->lock);
+		read_unlock(&em_tree->lock);
 
 		if (!em || IS_ERR(em)) {
 			kfree(failrec);
@@ -1794,7 +1807,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 		return 0;
 
 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
-	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) {
+	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
 		clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
 				  GFP_NOFS);
 		return 0;
@@ -2935,7 +2948,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 						 cur_offset,
 						 cur_offset + hole_size,
 						 block_end,
-						 cur_offset, &hint_byte);
+						 cur_offset, &hint_byte, 1);
 			if (err)
 				break;
 			err = btrfs_insert_file_extent(trans, root,
@@ -4064,11 +4077,11 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 	int compressed;
 
 again:
-	spin_lock(&em_tree->lock);
+	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, start, len);
 	if (em)
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
-	spin_unlock(&em_tree->lock);
+	read_unlock(&em_tree->lock);
 
 	if (em) {
 		if (em->start > start || em->start + em->len <= start)
@@ -4215,6 +4228,11 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 				map = kmap(page);
 				read_extent_buffer(leaf, map + pg_offset, ptr,
 						   copy_size);
+				if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
+					memset(map + pg_offset + copy_size, 0,
+					       PAGE_CACHE_SIZE - pg_offset -
+					       copy_size);
+				}
 				kunmap(page);
 			}
 			flush_dcache_page(page);
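
The added memset closes a stale-data window: an inline extent may cover less than a full page, and without the padding whatever bytes sit past copy_size in the page would be exposed to readers. A small self-contained illustration of the fix (PAGE_SIZE stands in for PAGE_CACHE_SIZE; plain memory stands in for the kmapped page):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

static void fill_page_from_inline(char *page, size_t pg_offset,
				  const char *data, size_t copy_size)
{
	memcpy(page + pg_offset, data, copy_size);
	/* the new hunk: zero everything between the copied bytes
	 * and the end of the page */
	if (pg_offset + copy_size < PAGE_SIZE)
		memset(page + pg_offset + copy_size, 0,
		       PAGE_SIZE - pg_offset - copy_size);
}

int main(void)
{
	static char page[PAGE_SIZE] = { [100] = 'X' };	/* pretend stale byte */

	fill_page_from_inline(page, 0, "hello", 5);
	printf("stale byte after fill: %d\n", page[100]);	/* now 0 */
	return 0;
}
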
@@ -4259,7 +4277,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 	}
 
 	err = 0;
-	spin_lock(&em_tree->lock);
+	write_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
 	/* it is possible that someone inserted the extent into the tree
 	 * while we had the lock dropped.  It is also possible that
@@ -4299,7 +4317,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 			err = 0;
 		}
 	}
-	spin_unlock(&em_tree->lock);
+	write_unlock(&em_tree->lock);
 out:
 	if (path)
 		btrfs_free_path(path);
@@ -4398,13 +4416,20 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 	u64 page_start = page_offset(page);
 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
 
+	/*
+	 * we have the page locked, so new writeback can't start,
+	 * and the dirty bit won't be cleared while we are here.
+	 *
+	 * Wait for IO on this page so that we can safely clear
+	 * the PagePrivate2 bit and do ordered accounting
+	 */
 	wait_on_page_writeback(page);
+
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	if (offset) {
 		btrfs_releasepage(page, GFP_NOFS);
 		return;
 	}
-
 	lock_extent(tree, page_start, page_end, GFP_NOFS);
 	ordered = btrfs_lookup_ordered_extent(page->mapping->host,
 					   page_offset(page));
@@ -4415,16 +4441,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 		 */
 		clear_extent_bit(tree, page_start, page_end,
 				 EXTENT_DIRTY | EXTENT_DELALLOC |
-				 EXTENT_LOCKED, 1, 0, GFP_NOFS);
-		btrfs_finish_ordered_io(page->mapping->host,
-					page_start, page_end);
+				 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
+		/*
+		 * whoever cleared the private bit is responsible
+		 * for the finish_ordered_io
+		 */
+		if (TestClearPagePrivate2(page)) {
+			btrfs_finish_ordered_io(page->mapping->host,
+						page_start, page_end);
+		}
 		btrfs_put_ordered_extent(ordered);
 		lock_extent(tree, page_start, page_end, GFP_NOFS);
 	}
 	clear_extent_bit(tree, page_start, page_end,
-		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
-		 EXTENT_ORDERED,
-		 1, 1, GFP_NOFS);
+		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
+		 1, 1, NULL, GFP_NOFS);
 	__btrfs_releasepage(page, GFP_NOFS);
 
 	ClearPageChecked(page);
@@ -4521,11 +4552,14 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	}
 	ClearPageChecked(page);
 	set_page_dirty(page);
+	SetPageUptodate(page);
 
 	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
 	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
 out_unlock:
+	if (!ret)
+		return VM_FAULT_LOCKED;
 	unlock_page(page);
 out:
 	return ret;
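
On success btrfs_page_mkwrite() now hands the page back still locked and says so by returning VM_FAULT_LOCKED, sparing the fault path a second lock/validate cycle; SetPageUptodate() is safe because the whole page was just brought up to date under the extent lock. A sketch of the return convention (the constant's value matches the kernel headers of this era, but treat the snippet as illustrative):

#include <stdio.h>

#define VM_FAULT_LOCKED	0x0200	/* from <linux/mm.h>; illustrative here */

/* models the tail of a page_mkwrite handler */
static int page_mkwrite_model(int err, int *page_locked)
{
	if (!err)
		return VM_FAULT_LOCKED;	/* success: keep the page locked */
	*page_locked = 0;		/* error path unlocks as before */
	return err;
}

int main(void)
{
	int locked = 1;

	printf("ret=0x%x, still locked=%d\n",
	       page_mkwrite_model(0, &locked), locked);
	return 0;
}
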
@@ -5058,6 +5092,8 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
 						  0, 0, 0,
 						  BTRFS_FILE_EXTENT_PREALLOC);
 		BUG_ON(ret);
+		btrfs_drop_extent_cache(inode, cur_offset,
+					cur_offset + ins.offset - 1, 0);
 		num_bytes -= ins.offset;
 		cur_offset += ins.offset;
 		alloc_hint = ins.objectid + ins.offset;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index bd88f25889f7..ef0188fb3cc4 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -596,9 +596,8 @@ static int btrfs_defrag_file(struct file *file)
 		clear_page_dirty_for_io(page);
 
 		btrfs_set_extent_delalloc(inode, page_start, page_end);
-
-		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 		set_page_dirty(page);
+		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 		unlock_page(page);
 		page_cache_release(page);
 		balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
@@ -976,7 +975,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 
 	/* punch hole in destination first */
 	btrfs_drop_extents(trans, root, inode, off, off + len,
-			   off + len, 0, &hint_byte);
+			   off + len, 0, &hint_byte, 1);
 
 	/* clone data */
 	key.objectid = src->i_ino;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index d6f0806c682f..4a9c8c4cec25 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -159,8 +159,6 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
  *
  * len is the length of the extent
  *
- * This also sets the EXTENT_ORDERED bit on the range in the inode.
- *
  * The tree is given a single reference on the ordered extent that was
  * inserted.
  */
@@ -181,6 +179,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	entry->start = start;
 	entry->len = len;
 	entry->disk_len = disk_len;
+	entry->bytes_left = len;
 	entry->inode = inode;
 	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
 		set_bit(type, &entry->flags);
@@ -195,9 +194,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 			   &entry->rb_node);
 	BUG_ON(node);
 
-	set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
-			   entry_end(entry) - 1, GFP_NOFS);
-
 	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
 	list_add_tail(&entry->root_extent_list,
 		      &BTRFS_I(inode)->root->fs_info->ordered_extents);
@@ -241,13 +237,10 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
 	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
 	struct btrfs_ordered_extent *entry;
-	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	int ret;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
 	mutex_lock(&tree->mutex);
-	clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
-			     GFP_NOFS);
 	node = tree_search(tree, file_offset);
 	if (!node) {
 		ret = 1;
@@ -260,11 +253,16 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
 		goto out;
 	}
 
-	ret = test_range_bit(io_tree, entry->file_offset,
-			     entry->file_offset + entry->len - 1,
-			     EXTENT_ORDERED, 0);
-	if (ret == 0)
+	if (io_size > entry->bytes_left) {
+		printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
+		       (unsigned long long)entry->bytes_left,
+		       (unsigned long long)io_size);
+	}
+	entry->bytes_left -= io_size;
+	if (entry->bytes_left == 0)
 		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+	else
+		ret = 1;
 out:
 	mutex_unlock(&tree->mutex);
 	return ret == 0;
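
This hunk is the heart of the EXTENT_ORDERED removal: rather than setting and clearing a bit across each range in the io_tree, every IO completion subtracts its size from the new bytes_left counter, and whichever completion drives it to zero wins the test_and_set_bit() and marks the ordered extent done. A self-contained model of the dec-and-test (userspace sketch; a pthread mutex stands in for tree->mutex):

#include <pthread.h>
#include <stdio.h>

struct ordered_extent {
	unsigned long long bytes_left;	/* bytes still needing writeback */
	int io_done;
	pthread_mutex_t mutex;
};

/* returns 1 when this completion finished the whole ordered extent */
static int dec_test_ordered_pending(struct ordered_extent *oe,
				    unsigned long long io_size)
{
	int done = 0;

	pthread_mutex_lock(&oe->mutex);
	if (io_size > oe->bytes_left)
		fprintf(stderr, "bad ordered accounting left %llu size %llu\n",
			oe->bytes_left, io_size);
	oe->bytes_left -= io_size;
	if (oe->bytes_left == 0 && !oe->io_done) {
		oe->io_done = 1;	/* models BTRFS_ORDERED_IO_DONE */
		done = 1;
	}
	pthread_mutex_unlock(&oe->mutex);
	return done;
}

int main(void)
{
	struct ordered_extent oe = { 8192, 0, PTHREAD_MUTEX_INITIALIZER };

	printf("first 4k done? %d\n", dec_test_ordered_pending(&oe, 4096));
	printf("second 4k done? %d\n", dec_test_ordered_pending(&oe, 4096));
	return 0;
}
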
@@ -476,6 +474,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 	u64 orig_end;
 	u64 wait_end;
 	struct btrfs_ordered_extent *ordered;
+	int found;
 
 	if (start + len < start) {
 		orig_end = INT_LIMIT(loff_t);
@@ -502,6 +501,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 					   orig_end >> PAGE_CACHE_SHIFT);
 
 	end = orig_end;
+	found = 0;
 	while (1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, end);
 		if (!ordered)
@@ -514,6 +514,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 			btrfs_put_ordered_extent(ordered);
 			break;
 		}
+		found++;
 		btrfs_start_ordered_extent(inode, ordered, 1);
 		end = ordered->file_offset;
 		btrfs_put_ordered_extent(ordered);
@@ -521,8 +522,8 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 			break;
 		end--;
 	}
-	if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
-			   EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
+	if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
+			   EXTENT_DELALLOC, 0, NULL)) {
 		schedule_timeout(1);
 		goto again;
 	}
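
With EXTENT_ORDERED gone there is no bit left to test over the range, so the waiter counts the ordered extents it actually had to wait on this pass and retries until a full pass finds none and no delalloc remains. A compact model of the retry loop (the helpers are hypothetical stand-ins for the ordered-extent lookups in the real function):

#include <stdio.h>

static int pending = 3;			/* pretend three ordered extents exist */

static int have_ordered_extent(void) { return pending > 0; }	/* stand-in lookup */
static void wait_for_one(void)       { pending--; }		/* models btrfs_start_ordered_extent() */
static int range_has_delalloc(void)  { return 0; }		/* models the test_range_bit() call */

int main(void)
{
	int found, passes = 0;

	do {
		found = 0;
		while (have_ordered_extent()) {	/* one backwards walk over the range */
			wait_for_one();
			found++;
		}
		passes++;
	} while (found || range_has_delalloc());

	printf("range clean after %d pass(es)\n", passes);
	return 0;
}
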
@@ -613,7 +614,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
 	 */
 	if (test_range_bit(io_tree, disk_i_size,
 			   ordered->file_offset + ordered->len - 1,
-			   EXTENT_DELALLOC, 0)) {
+			   EXTENT_DELALLOC, 0, NULL)) {
 		goto out;
 	}
 	/*
@@ -664,7 +665,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
 	 */
 	if (i_size_test > entry_end(ordered) &&
 	    !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
-			   EXTENT_DELALLOC, 0)) {
+			   EXTENT_DELALLOC, 0, NULL)) {
 		new_i_size = min_t(u64, i_size_test, i_size_read(inode));
 	}
 	BTRFS_I(inode)->disk_i_size = new_i_size;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 3d31c8827b01..993a7ea45c70 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -85,6 +85,9 @@ struct btrfs_ordered_extent {
 	/* extent length on disk */
 	u64 disk_len;
 
+	/* number of bytes that still need writing */
+	u64 bytes_left;
+
 	/* flags (described above) */
 	unsigned long flags;
 
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index c04f7f212602..3be16ccc7eea 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2180,7 +2180,7 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
 				struct reloc_control *rc)
 {
 	if (test_range_bit(&rc->processed_blocks, bytenr,
-			   bytenr + blocksize - 1, EXTENT_DIRTY, 1))
+			   bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
 		return 1;
 	return 0;
 }
@@ -2646,9 +2646,9 @@ int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key)
 	lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
 	while (1) {
 		int ret;
-		spin_lock(&em_tree->lock);
+		write_lock(&em_tree->lock);
 		ret = add_extent_mapping(em_tree, em);
-		spin_unlock(&em_tree->lock);
+		write_unlock(&em_tree->lock);
 		if (ret != -EEXIST) {
 			free_extent_map(em);
 			break;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d91b0de7c502..8661a7381b39 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -534,7 +534,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 	saved_nbytes = inode_get_bytes(inode);
 	/* drop any overlapping extents */
 	ret = btrfs_drop_extents(trans, root, inode,
-			 start, extent_end, extent_end, start, &alloc_hint);
+			 start, extent_end, extent_end, start, &alloc_hint, 1);
 	BUG_ON(ret);
 
 	if (found_type == BTRFS_FILE_EXTENT_REG ||
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5dbefd11b4af..d2358c06bbd9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -276,7 +276,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 		 * is now congested.  Back off and let other work structs
 		 * run instead
 		 */
-		if (pending && bdi_write_congested(bdi) && batch_run > 32 &&
+		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
 		    fs_info->fs_devices->open_devices > 1) {
 			struct io_context *ioc;
 
@@ -1749,9 +1749,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
 	 * step two, delete the device extents and the
 	 * chunk tree entries
 	 */
-	spin_lock(&em_tree->lock);
+	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
-	spin_unlock(&em_tree->lock);
+	read_unlock(&em_tree->lock);
 
 	BUG_ON(em->start > chunk_offset ||
 	       em->start + em->len < chunk_offset);
@@ -1780,9 +1780,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
 	ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
 	BUG_ON(ret);
 
-	spin_lock(&em_tree->lock);
+	write_lock(&em_tree->lock);
 	remove_extent_mapping(em_tree, em);
-	spin_unlock(&em_tree->lock);
+	write_unlock(&em_tree->lock);
 
 	kfree(map);
 	em->bdev = NULL;
@@ -2294,9 +2294,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	em->block_len = em->len;
 
 	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
-	spin_lock(&em_tree->lock);
+	write_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
-	spin_unlock(&em_tree->lock);
+	write_unlock(&em_tree->lock);
 	BUG_ON(ret);
 	free_extent_map(em);
 
@@ -2491,9 +2491,9 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
 	int readonly = 0;
 	int i;
 
-	spin_lock(&map_tree->map_tree.lock);
+	read_lock(&map_tree->map_tree.lock);
 	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
-	spin_unlock(&map_tree->map_tree.lock);
+	read_unlock(&map_tree->map_tree.lock);
 	if (!em)
 		return 1;
 
@@ -2518,11 +2518,11 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
 	struct extent_map *em;
 
 	while (1) {
-		spin_lock(&tree->map_tree.lock);
+		write_lock(&tree->map_tree.lock);
 		em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
 		if (em)
 			remove_extent_mapping(&tree->map_tree, em);
-		spin_unlock(&tree->map_tree.lock);
+		write_unlock(&tree->map_tree.lock);
 		if (!em)
 			break;
 		kfree(em->bdev);
@@ -2540,9 +2540,9 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
 	struct extent_map_tree *em_tree = &map_tree->map_tree;
 	int ret;
 
-	spin_lock(&em_tree->lock);
+	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, len);
-	spin_unlock(&em_tree->lock);
+	read_unlock(&em_tree->lock);
 	BUG_ON(!em);
 
 	BUG_ON(em->start > logical || em->start + em->len < logical);
@@ -2604,9 +2604,9 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		atomic_set(&multi->error, 0);
 	}
 
-	spin_lock(&em_tree->lock);
+	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, *length);
-	spin_unlock(&em_tree->lock);
+	read_unlock(&em_tree->lock);
 
 	if (!em && unplug_page)
 		return 0;
@@ -2763,9 +2763,9 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 	u64 stripe_nr;
 	int i, j, nr = 0;
 
-	spin_lock(&em_tree->lock);
+	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, chunk_start, 1);
-	spin_unlock(&em_tree->lock);
+	read_unlock(&em_tree->lock);
 
 	BUG_ON(!em || em->start != chunk_start);
 	map = (struct map_lookup *)em->bdev;
@@ -3053,9 +3053,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	logical = key->offset;
 	length = btrfs_chunk_length(leaf, chunk);
 
-	spin_lock(&map_tree->map_tree.lock);
+	read_lock(&map_tree->map_tree.lock);
 	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
-	spin_unlock(&map_tree->map_tree.lock);
+	read_unlock(&map_tree->map_tree.lock);
 
 	/* already mapped? */
 	if (em && em->start <= logical && em->start + em->len > logical) {
@@ -3114,9 +3114,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 		map->stripes[i].dev->in_fs_metadata = 1;
 	}
 
-	spin_lock(&map_tree->map_tree.lock);
+	write_lock(&map_tree->map_tree.lock);
 	ret = add_extent_mapping(&map_tree->map_tree, em);
-	spin_unlock(&map_tree->map_tree.lock);
+	write_unlock(&map_tree->map_tree.lock);
 	BUG_ON(ret);
 	free_extent_map(em);