diff --git a/Documentation/dma-buf-sharing.txt b/Documentation/dma-buf-sharing.txt
index 225f96d88f55..3bbd5c51605a 100644
--- a/Documentation/dma-buf-sharing.txt
+++ b/Documentation/dma-buf-sharing.txt
@@ -32,8 +32,12 @@ The buffer-user
 *IMPORTANT*: [see https://lkml.org/lkml/2011/12/20/211 for more details]
 For this first version, a buffer shared using the dma_buf sharing API:
 - *may* be exported to user space using "mmap" *ONLY* by exporter, outside of
-   this framework.
-- may be used *ONLY* by importers that do not need CPU access to the buffer.
+  this framework.
+- with this new iteration of the dma-buf api, cpu access from the kernel has
+  been enabled; see below for the details.
+
+dma-buf operations for device dma only
+--------------------------------------
 
 The dma_buf buffer sharing API usage contains the following steps:
 
@@ -219,10 +223,120 @@ NOTES:
    If the exporter chooses not to allow an attach() operation once a
    map_dma_buf() API has been called, it simply returns an error.
 
-Miscellaneous notes:
+Kernel cpu access to a dma-buf buffer object
+--------------------------------------------
+
+The motivations for allowing cpu access from the kernel to a dma-buf object on
+the importer's side are:
+- fallback operations, e.g. if the device is connected to a usb bus and the
+  kernel needs to shuffle the data around first before sending it away.
+- full transparency for existing users on the importer side, i.e. userspace
+  should not notice the difference between a normal object from that subsystem
+  and an imported one backed by a dma-buf. This is really important for drm
+  opengl drivers that expect to still use all the existing upload/download
+  paths.
+
+Access to a dma_buf from the kernel context involves three steps:
+
+1. Prepare access, which invalidates any necessary caches and makes the object
+   available for cpu access.
+2. Access the object page-by-page with the dma_buf map apis.
+3. Finish access, which will flush any necessary cpu caches and free reserved
+   resources.
+
+1. Prepare access
+
+   Before an importer can access a dma_buf object with the cpu from the kernel
+   context, it needs to notify the exporter of the access that is about to
+   happen.
+
+   Interface:
+      int dma_buf_begin_cpu_access(struct dma_buf *dmabuf,
+				   size_t start, size_t len,
+				   enum dma_data_direction direction)
+
+   This allows the exporter to ensure that the memory is actually available for
+   cpu access - the exporter might need to allocate or swap-in and pin the
+   backing storage. The exporter also needs to ensure that cpu access is
+   coherent for the given range and access direction. The range and access
+   direction can be used by the exporter to optimize the cache flushing, i.e.
+   access outside of the range or with a different direction (read instead of
+   write) might return stale or even bogus data (e.g. when the exporter needs to
+   copy the data to temporary storage).
+
+   This step might fail, e.g. in oom conditions.
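+
+   For example (a sketch only; dmabuf, start and len are assumed to come from
+   the surrounding importer code):
+
+      ret = dma_buf_begin_cpu_access(dmabuf, start, len, DMA_FROM_DEVICE);
+      if (ret)
+              return ret;     /* e.g. -ENOMEM under memory pressure */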
+
+2. Accessing the buffer
+
+   To support dma_buf objects residing in highmem, cpu access is page-based,
+   using an api similar to kmap. Accessing a dma_buf is done in aligned chunks of
+   PAGE_SIZE size. Before accessing a chunk it needs to be mapped, which returns
+   a pointer in kernel virtual address space. Afterwards the chunk needs to be
+   unmapped again. There is no limit on how often a given chunk can be mapped
+   and unmapped, i.e. the importer does not need to call begin_cpu_access again
+   before mapping the same chunk again.
+
+   Interfaces:
+      void *dma_buf_kmap(struct dma_buf *, unsigned long);
+      void dma_buf_kunmap(struct dma_buf *, unsigned long, void *);
+
+   There are also atomic variants of these interfaces. As with kmap, they
+   facilitate non-blocking fast-paths. Neither the importer nor the exporter (in
+   the callback) is allowed to block when using these.
+
+   Interfaces:
+      void *dma_buf_kmap_atomic(struct dma_buf *, unsigned long);
+      void dma_buf_kunmap_atomic(struct dma_buf *, unsigned long, void *);
+
+   For importers, all the restrictions of using kmap apply, like the limited
+   supply of kmap_atomic slots. Hence an importer shall only hold onto at most 2
+   atomic dma_buf kmaps at the same time (in any given process context).
+
+   dma_buf kmap calls outside of the range specified in begin_cpu_access are
+   undefined. If the range is not PAGE_SIZE aligned, kmap needs to succeed on
+   the partial chunks at the beginning and end but may return stale or bogus
+   data outside of the range (in these partial chunks).
+
+   Note that these calls need to always succeed. The exporter needs to complete
+   any preparations that might fail in begin_cpu_access.
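+
+   For example, reading the first page of a prepared, PAGE_SIZE aligned range
+   (a sketch; buf is some kernel buffer owned by the importer):
+
+      void *vaddr = dma_buf_kmap(dmabuf, start >> PAGE_SHIFT);
+
+      memcpy(buf, vaddr, PAGE_SIZE);
+      dma_buf_kunmap(dmabuf, start >> PAGE_SHIFT, vaddr);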
+
+3. Finish access
+
+   When the importer is done accessing the range specified in begin_cpu_access,
+   it needs to announce this to the exporter (to facilitate cache flushing and
+   unpinning of any pinned resources). The result of any dma_buf kmap calls
+   after end_cpu_access is undefined.
+
+   Interface:
+      void dma_buf_end_cpu_access(struct dma_buf *dma_buf,
+				  size_t start, size_t len,
+				  enum dma_data_direction dir);
+
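+Putting the three steps together, a complete cpu access sequence from an
+importer could look like this. This is just a sketch: consume_page() is a
+made-up helper, and the range is assumed to be PAGE_SIZE aligned.
+
+      static int importer_read_back(struct dma_buf *dmabuf,
+                                    size_t start, size_t len)
+      {
+              unsigned long i;
+              int ret;
+
+              ret = dma_buf_begin_cpu_access(dmabuf, start, len,
+                                             DMA_FROM_DEVICE);
+              if (ret)
+                      return ret;
+
+              for (i = start >> PAGE_SHIFT;
+                   i < (start + len) >> PAGE_SHIFT; i++) {
+                      void *vaddr = dma_buf_kmap(dmabuf, i);
+
+                      consume_page(vaddr);
+                      dma_buf_kunmap(dmabuf, i, vaddr);
+              }
+
+              dma_buf_end_cpu_access(dmabuf, start, len, DMA_FROM_DEVICE);
+              return 0;
+      }
+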
+
+Miscellaneous notes
+-------------------
+
 - Any exporters or users of the dma-buf buffer sharing framework must have
   a 'select DMA_SHARED_BUFFER' in their respective Kconfigs.
 
+- In order to avoid fd leaks on exec, the FD_CLOEXEC flag must be set
+  on the file descriptor.  This is not just a resource leak, but a
+  potential security hole.  It could give the newly exec'd application
+  access, via the leaked fd, to buffers it should otherwise not be
+  permitted to access.
+
+  The problem with doing this via a separate fcntl() call, versus doing it
+  atomically when the fd is created, is that this is inherently racy in a
+  multi-threaded app[3].  The issue is made worse when it is library code
+  opening/creating the file descriptor, as the application may not even be
+  aware of the fds.
+
+  To avoid this problem, userspace must have a way to request that the
+  O_CLOEXEC flag be set when the dma-buf fd is created.  So any API provided
+  by the exporting driver to create a dmabuf fd must provide a way to let
+  userspace control setting of the O_CLOEXEC flag passed in to dma_buf_fd().
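+
+  For illustration, an exporter ioctl honouring such a request could do the
+  following (a sketch; MY_EXPORT_CLOEXEC and the args struct are made-up
+  names, not an existing uapi):
+
+      int fd = dma_buf_fd(dmabuf,
+                          (args->flags & MY_EXPORT_CLOEXEC) ? O_CLOEXEC : 0);
+      if (fd < 0)
+              return fd;
+      args->fd = fd;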
+
 References:
 [1] struct dma_buf_ops in include/linux/dma-buf.h
 [2] All interfaces mentioned above defined in include/linux/dma-buf.h
+[3] https://lwn.net/Articles/236486/
diff --git a/drivers/base/dma-buf.c b/drivers/base/dma-buf.c
index e38ad243b4bb..07cbbc6fddb4 100644
--- a/drivers/base/dma-buf.c
+++ b/drivers/base/dma-buf.c
@@ -71,7 +71,7 @@ static inline int is_dma_buf_file(struct file *file)
  * ops, or error in allocating struct dma_buf, will return negative error.
  *
  */
-struct dma_buf *dma_buf_export(void *priv, struct dma_buf_ops *ops,
+struct dma_buf *dma_buf_export(void *priv, const struct dma_buf_ops *ops,
 				size_t size, int flags)
 {
 	struct dma_buf *dmabuf;
@@ -80,7 +80,9 @@ struct dma_buf *dma_buf_export(void *priv, struct dma_buf_ops *ops,
 	if (WARN_ON(!priv || !ops
 			  || !ops->map_dma_buf
 			  || !ops->unmap_dma_buf
-			  || !ops->release)) {
+			  || !ops->release
+			  || !ops->kmap_atomic
+			  || !ops->kmap)) {
 		return ERR_PTR(-EINVAL);
 	}
 
@@ -107,17 +109,18 @@ EXPORT_SYMBOL_GPL(dma_buf_export);
 /**
  * dma_buf_fd - returns a file descriptor for the given dma_buf
  * @dmabuf:	[in]	pointer to dma_buf for which fd is required.
+ * @flags:      [in]    flags to give to the fd, e.g. O_CLOEXEC
  *
  * On success, returns an associated 'fd'. Else, returns error.
  */
-int dma_buf_fd(struct dma_buf *dmabuf)
+int dma_buf_fd(struct dma_buf *dmabuf, int flags)
 {
 	int error, fd;
 
 	if (!dmabuf || !dmabuf->file)
 		return -EINVAL;
 
-	error = get_unused_fd();
+	error = get_unused_fd_flags(flags);
 	if (error < 0)
 		return error;
 	fd = error;
@@ -185,17 +188,18 @@ struct dma_buf_attachment *dma_buf_attach(struct dma_buf *dmabuf,
 	struct dma_buf_attachment *attach;
 	int ret;
 
-	if (WARN_ON(!dmabuf || !dev || !dmabuf->ops))
+	if (WARN_ON(!dmabuf || !dev))
 		return ERR_PTR(-EINVAL);
 
 	attach = kzalloc(sizeof(struct dma_buf_attachment), GFP_KERNEL);
 	if (attach == NULL)
-		goto err_alloc;
-
-	mutex_lock(&dmabuf->lock);
+		return ERR_PTR(-ENOMEM);
 
 	attach->dev = dev;
 	attach->dmabuf = dmabuf;
+
+	mutex_lock(&dmabuf->lock);
+
 	if (dmabuf->ops->attach) {
 		ret = dmabuf->ops->attach(dmabuf, dev, attach);
 		if (ret)
@@ -206,8 +210,6 @@ struct dma_buf_attachment *dma_buf_attach(struct dma_buf *dmabuf,
 	mutex_unlock(&dmabuf->lock);
 	return attach;
 
-err_alloc:
-	return ERR_PTR(-ENOMEM);
 err_attach:
 	kfree(attach);
 	mutex_unlock(&dmabuf->lock);
@@ -224,7 +226,7 @@ EXPORT_SYMBOL_GPL(dma_buf_attach);
  */
 void dma_buf_detach(struct dma_buf *dmabuf, struct dma_buf_attachment *attach)
 {
-	if (WARN_ON(!dmabuf || !attach || !dmabuf->ops))
+	if (WARN_ON(!dmabuf || !attach))
 		return;
 
 	mutex_lock(&dmabuf->lock);
@@ -255,13 +257,10 @@ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *attach,
 
 	might_sleep();
 
-	if (WARN_ON(!attach || !attach->dmabuf || !attach->dmabuf->ops))
+	if (WARN_ON(!attach || !attach->dmabuf))
 		return ERR_PTR(-EINVAL);
 
-	mutex_lock(&attach->dmabuf->lock);
-	if (attach->dmabuf->ops->map_dma_buf)
-		sg_table = attach->dmabuf->ops->map_dma_buf(attach, direction);
-	mutex_unlock(&attach->dmabuf->lock);
+	sg_table = attach->dmabuf->ops->map_dma_buf(attach, direction);
 
 	return sg_table;
 }
@@ -273,19 +272,137 @@ EXPORT_SYMBOL_GPL(dma_buf_map_attachment);
  * dma_buf_ops.
  * @attach:	[in]	attachment to unmap buffer from
  * @sg_table:	[in]	scatterlist info of the buffer to unmap
+ * @direction:  [in]    direction of DMA transfer
  *
  */
 void dma_buf_unmap_attachment(struct dma_buf_attachment *attach,
-				struct sg_table *sg_table)
+				struct sg_table *sg_table,
+				enum dma_data_direction direction)
 {
-	if (WARN_ON(!attach || !attach->dmabuf || !sg_table
-			    || !attach->dmabuf->ops))
+	if (WARN_ON(!attach || !attach->dmabuf || !sg_table))
 		return;
 
-	mutex_lock(&attach->dmabuf->lock);
-	if (attach->dmabuf->ops->unmap_dma_buf)
-		attach->dmabuf->ops->unmap_dma_buf(attach, sg_table);
-	mutex_unlock(&attach->dmabuf->lock);
-
+	attach->dmabuf->ops->unmap_dma_buf(attach, sg_table,
+						direction);
 }
 EXPORT_SYMBOL_GPL(dma_buf_unmap_attachment);
+
+
+/**
+ * dma_buf_begin_cpu_access - Must be called before accessing a dma_buf from the
+ * cpu in the kernel context. Calls begin_cpu_access to allow exporter-specific
+ * preparations. Coherency is only guaranteed in the specified range for the
+ * specified access direction.
+ * @dma_buf:	[in]	buffer to prepare cpu access for.
+ * @start:	[in]	start of range for cpu access.
+ * @len:	[in]	length of range for cpu access.
+ * @direction:	[in]	direction of the cpu access.
+ *
+ * Can return negative error values, returns 0 on success.
+ */
+int dma_buf_begin_cpu_access(struct dma_buf *dmabuf, size_t start, size_t len,
+			     enum dma_data_direction direction)
+{
+	int ret = 0;
+
+	if (WARN_ON(!dmabuf))
+		return -EINVAL;
+
+	if (dmabuf->ops->begin_cpu_access)
+		ret = dmabuf->ops->begin_cpu_access(dmabuf, start, len, direction);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dma_buf_begin_cpu_access);
+
+/**
+ * dma_buf_end_cpu_access - Must be called after accessing a dma_buf from the
+ * cpu in the kernel context. Calls end_cpu_access to allow exporter-specific
+ * actions. Coherency is only guaranteed in the specified range for the
+ * specified access direction.
+ * @dma_buf:	[in]	buffer to complete cpu access for.
+ * @start:	[in]	start of range for cpu access.
+ * @len:	[in]	length of range for cpu access.
+ * @direction:	[in]	direction of the cpu access.
+ *
+ * This call must always succeed.
+ */
+void dma_buf_end_cpu_access(struct dma_buf *dmabuf, size_t start, size_t len,
+			    enum dma_data_direction direction)
+{
+	WARN_ON(!dmabuf);
+
+	if (dmabuf->ops->end_cpu_access)
+		dmabuf->ops->end_cpu_access(dmabuf, start, len, direction);
+}
+EXPORT_SYMBOL_GPL(dma_buf_end_cpu_access);
+
+/**
+ * dma_buf_kmap_atomic - Map a page of the buffer object into kernel address
+ * space. The same restrictions as for kmap_atomic and friends apply.
+ * @dma_buf:	[in]	buffer to map page from.
+ * @page_num:	[in]	page in PAGE_SIZE units to map.
+ *
+ * This call must always succeed, any necessary preparations that might fail
+ * need to be done in begin_cpu_access.
+ */
+void *dma_buf_kmap_atomic(struct dma_buf *dmabuf, unsigned long page_num)
+{
+	WARN_ON(!dmabuf);
+
+	return dmabuf->ops->kmap_atomic(dmabuf, page_num);
+}
+EXPORT_SYMBOL_GPL(dma_buf_kmap_atomic);
+
+/**
+ * dma_buf_kunmap_atomic - Unmap a page obtained by dma_buf_kmap_atomic.
+ * @dma_buf:	[in]	buffer to unmap page from.
+ * @page_num:	[in]	page in PAGE_SIZE units to unmap.
+ * @vaddr:	[in]	kernel space pointer obtained from dma_buf_kmap_atomic.
+ *
+ * This call must always succeed.
+ */
+void dma_buf_kunmap_atomic(struct dma_buf *dmabuf, unsigned long page_num,
+			   void *vaddr)
+{
+	WARN_ON(!dmabuf);
+
+	if (dmabuf->ops->kunmap_atomic)
+		dmabuf->ops->kunmap_atomic(dmabuf, page_num, vaddr);
+}
+EXPORT_SYMBOL_GPL(dma_buf_kunmap_atomic);
+
+/**
+ * dma_buf_kmap - Map a page of the buffer object into kernel address space. The
+ * same restrictions as for kmap and friends apply.
+ * @dma_buf:	[in]	buffer to map page from.
+ * @page_num:	[in]	page in PAGE_SIZE units to map.
+ *
+ * This call must always succeed, any necessary preparations that might fail
+ * need to be done in begin_cpu_access.
+ */
+void *dma_buf_kmap(struct dma_buf *dmabuf, unsigned long page_num)
+{
+	WARN_ON(!dmabuf);
+
+	return dmabuf->ops->kmap(dmabuf, page_num);
+}
+EXPORT_SYMBOL_GPL(dma_buf_kmap);
+
+/**
+ * dma_buf_kunmap - Unmap a page obtained by dma_buf_kmap.
+ * @dma_buf:	[in]	buffer to unmap page from.
+ * @page_num:	[in]	page in PAGE_SIZE units to unmap.
+ * @vaddr:	[in]	kernel space pointer obtained from dma_buf_kmap.
+ *
+ * This call must always succeed.
+ */
+void dma_buf_kunmap(struct dma_buf *dmabuf, unsigned long page_num,
+		    void *vaddr)
+{
+	WARN_ON(!dmabuf);
+
+	if (dmabuf->ops->kunmap)
+		dmabuf->ops->kunmap(dmabuf, page_num, vaddr);
+}
+EXPORT_SYMBOL_GPL(dma_buf_kunmap);
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 887dcd487062..3efbfc2145c3 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -29,6 +29,7 @@
 #include <linux/scatterlist.h>
 #include <linux/list.h>
 #include <linux/dma-mapping.h>
+#include <linux/fs.h>
 
 struct device;
 struct dma_buf;
@@ -49,6 +50,17 @@ struct dma_buf_attachment;
  * @unmap_dma_buf: decreases usecount of buffer, might deallocate scatter
  *		   pages.
  * @release: release this buffer; to be called after the last dma_buf_put.
+ * @begin_cpu_access: [optional] called before cpu access to invalidate cpu
+ * 		      caches and allocate backing storage (if not yet done),
+ * 		      or to pin the object into memory.
+ * @end_cpu_access: [optional] called after cpu access to flush caches.
+ * @kmap_atomic: maps a page from the buffer into kernel address space;
+ * 		 users may not block between the map and the subsequent
+ * 		 unmap call. This callback must not sleep.
+ * @kunmap_atomic: [optional] unmaps an atomically mapped page from the buffer.
+ * 		   This callback must not sleep.
+ * @kmap: maps a page from the buffer into kernel address space.
+ * @kunmap: [optional] unmaps a page from the buffer.
  */
 struct dma_buf_ops {
 	int (*attach)(struct dma_buf *, struct device *,
@@ -63,7 +75,8 @@ struct dma_buf_ops {
 	struct sg_table * (*map_dma_buf)(struct dma_buf_attachment *,
 						enum dma_data_direction);
 	void (*unmap_dma_buf)(struct dma_buf_attachment *,
-						struct sg_table *);
+						struct sg_table *,
+						enum dma_data_direction);
 	/* TODO: Add try_map_dma_buf version, to return immed with -EBUSY
 	 * if the call would block.
 	 */
@@ -71,6 +84,14 @@ struct dma_buf_ops {
 	/* after final dma_buf_put() */
 	void (*release)(struct dma_buf *);
 
+	int (*begin_cpu_access)(struct dma_buf *, size_t, size_t,
+				enum dma_data_direction);
+	void (*end_cpu_access)(struct dma_buf *, size_t, size_t,
+			       enum dma_data_direction);
+	void *(*kmap_atomic)(struct dma_buf *, unsigned long);
+	void (*kunmap_atomic)(struct dma_buf *, unsigned long, void *);
+	void *(*kmap)(struct dma_buf *, unsigned long);
+	void (*kunmap)(struct dma_buf *, unsigned long, void *);
 };
 
 /**
@@ -86,7 +107,7 @@ struct dma_buf {
 	struct file *file;
 	struct list_head attachments;
 	const struct dma_buf_ops *ops;
-	/* mutex to serialize list manipulation and other ops */
+	/* mutex to serialize list manipulation and attach/detach */
 	struct mutex lock;
 	void *priv;
 };
@@ -109,20 +130,43 @@ struct dma_buf_attachment {
 	void *priv;
 };
 
+/**
+ * get_dma_buf - convenience wrapper for get_file.
+ * @dmabuf:	[in]	pointer to dma_buf
+ *
+ * Increments the reference count on the dma-buf, needed by drivers that
+ * create additional references to the dmabuf on the kernel side.  For
+ * example, an exporter that needs to keep a dmabuf pointer around so that
+ * subsequent exports don't create a new dmabuf.
+ */
+static inline void get_dma_buf(struct dma_buf *dmabuf)
+{
+	get_file(dmabuf->file);
+}
+
 #ifdef CONFIG_DMA_SHARED_BUFFER
 struct dma_buf_attachment *dma_buf_attach(struct dma_buf *dmabuf,
 							struct device *dev);
 void dma_buf_detach(struct dma_buf *dmabuf,
 				struct dma_buf_attachment *dmabuf_attach);
-struct dma_buf *dma_buf_export(void *priv, struct dma_buf_ops *ops,
-			size_t size, int flags);
-int dma_buf_fd(struct dma_buf *dmabuf);
+struct dma_buf *dma_buf_export(void *priv, const struct dma_buf_ops *ops,
+			       size_t size, int flags);
+int dma_buf_fd(struct dma_buf *dmabuf, int flags);
 struct dma_buf *dma_buf_get(int fd);
 void dma_buf_put(struct dma_buf *dmabuf);
 
 struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *,
 					enum dma_data_direction);
-void dma_buf_unmap_attachment(struct dma_buf_attachment *, struct sg_table *);
+void dma_buf_unmap_attachment(struct dma_buf_attachment *, struct sg_table *,
+				enum dma_data_direction);
+int dma_buf_begin_cpu_access(struct dma_buf *dma_buf, size_t start, size_t len,
+			     enum dma_data_direction dir);
+void dma_buf_end_cpu_access(struct dma_buf *dma_buf, size_t start, size_t len,
+			    enum dma_data_direction dir);
+void *dma_buf_kmap_atomic(struct dma_buf *, unsigned long);
+void dma_buf_kunmap_atomic(struct dma_buf *, unsigned long, void *);
+void *dma_buf_kmap(struct dma_buf *, unsigned long);
+void dma_buf_kunmap(struct dma_buf *, unsigned long, void *);
 #else
 
 static inline struct dma_buf_attachment *dma_buf_attach(struct dma_buf *dmabuf,
@@ -138,13 +182,13 @@ static inline void dma_buf_detach(struct dma_buf *dmabuf,
 }
 
 static inline struct dma_buf *dma_buf_export(void *priv,
-						struct dma_buf_ops *ops,
-						size_t size, int flags)
+					     const struct dma_buf_ops *ops,
+					     size_t size, int flags)
 {
 	return ERR_PTR(-ENODEV);
 }
 
-static inline int dma_buf_fd(struct dma_buf *dmabuf)
+static inline int dma_buf_fd(struct dma_buf *dmabuf, int flags)
 {
 	return -ENODEV;
 }
@@ -166,11 +210,44 @@ static inline struct sg_table *dma_buf_map_attachment(
 }
 
 static inline void dma_buf_unmap_attachment(struct dma_buf_attachment *attach,
-						struct sg_table *sg)
+			struct sg_table *sg, enum dma_data_direction dir)
 {
 	return;
 }
 
+static inline int dma_buf_begin_cpu_access(struct dma_buf *dmabuf,
+					   size_t start, size_t len,
+					   enum dma_data_direction dir)
+{
+	return -ENODEV;
+}
+
+static inline void dma_buf_end_cpu_access(struct dma_buf *dmabuf,
+					  size_t start, size_t len,
+					  enum dma_data_direction dir)
+{
+}
+
+static inline void *dma_buf_kmap_atomic(struct dma_buf *dmabuf,
+					unsigned long pnum)
+{
+	return NULL;
+}
+
+static inline void dma_buf_kunmap_atomic(struct dma_buf *dmabuf,
+					 unsigned long pnum, void *vaddr)
+{
+}
+
+static inline void *dma_buf_kmap(struct dma_buf *dmabuf, unsigned long pnum)
+{
+	return NULL;
+}
+
+static inline void dma_buf_kunmap(struct dma_buf *dmabuf,
+				  unsigned long pnum, void *vaddr)
+{
+}
 #endif /* CONFIG_DMA_SHARED_BUFFER */
 
 #endif /* __DMA_BUF_H__ */