/* Copyright (C) 2005,2006  David Decotigny

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
   02111-1307, USA.
*/
#ifndef _SOS_FSPAGECACHE_H_
#define _SOS_FSPAGECACHE_H_

/**
 * @file fs_pagecache.h
 *
 * Simple page cache interface. Used to automate the synchronization
 * between the read/write operations and mmap. A "FS page cache" is
 * simply a set of pages mapping a file in memory. A file may not be
 * entirely mapped into memory: its pages are mapped only if any user
 * thread invoked an mmap and page-faulted inside the mapped region to
 * map these pages into memory. Contrary to some other caches in SOS
 * (eg the block cache), this one is not limited in size. As many
 * pages as needed will be allocated for it, as permitted by the
 * available RAM. With a pageout mechanism, this cache will be
 * shrunk when needed: some of its pages will be transferred back to
 * disk and unmapped.
 *
 * A page cache is used both to cache memory mapped files of an FS,
 * and memory mapped block devices. Hence:
 *  - there is one SINGLE page cache for each block device (proper to
 *    each disk, to each partition)
 *  - there is one SINGLE page cache for each file of a file system
 *
 * For block devices, the page cache automatically synchronizes the
 * pages with the block cache as long as these mapped pages are
 * accessed through the read/write API. However, NO automatic
 * and accurate synchronization between the in-memory modified
 * pages (accessed through the MMU) and the block cache is provided
 * because we have no way to collect the accurate list of pages
 * modified through MMU write accesses (this would require either to
 * catch all the MMU write operations [too inefficient !], or to have
 * a reverse-mapping system in order to look at the dirty bit of all
 * the mappings). Hence, to enforce blkcache/pagecache
 * synchronization, the msync/munmap API must be used manually. Thus,
 * the page cache is accurately synchronized with the block cache:
 *  - automatically: with the read/write/sync operations
 *  - manually: with the msync and munmap (and of course: exit) operations
 *
 * Nevertheless, from the viewpoint of "blkdev.c", the
 * blockdev_read/write operations are always in sync with the MMU
 * because the pagecache is accessed prior to the blkcache: any
 * divergence between the pagecache and the blkcache is hence
 * hidden. But keep in mind that if you want the disk to
 * accurately reflect the contents of the mapped pages, you have to
 * eventually call msync, munmap, or to destroy the address space (ie
 * exit the process).
 *
 * A side effect: if you map /dev/hda and /dev/hda1, both mappings
 * will be inconsistent and may also be inconsistent with read/write
 * accesses. This is because the partitions have their own page cache
 * while they share the block cache with the disk device. A solution
 * would be to share the page cache between the disk device and all
 * its partitions.
But, due to the fact that partitions are not * necessarily page-aligned in the disk, this would impose some pages * to not correspond to a page-aligned offset inside a partition, * requiring either to have an odd semantic of the mmap syscall (the * allowed device "offset" would depend on the disk partitioning) if * we want to share the mapped pages between the cache and userspace, * or to allocate other pages for the required userspace mappings and * keep them in sync with the page cache pages. Both solutions seem * ugly to me, and not worth implementing since the page cache is * aimed at being generic enough to be used for file mappings: files * don't have sub-files (as do disk devices that have partitions). So * solving the problem is non pertinent for files. And who will ever * need /dev/hda mappings to be consistent with those of /dev/hda1 ?... */ #include #include /** Opaque structure holding a page cache */ struct sos_fs_pagecache; /** Opaque structure holding a page of the cache */ struct sos_fs_pagecache_entry; sos_ret_t sos_fs_pagecache_subsystem_setup(void); /** * Function called to flush the dirty pages to backing store */ typedef sos_ret_t (*sos_fs_pagecache_sync_function_t)(sos_luoffset_t offset, sos_vaddr_t dirty_page, void * custom_data); /** * Create a new pagecache. * * @param sync_fct, the function used to flush the dirty pages to * backing store. may be NULL */ struct sos_fs_pagecache * sos_fs_pagecache_new_cache(sos_fs_pagecache_sync_function_t sync_fct, void * sync_fct_custom_data); /** * Delete the page cache. * * The page cache is expected to be already flushed to backing store */ sos_ret_t sos_fs_pagecache_delete_cache(struct sos_fs_pagecache * pc); /** * Read from the given offset from the cache, if present. 
* @return ENOENT when no page for the given offset is mapped, return * EFAULT when the contents could not be completely copied to * destination buffer */ sos_ret_t sos_fs_pagecache_read(struct sos_fs_pagecache * pc, sos_luoffset_t offset, sos_genaddr_t dest_buf, sos_size_t * /* in/out */len); /** * Write at the given offset from the cache, if present * @return ENOENT when no page for the given offset is mapped, return * EFAULT when the contents could not be completely copied from * source buffer */ sos_ret_t sos_fs_pagecache_write(struct sos_fs_pagecache * pc, sos_luoffset_t offset, sos_genaddr_t src_buf, sos_size_t * /* in/out */len, sos_bool_t synchronous_write); /** * Function reserved to blkdev.c and FS code: used by the msync * callback to mark a pagecache page dirty * * @param sync_backing_store When TRUE, then the page must be flushed * to backing store. */ sos_ret_t sos_fs_pagecache_set_dirty(struct sos_fs_pagecache * pc, sos_luoffset_t offset, sos_bool_t sync_backing_store); /** * Prepare a page to be mapped: get a NEW reference to the page * (kernel address) of the page to be mapped, which is also locked in * order to be used. If the page is not yet present in the cache, * allocate it and prepare it to be filled * * @param offset MUST be page-aligned * @param newly_allocated TRUE when the page was not already mapped by * someone: the contents of the page is then IRRELEVANT * * @return NULL on error * * @note The page is also LOCKED, use unlock to unlock it before * unreferencing it */ struct sos_fs_pagecache_entry * sos_fs_pagecache_ref_page(struct sos_fs_pagecache * pc, sos_luoffset_t offset, sos_vaddr_t * /* out */ kernel_vaddr, sos_bool_t * /* out */ newly_allocated); /** Called by the blkdev.c and FS page_in callback to unlock the entry after it has been initialized. 
*/ sos_ret_t sos_fs_pagecache_unlock_page(struct sos_fs_pagecache * pc, struct sos_fs_pagecache_entry * entry, sos_bool_t initial_fill_aborted); /** * Called when the page is unmapped from a user process space * @param offset MUST be page-aligned * * @note the page is expected to be present in the cache * @note the entry is expected NOT to be locked ! */ sos_ret_t sos_fs_pagecache_unref_page(struct sos_fs_pagecache * pc, sos_luoffset_t offset); /** Call the sync function on each dirty page */ sos_ret_t sos_fs_pagecache_sync(struct sos_fs_pagecache * pc); #endif