sos-code-article10/sos/fs_pagecache.c

/* Copyright (C) 2005,2006 David Decotigny
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
USA.
*/
#include <sos/ksynch.h>
#include <sos/kmem_slab.h>
#include <sos/hash.h>
#include <sos/physmem.h> /* For SOS_PAGE_MASK */
#include <sos/list.h>
#include <sos/assert.h>
#include <sos/uaccess.h>
#include <sos/kmalloc.h>
#include "fs_pagecache.h"
#define SOS_OFFSET64_PAGE_ALIGN_INF(offs64) \
( ((sos_luoffset_t)(offs64)) & (~ ((sos_luoffset_t)(SOS_PAGE_MASK))) )
#define SOS_OFFSET64_IS_PAGE_ALIGNED(offs64) \
( ( ((sos_luoffset_t)(offs64)) & (((sos_luoffset_t)(SOS_PAGE_MASK))) ) == 0 )
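/*
 * Example (illustrative only, assuming SOS_PAGE_SIZE == 4096, hence
 * SOS_PAGE_MASK == 0xfff):
 *   SOS_OFFSET64_PAGE_ALIGN_INF(0x12345)  yields 0x12000
 *   SOS_OFFSET64_IS_PAGE_ALIGNED(0x12000) is true (page-aligned offset)
 *   SOS_OFFSET64_IS_PAGE_ALIGNED(0x12345) is false
 */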
/**
* Definition of an object holding a reference to a shared mapping of
* a file/device-mapped cache page.
*
* @note This structure is huge. We could shrink it considerably by removing the
* "name" field from the lock structure (32 bytes).
*/
struct sos_fs_pagecache_entry
{
/** offset of the cached page in the file or device */
sos_luoffset_t file_offset;
/** Address of the cached page for this offset */
sos_vaddr_t kernel_vaddr;
struct sos_kmutex lock; /**< Lock protecting the entry's contents */
sos_count_t ref_cnt; /**< Number of users of this entry */
sos_bool_t initial_fill_aborted; /**< True when the page could not
be correctly filled */
/**
 * When 0: the page is clean wrt the read/write syscalls, ie the disk
 * contents reflect the contents of the page since the last
 * read/write operation. However, the disk may NOT be in sync wrt
 * mmap() operations: if mmap() operations occurred in the meantime,
 * the disk may NOT be up to date, and the pagecache entry may even
 * NOT be considered dirty. This is because we do not trace each of
 * the read/write MMU operations of every process (this would
 * require catching all writes, even on read/write mapped pages), and
 * we don't have a reverse mapping available to set the page
 * read-only in every mapping once it has been synched to disk
 * (which would be needed to effectively trace the dirty state
 * relative to mmap operations).
 *
 * When ">0": at least one process changed the contents of the page
 * through read/write syscalls since the last sync operation.
 *
 * @note A boolean would be enough for 99% of the code. But we need a
 * real counter for the sos_fs_pagecache_sync operation, to make sure
 * we don't iterate twice over the same page.
 */
sos_lcount_t rw_dirty_order;
#define ENTRY_IS_RW_DIRTY(e) ((e)->rw_dirty_order > 0)
/** Linkage structure to keep the cache entry in the hash map */
struct sos_hash_linkage hlink;
/** Links to insert the entry into the rw_sync/rw_dirty lists */
struct sos_fs_pagecache_entry *prev, *next;
};
struct sos_fs_pagecache
{
/** The operation used to synchronize the mapped pages with the
backing store */
sos_fs_pagecache_sync_function_t sync_fct;
void * sync_fct_custom_data;
/** The dictionary offset -> pagecache_entry */
struct sos_hash_table * lookup_table;
/* Lists to look into in order to free a node */
struct sos_fs_pagecache_entry * rw_sync_list; /**< Pages in sync
with disk wrt
read/write API
(LRU at end) */
struct sos_fs_pagecache_entry * rw_dirty_list; /**< Dirty pages wrt
read/write API
(LRU last) */
/** The "timestamp" high watermark used to iterate over the dirty
pages in the sync function */
sos_lcount_t top_rw_dirty_order;
};
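/*
 * Illustrative example of the rw_dirty_order bookkeeping: if pages A,
 * B then C are modified through the read/write API, the rw_dirty_list
 * holds C -> B -> A (head to tail), with strictly increasing
 * rw_dirty_order values taken from top_rw_dirty_order.
 * sos_fs_pagecache_sync() walks the list tail-first, remembers the
 * highest order it has already flushed, and ignores any order above
 * the watermark sampled when the pass started, so that a page
 * re-dirtied while the sync blocks is left for the next pass.
 */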
/** The slab cache for pagecache */
static struct sos_kslab_cache * cache_of_pagecache;
/** The slab cache for pagecache entries */
static struct sos_kslab_cache * cache_of_pagecache_entry;
sos_ret_t sos_fs_pagecache_subsystem_setup()
{
/* Allocate the slab caches for the page caches and page cache
entries */
cache_of_pagecache =
sos_kmem_cache_create("pagecache", sizeof(struct sos_fs_pagecache),
2, 0, SOS_KSLAB_CREATE_MAP | SOS_KSLAB_CREATE_ZERO);
if (NULL == cache_of_pagecache)
return -SOS_ENOMEM;
cache_of_pagecache_entry =
sos_kmem_cache_create("pagecache_entry",
sizeof(struct sos_fs_pagecache_entry),
2, 0, SOS_KSLAB_CREATE_MAP | SOS_KSLAB_CREATE_ZERO);
if (NULL == cache_of_pagecache_entry)
{
sos_kmem_cache_destroy(cache_of_pagecache);
return -SOS_ENOMEM;
}
return SOS_OK;
}
struct sos_fs_pagecache *
sos_fs_pagecache_new_cache(sos_fs_pagecache_sync_function_t sync_fct,
void * sync_fct_custom_data)
{
struct sos_fs_pagecache * pagecache
= (struct sos_fs_pagecache*) sos_kmem_cache_alloc(cache_of_pagecache,
0);
if (NULL == pagecache)
return NULL;
pagecache->lookup_table = sos_hash_create("pagecache",
struct sos_fs_pagecache_entry,
sos_hash_ui64,
sos_hash_key_eq_ui64,
127, file_offset, hlink);
if (NULL == pagecache->lookup_table)
{
sos_kmem_cache_free((sos_vaddr_t) pagecache);
return NULL;
}
pagecache->sync_fct = sync_fct;
pagecache->sync_fct_custom_data = sync_fct_custom_data;
pagecache->top_rw_dirty_order = 0x24; /* Arbitrary initial value for the dirty "timestamps" */
return pagecache;
}
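/*
 * Illustrative sketch (hypothetical usage example, kept under "#if 0"
 * so that it does not affect the build): how a driver could create
 * its page cache. my_device, my_device_write and my_device_init_cache
 * are made-up names; only the callback signature (file offset, kernel
 * vaddr, custom data) is dictated by the way sync_fct is called in
 * this file.
 */
#if 0
static sos_ret_t my_device_sync_page(sos_luoffset_t offset,
                                     sos_vaddr_t page_vaddr,
                                     void * custom_data)
{
  struct my_device * dev = (struct my_device*) custom_data;
  /* Push the SOS_PAGE_SIZE bytes at page_vaddr back to the backing
     store, at byte offset 'offset' */
  return my_device_write(dev, offset, page_vaddr, SOS_PAGE_SIZE);
}

static sos_ret_t my_device_init_cache(struct my_device * dev)
{
  dev->pagecache = sos_fs_pagecache_new_cache(my_device_sync_page, dev);
  return (NULL != dev->pagecache)? SOS_OK : -SOS_ENOMEM;
}
#endif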
sos_ret_t
sos_fs_pagecache_delete_cache(struct sos_fs_pagecache * pc)
{
/* The cache is EXPECTED to be empty ! */
if (!list_is_empty(pc->rw_dirty_list))
SOS_FATAL_ERROR("Non empty dirty list");
if (!list_is_empty(pc->rw_sync_list))
SOS_FATAL_ERROR("Non empty sync list");
sos_hash_dispose(pc->lookup_table);
return sos_kmem_cache_free((sos_vaddr_t)pc);
}
/** Helper function to flush a page to disk. Expects the entry to be
locked */
static sos_ret_t pagecache_sync_page(struct sos_fs_pagecache * pc,
struct sos_fs_pagecache_entry * entry)
{
sos_ret_t retval;
if (! ENTRY_IS_RW_DIRTY(entry))
return SOS_OK;
/* Now do the real transfer to backing store */
retval = pc->sync_fct(entry->file_offset, entry->kernel_vaddr,
pc->sync_fct_custom_data);
if (SOS_OK != retval)
return retval;
/* Transfer page to the sync list */
list_delete(pc->rw_dirty_list, entry);
entry->rw_dirty_order = 0;
list_add_head(pc->rw_sync_list, entry);
return SOS_OK;
}
/** Helper function to correctly lock an entry */
static sos_ret_t pagecache_use(struct sos_fs_pagecache * pc,
struct sos_fs_pagecache_entry * entry)
{
entry->ref_cnt ++;
return sos_kmutex_lock(& entry->lock, NULL);
}
/**
* Helper function to transfer a page to the dirty r/w list
*/
static sos_ret_t pagecache_set_rw_dirty(struct sos_fs_pagecache * pc,
struct sos_fs_pagecache_entry * entry)
{
if (ENTRY_IS_RW_DIRTY(entry))
return SOS_OK; /* Nothing to do */
list_delete(pc->rw_sync_list, entry);
entry->rw_dirty_order = ++ pc->top_rw_dirty_order;
list_add_head(pc->rw_dirty_list, entry);
return SOS_OK;
}
/** Helper function to correctly unlock an entry, flushing it to disk
if needed */
static sos_ret_t pagecache_release(struct sos_fs_pagecache * pc,
struct sos_fs_pagecache_entry * entry)
{
if (entry->ref_cnt > 1)
{
entry->ref_cnt --;
sos_kmutex_unlock(& entry->lock);
return SOS_OK;
}
/*
 * The cached page is now referenced ONLY by US: we can try to
 * remove it from the cache
 */
/* Flush any change to disk, but only if we are sure that its
contents are legal, ie that the page_in callback succeeded in
filling it */
if (! entry->initial_fill_aborted)
pagecache_sync_page(pc, entry);
/* Ok, now WE are not interested in this entry anymore */
entry->ref_cnt --;
/* While we were blocked, another thread may have asked for the
entry. In that case, stop here */
if (entry->ref_cnt > 0)
{
sos_kmutex_unlock(& entry->lock);
return SOS_OK;
}
/* Remove it from the lists */
sos_hash_remove(pc->lookup_table, entry);
if (ENTRY_IS_RW_DIRTY(entry))
list_delete(pc->rw_dirty_list, entry);
else
list_delete(pc->rw_sync_list, entry);
/* We can safely erase it now ! */
sos_kmutex_unlock(& entry->lock);
SOS_ASSERT_FATAL(SOS_OK == sos_kmutex_dispose(& entry->lock)); /* No threads are waiting */
sos_kfree(entry->kernel_vaddr);
sos_kmem_cache_free((sos_vaddr_t)entry);
return SOS_OK;
}
/**
* Helper function to look up an entry from the cache and lock it. If
* the entry does not exist (yet), return NULL.
*/
static struct sos_fs_pagecache_entry *
pagecache_lookup_and_lock(struct sos_fs_pagecache * pc,
sos_luoffset_t offset)
{
sos_luoffset_t pgoffs = SOS_OFFSET64_PAGE_ALIGN_INF(offset);
struct sos_fs_pagecache_entry * entry = NULL;
while (TRUE)
{
entry
= (struct sos_fs_pagecache_entry*) sos_hash_lookup(pc->lookup_table,
& pgoffs);
if (! entry)
break;
/* Lock it now */
SOS_ASSERT_FATAL(SOS_OK == pagecache_use(pc, entry));
/*
* Entry is now locked
*/
/* Make sure it contains legal contents: if we were blocked while
the page_in operations were reading it from disk, an error could
have occurred. In that case, we must consider that this entry is
not yet inserted in the cache */
if (entry->initial_fill_aborted)
{
pagecache_release(pc, entry);
continue;
}
/* Ok, we have the entry and it is correctly initialized ! */
break;
}
return entry;
}
sos_ret_t
sos_fs_pagecache_read(struct sos_fs_pagecache * pc,
sos_luoffset_t offset,
sos_genaddr_t dest_buf,
sos_size_t * /* in/out */len)
{
sos_ret_t retval;
sos_luoffset_t pgoffs = SOS_OFFSET64_PAGE_ALIGN_INF(offset);
sos_luoffset_t endpos = offset + *len;
struct sos_fs_pagecache_entry * entry;
entry = pagecache_lookup_and_lock(pc, pgoffs);
if (NULL == entry)
return -SOS_ENOENT;
/* Great ! Found the entry in the cache ! */
/* Read only up to the end of the page */
if (endpos - pgoffs > SOS_PAGE_SIZE)
endpos = pgoffs + SOS_PAGE_SIZE;
/* Copy page contents to destination buffer */
retval = sos_memcpy_generic_to(dest_buf,
entry->kernel_vaddr + (offset - pgoffs),
endpos - offset);
pagecache_release(pc, entry);
if (retval < 0)
{
*len = 0;
return retval;
}
*len = retval;
if ((sos_luoffset_t)retval != endpos - offset)
return -SOS_EFAULT;
return SOS_OK;
}
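/*
 * Illustrative sketch (hypothetical usage example, kept under
 * "#if 0"): how a caller could use sos_fs_pagecache_read, loading the
 * missing page on -SOS_ENOENT before retrying. my_read and
 * my_fill_page_in_cache are hypothetical names; the latter stands for
 * a routine built upon sos_fs_pagecache_ref_page /
 * sos_fs_pagecache_unlock_page (see further below).
 */
#if 0
static sos_ret_t my_read(struct sos_fs_pagecache * pc,
                         sos_luoffset_t offset,
                         sos_genaddr_t dest_buf,
                         sos_size_t * /* in/out */ len)
{
  sos_ret_t ret = sos_fs_pagecache_read(pc, offset, dest_buf, len);
  if (-SOS_ENOENT != ret)
    return ret;

  /* The page is not in the cache yet: load it, then retry */
  ret = my_fill_page_in_cache(pc, SOS_OFFSET64_PAGE_ALIGN_INF(offset));
  if (SOS_OK != ret)
    return ret;
  return sos_fs_pagecache_read(pc, offset, dest_buf, len);
}
#endif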
sos_ret_t
sos_fs_pagecache_write(struct sos_fs_pagecache * pc,
sos_luoffset_t offset,
sos_genaddr_t src_buf,
sos_size_t * /* in/out */len,
sos_bool_t synchronous_write)
{
sos_ret_t retval;
sos_luoffset_t pgoffs = SOS_OFFSET64_PAGE_ALIGN_INF(offset);
sos_luoffset_t endpos = offset + *len;
struct sos_fs_pagecache_entry * entry;
entry = pagecache_lookup_and_lock(pc, pgoffs);
if (NULL == entry)
return -SOS_ENOENT;
/* Great ! Found the entry in the cache ! */
/* Write only up to the end of the page */
if (endpos - pgoffs > SOS_PAGE_SIZE)
endpos = pgoffs + SOS_PAGE_SIZE;
/* Copy the source buffer into the cached page */
retval = sos_memcpy_generic_from(entry->kernel_vaddr + (offset - pgoffs),
src_buf,
endpos - offset);
/* Transfer the entry to the dirty list if needed */
if (retval >= 0)
pagecache_set_rw_dirty(pc, entry);
if (retval < 0)
{
*len = 0;
pagecache_release(pc, entry);
return retval;
}
*len = retval;
if ((sos_luoffset_t)retval != endpos - offset)
retval = -SOS_EFAULT;
else
retval = SOS_OK;
/* Flush to disk if needed */
if (synchronous_write)
{
sos_ret_t ret = pagecache_sync_page(pc, entry);
if (SOS_OK == retval)
retval = ret;
}
pagecache_release(pc, entry);
return retval;
}
sos_ret_t sos_fs_pagecache_set_dirty(struct sos_fs_pagecache * pc,
sos_luoffset_t offset,
sos_bool_t sync_backing_store)
{
sos_luoffset_t pgoffs = SOS_OFFSET64_PAGE_ALIGN_INF(offset);
struct sos_fs_pagecache_entry * entry;
entry = pagecache_lookup_and_lock(pc, pgoffs);
if (NULL == entry)
return -SOS_ENOENT;
/* Great ! Found the entry in the cache ! */
pagecache_set_rw_dirty(pc, entry);
/* Synchronize to backing store if needed */
if (sync_backing_store)
pagecache_sync_page(pc, entry);
pagecache_release(pc, entry);
return SOS_OK;
}
struct sos_fs_pagecache_entry *
sos_fs_pagecache_ref_page(struct sos_fs_pagecache * pc,
sos_luoffset_t offset,
sos_vaddr_t * /* out */ kernel_vaddr,
sos_bool_t * /* out */ newly_allocated)
{
sos_luoffset_t pgoffs = SOS_OFFSET64_PAGE_ALIGN_INF(offset);
struct sos_fs_pagecache_entry * entry;
/* The offset is expected to be page-aligned */
if (pgoffs != offset)
return NULL;
entry = pagecache_lookup_and_lock(pc, pgoffs);
if (NULL != entry)
{
/* Found it ! No need to go further */
*newly_allocated = FALSE;
*kernel_vaddr = entry->kernel_vaddr;
return entry;
}
/*
* Need to allocate a new kernel page
*/
entry = (struct sos_fs_pagecache_entry*)
sos_kmem_cache_alloc(cache_of_pagecache_entry, 0);
if (NULL == entry)
return NULL;
if (SOS_OK != sos_kmutex_init(& entry->lock, "pagecache_entry",
SOS_KWQ_ORDER_FIFO))
{
sos_kmem_cache_free((sos_vaddr_t)entry);
return NULL;
}
/* The initial state of the page corresponds to an aborted fill:
the entry only becomes valid once it has been correctly filled */
entry->file_offset = pgoffs;
entry->initial_fill_aborted = TRUE;
entry->ref_cnt = 1;
/* Allocate the page */
entry->kernel_vaddr = sos_kmalloc(SOS_PAGE_SIZE, 0);
if (((sos_vaddr_t)NULL) == entry->kernel_vaddr)
{
sos_kmutex_dispose(& entry->lock);
sos_kmem_cache_free((sos_vaddr_t)entry);
return NULL;
}
/* Own the mutex */
SOS_ASSERT_FATAL(SOS_OK == sos_kmutex_lock(& entry->lock, NULL));
/* Try to insert it into the hash table. This might fail if the page
was inserted in the meantime, which is possible because the
allocation routines above might have blocked */
if (SOS_OK != sos_hash_insert(pc->lookup_table, entry))
{
/* An entry was inserted while we were allocating: discard our new entry */
sos_kmutex_unlock(& entry->lock);
sos_kmutex_dispose(& entry->lock);
sos_kfree(entry->kernel_vaddr);
sos_kmem_cache_free((sos_vaddr_t)entry);
/* Get the real entry */
entry = pagecache_lookup_and_lock(pc, offset);
SOS_ASSERT_FATAL(NULL != entry);
*kernel_vaddr = entry->kernel_vaddr;
*newly_allocated = FALSE;
return entry;
}
/* Now register the entry in the sync list */
entry->rw_dirty_order = 0;
list_add_head(pc->rw_sync_list, entry);
*newly_allocated = TRUE;
*kernel_vaddr = entry->kernel_vaddr;
return entry;
}
sos_ret_t
sos_fs_pagecache_unlock_page(struct sos_fs_pagecache * pc,
struct sos_fs_pagecache_entry * entry,
sos_bool_t initial_fill_aborted)
{
entry->initial_fill_aborted = initial_fill_aborted;
if (initial_fill_aborted)
return pagecache_release(pc, entry);
return sos_kmutex_unlock(& entry->lock);
}
sos_ret_t
sos_fs_pagecache_unref_page(struct sos_fs_pagecache * pc,
sos_luoffset_t offset)
{
sos_luoffset_t pgoffs = SOS_OFFSET64_PAGE_ALIGN_INF(offset);
struct sos_fs_pagecache_entry * entry;
/* The offset is expected to be page-aligned */
if (pgoffs != offset)
return -SOS_EINVAL;
entry
= (struct sos_fs_pagecache_entry*) sos_hash_lookup(pc->lookup_table,
& pgoffs);
SOS_ASSERT_FATAL(NULL != entry);
SOS_ASSERT_FATAL(SOS_OK == sos_kmutex_lock(& entry->lock, NULL));
return pagecache_release(pc, entry);
}
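/*
 * Illustrative sketch (hypothetical usage example, kept under
 * "#if 0"): typical life cycle of a cached page backing an mmap'ed
 * file/device. my_mmap_page_in and my_device_read_page are
 * hypothetical names.
 */
#if 0
static sos_ret_t my_mmap_page_in(struct sos_fs_pagecache * pc,
                                 sos_luoffset_t pg_offset,
                                 sos_vaddr_t * /* out */ kernel_vaddr)
{
  sos_bool_t newly_allocated;
  sos_ret_t ret = SOS_OK;
  struct sos_fs_pagecache_entry * entry
    = sos_fs_pagecache_ref_page(pc, pg_offset, kernel_vaddr,
                                & newly_allocated);
  if (NULL == entry)
    return -SOS_ENOMEM;

  /* Fill the page from the backing store only the first time it
     enters the cache */
  if (newly_allocated)
    ret = my_device_read_page(pg_offset, *kernel_vaddr);

  /* Unlock the entry; signalling an aborted fill removes it from the
     cache and drops our reference */
  sos_fs_pagecache_unlock_page(pc, entry, (SOS_OK != ret));

  /* On success, the reference taken above remains held as long as the
     page is mapped; the unmap path is expected to call
     sos_fs_pagecache_unref_page(pc, pg_offset) to release it */
  return ret;
}
#endif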
sos_ret_t
sos_fs_pagecache_sync(struct sos_fs_pagecache * pc)
{
sos_ret_t retval = SOS_OK;
int dummy = 0; /* Only needed to have a statement after the lookup_next_ent label */
sos_lcount_t rw_dirty_order = 0;
/** High watermark: pages that become dirty after this point will
not be taken into account by this pass */
sos_lcount_t top_rw_dirty_order = pc->top_rw_dirty_order;
if (list_is_empty(pc->rw_dirty_list))
return SOS_OK;
/* This scan is exhaustive and resilient to the addition/removal of
entries: we walk the dirty list from its tail (lowest, ie oldest,
rw_dirty_order) towards its head and use rw_dirty_order as a
cursor, so that the walk can safely be restarted whenever the list
changes while we block */
while (TRUE)
{
struct sos_fs_pagecache_entry * entry = NULL;
int ndirty;
/* As long as we don't block, we can safely access the
prev/next fields of the page descriptor */
list_foreach_backward(pc->rw_dirty_list, entry, ndirty)
{
sos_ret_t ret = SOS_OK;
struct sos_fs_pagecache_entry * prev_entry = NULL;
/* Reached the initial high watermark ? Don't take the
additional pages into account */
if (entry->rw_dirty_order > top_rw_dirty_order)
break;
if (entry->rw_dirty_order <= rw_dirty_order)
continue;
rw_dirty_order = entry->rw_dirty_order;
prev_entry = entry->prev;
SOS_ASSERT_FATAL(SOS_OK == pagecache_use(pc, entry));
if (! entry->initial_fill_aborted)
ret = pagecache_sync_page(pc, entry);
if (SOS_OK != ret)
retval = ret;
pagecache_release(pc, entry);
/* We must NOT go on with the inner loop, because the prev/next
page cache entries might have been removed or added while
pagecache_sync_page()/pagecache_release() blocked */
if (prev_entry != entry->prev)
goto lookup_next_ent;
}
/* Reached the end of the list */
break;
lookup_next_ent:
/* Loop over */
dummy ++;
}
return retval;
}
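/*
 * Illustrative sketch (hypothetical usage example, kept under
 * "#if 0"): flushing a cache before tearing it down, eg on unmount.
 * my_flush_and_delete_cache is a made-up name. Note that only the
 * pages dirtied through the read/write API are flushed; pages
 * modified solely through mmap'ed mappings are not tracked here (see
 * the rw_dirty_order comment above).
 */
#if 0
static sos_ret_t my_flush_and_delete_cache(struct sos_fs_pagecache * pc)
{
  /* Push the pages dirtied through the read/write API back to the
     backing store */
  sos_ret_t ret = sos_fs_pagecache_sync(pc);
  if (SOS_OK != ret)
    return ret;

  /* The cache may be deleted only once every page has been
     unreferenced, ie once both internal lists are empty */
  return sos_fs_pagecache_delete_cache(pc);
}
#endif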