sos-code-article10/sos/fs_pagecache.c

/* Copyright (C) 2005,2006 David Decotigny
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
USA.
*/
#include <sos/ksynch.h>
#include <sos/kmem_slab.h>
#include <sos/hash.h>
#include <sos/physmem.h> /* For SOS_PAGE_MASK */
#include <sos/list.h>
#include <sos/assert.h>
#include <sos/uaccess.h>
#include <sos/kmalloc.h>
#include "fs_pagecache.h"
#define SOS_OFFSET64_PAGE_ALIGN_INF(offs64) \
( ((sos_luoffset_t)(offs64)) & (~ ((sos_luoffset_t)(SOS_PAGE_MASK))) )
#define SOS_OFFSET64_IS_PAGE_ALIGNED(offs64) \
( ( ((sos_luoffset_t)(offs64)) & (((sos_luoffset_t)(SOS_PAGE_MASK))) ) == 0 )
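/*
 * Example (illustrative only, assuming SOS_PAGE_SIZE == 4096, hence
 * SOS_PAGE_MASK == 0xfff):
 *   SOS_OFFSET64_PAGE_ALIGN_INF(0x12345)  yields 0x12000
 *   SOS_OFFSET64_IS_PAGE_ALIGNED(0x12000) is true (page-aligned offset)
 *   SOS_OFFSET64_IS_PAGE_ALIGNED(0x12345) is false
 */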
/**
* Definition of an object holding a reference to a shared mapping of
* a file/device-mapped cache page.
*
* @note This structure is huge. We could shrink it considerably by removing the
* "name" field from the lock structure (32 bytes).
*/
struct sos_fs_pagecache_entry
{
/** offset of the cached page in the file or device */
sos_luoffset_t file_offset;
/** Address of the cached page for this offset */
sos_vaddr_t kernel_vaddr;
struct sos_kmutex lock; /**< Lock protecting the entry's contents */
sos_count_t ref_cnt; /**< Number of users of this entry */
sos_bool_t initial_fill_aborted; /**< True when the page could not
be correctly filled */
/**
 * When 0: the page is clean wrt the read/write syscalls, ie the disk
 * contents reflect the contents of the page since the last
 * read/write operation. However, the disk may NOT be in sync wrt
 * mmap() operations: if mmap() operations occurred in the meantime,
 * the disk may NOT be up to date, and the pagecache entry may even
 * NOT be considered dirty. This is because we do not trace each of
 * the read/write MMU operations of every process (this would
 * require catching all writes, even on read/write mapped pages), and
 * we don't have a reverse mapping available to set the page
 * read-only in every mapping once it has been synched to disk
 * (which would be needed to effectively trace the dirty state
 * relative to mmap operations).
 *
 * When ">0": at least one process changed the contents of the page
 * through read/write syscalls since the last sync operation.
 *
 * @note A boolean would be enough for 99% of the code. But we need a
 * real counter for the sos_fs_pagecache_sync operation, to make sure
 * we don't iterate twice over the same page.
 */
sos_lcount_t rw_dirty_order;
#define ENTRY_IS_RW_DIRTY(e) ((e)->rw_dirty_order > 0)
/** Linkage structure to keep the cache entry in the hash map */
struct sos_hash_linkage hlink;
/** Links to insert the entry into the rw_sync/rw_dirty lists */
struct sos_fs_pagecache_entry *prev, *next;
};
struct sos_fs_pagecache
{
/** The operation used to synchronize the mapped pages with the
backing store */
sos_fs_pagecache_sync_function_t sync_fct;
void * sync_fct_custom_data;
/** The dictionary offset -> pagecache_entry */
struct sos_hash_table * lookup_table;
/* Lists to look into in order to free a node */
struct sos_fs_pagecache_entry * rw_sync_list; /**< Pages in sync
with disk wrt
read/write API
(LRU at end) */
struct sos_fs_pagecache_entry * rw_dirty_list; /**< Dirty pages wrt
read/write API
(LRU last) */
/** The "timestamp" high watermark used to iterate over the dirty
pages in the sync function */
sos_lcount_t top_rw_dirty_order;
};
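/*
 * Illustrative example of the rw_dirty_order bookkeeping: if pages A,
 * B then C are modified through the read/write API, the rw_dirty_list
 * holds C -> B -> A (head to tail), with strictly increasing
 * rw_dirty_order values taken from top_rw_dirty_order.
 * sos_fs_pagecache_sync() walks the list tail-first, remembers the
 * highest order it has already flushed, and ignores any order above
 * the watermark sampled when the pass started, so that a page
 * re-dirtied while the sync blocks is left for the next pass.
 */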
/** The slab cache for pagecache */
static struct sos_kslab_cache * cache_of_pagecache;
/** The slab cache for pagecache entries */
static struct sos_kslab_cache * cache_of_pagecache_entry;
sos_ret_t sos_fs_pagecache_subsystem_setup()
{
/* Allocate the slab caches for the page caches and page cache
entries */
cache_of_pagecache =
sos_kmem_cache_create("pagecache", sizeof(struct sos_fs_pagecache),
2, 0, SOS_KSLAB_CREATE_MAP | SOS_KSLAB_CREATE_ZERO);
if (NULL == cache_of_pagecache)
return -SOS_ENOMEM;
cache_of_pagecache_entry =
sos_kmem_cache_create("pagecache_entry",
sizeof(struct sos_fs_pagecache_entry),
2, 0, SOS_KSLAB_CREATE_MAP | SOS_KSLAB_CREATE_ZERO);
if (NULL == cache_of_pagecache_entry)
{
sos_kmem_cache_destroy(cache_of_pagecache);
return -SOS_ENOMEM;
}
return SOS_OK;
}
struct sos_fs_pagecache *
sos_fs_pagecache_new_cache(sos_fs_pagecache_sync_function_t sync_fct,
void * sync_fct_custom_data)
{
struct sos_fs_pagecache * pagecache
= (struct sos_fs_pagecache*) sos_kmem_cache_alloc(cache_of_pagecache,
0);
if (NULL == pagecache)
return NULL;
pagecache->lookup_table = sos_hash_create("pagecache",
struct sos_fs_pagecache_entry,
sos_hash_ui64,
sos_hash_key_eq_ui64,
127, file_offset, hlink);
if (NULL == pagecache->lookup_table)
{
sos_kmem_cache_free((sos_vaddr_t) pagecache);
return NULL;
}
pagecache->sync_fct = sync_fct;
pagecache->sync_fct_custom_data = sync_fct_custom_data;
pagecache->top_rw_dirty_order = 0x24; /* Arbitrary initial value for the dirty "timestamps" */
return pagecache;
}
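/*
 * Illustrative sketch (hypothetical usage example, kept under "#if 0"
 * so that it does not affect the build): how a driver could create
 * its page cache. my_device, my_device_write and my_device_init_cache
 * are made-up names; only the callback signature (file offset, kernel
 * vaddr, custom data) is dictated by the way sync_fct is called in
 * this file.
 */
#if 0
static sos_ret_t my_device_sync_page(sos_luoffset_t offset,
                                     sos_vaddr_t page_vaddr,
                                     void * custom_data)
{
  struct my_device * dev = (struct my_device*) custom_data;
  /* Push the SOS_PAGE_SIZE bytes at page_vaddr back to the backing
     store, at byte offset 'offset' */
  return my_device_write(dev, offset, page_vaddr, SOS_PAGE_SIZE);
}

static sos_ret_t my_device_init_cache(struct my_device * dev)
{
  dev->pagecache = sos_fs_pagecache_new_cache(my_device_sync_page, dev);
  return (NULL != dev->pagecache)? SOS_OK : -SOS_ENOMEM;
}
#endif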
sos_ret_t
sos_fs_pagecache_delete_cache(struct sos_fs_pagecache * pc)
{
/* The cache is EXPECTED to be empty ! */
if (!list_is_empty(pc->rw_dirty_list))
SOS_FATAL_ERROR("Non empty dirty list");
if (!list_is_empty(pc->rw_sync_list))
SOS_FATAL_ERROR("Non empty sync list");
sos_hash_dispose(pc->lookup_table);
return sos_kmem_cache_free((sos_vaddr_t)pc);
}
/** Helper function to flush a page to disk. Expects the entry to be
locked */
static sos_ret_t pagecache_sync_page(struct sos_fs_pagecache * pc,
struct sos_fs_pagecache_entry * entry)
{
sos_ret_t retval;
if (! ENTRY_IS_RW_DIRTY(entry))
return SOS_OK;
/* Now do the real transfer to backing store */
retval = pc->sync_fct(entry->file_offset, entry->kernel_vaddr,
pc->sync_fct_custom_data);
if (SOS_OK != retval)
return retval;
/* Transfer page to the sync list */
list_delete(pc->rw_dirty_list, entry);
entry->rw_dirty_order = 0;
list_add_head(pc->rw_sync_list, entry);
return SOS_OK;
}
/** Helper function to correctly lock an entry */
static sos_ret_t pagecache_use(struct sos_fs_pagecache * pc,
struct sos_fs_pagecache_entry * entry)
{
entry->ref_cnt ++;
return sos_kmutex_lock(& entry->lock, NULL);
}
/**
* Helper function to transfer a page to the dirty r/w list
*/
static sos_ret_t pagecache_set_rw_dirty(struct sos_fs_pagecache * pc,
struct sos_fs_pagecache_entry * entry)
{
if (ENTRY_IS_RW_DIRTY(entry))
return SOS_OK; /* Nothing to do */
list_delete(pc->rw_sync_list, entry);
entry->rw_dirty_order = ++ pc->top_rw_dirty_order;
list_add_head(pc->rw_dirty_list, entry);
return SOS_OK;
}
/** Helper function to correctly unlock an entry, flushing it to disk
if needed */
static sos_ret_t pagecache_release(struct sos_fs_pagecache * pc,
struct sos_fs_pagecache_entry * entry)
{
if (entry->ref_cnt > 1)
{
entry->ref_cnt --;
sos_kmutex_unlock(& entry->lock);
return SOS_OK;
}
/*
 * The cached page is now referenced ONLY by US: we can try to
 * remove it from the cache
 */
/* Flush any change to disk, but only if we are sure that its
contents are legal, ie that the page_in callback succeeded in
filling it */
if (! entry->initial_fill_aborted)
pagecache_sync_page(pc, entry);
/* Ok, now WE are not interested in this entry anymore */
entry->ref_cnt --;
/* While we were blocked, another thread may have asked for the
entry. In that case, stop here */
if (entry->ref_cnt > 0)
{
sos_kmutex_unlock(& entry->lock);
return SOS_OK;
}
/* Remove it from the lists */
sos_hash_remove(pc->lookup_table, entry);
if (ENTRY_IS_RW_DIRTY(entry))
list_delete(pc->rw_dirty_list, entry);
else
list_delete(pc->rw_sync_list, entry);
/* We can safely erase it now ! */
sos_kmutex_unlock(& entry->lock);
SOS_ASSERT_FATAL(SOS_OK == sos_kmutex_dispose(& entry->lock)); /* No threads are waiting */
sos_kfree(entry->kernel_vaddr);
sos_kmem_cache_free((sos_vaddr_t)entry);
return SOS_OK;
}
/**
* Helper function to look up an entry from the cache and lock it. If
* the entry does not exist (yet), return NULL.
*/
static struct sos_fs_pagecache_entry *
pagecache_lookup_and_lock(struct sos_fs_pagecache * pc,
sos_luoffset_t offset)
{
sos_luoffset_t pgoffs = SOS_OFFSET64_PAGE_ALIGN_INF(offset);
struct sos_fs_pagecache_entry * entry = NULL;
while (TRUE)
{
entry
= (struct sos_fs_pagecache_entry*) sos_hash_lookup(pc->lookup_table,
& pgoffs);
if (! entry)
break;
/* Lock it now */
SOS_ASSERT_FATAL(SOS_OK == pagecache_use(pc, entry));
/*
* Entry is now locked
*/
/* Make sure it contains legal contents: if we were blocked while
the page_in operations were reading it from disk, an error could
have occurred. In that case, we must consider that this entry is
not yet inserted in the cache */
if (entry->initial_fill_aborted)
{
pagecache_release(pc, entry);
continue;
}
/* Ok, we have the entry and it is correctly initialized ! */
break;
}
return entry;
}
sos_ret_t
sos_fs_pagecache_read(struct sos_fs_pagecache * pc,
sos_luoffset_t offset,
sos_genaddr_t dest_buf,
sos_size_t * /* in/out */len)
{
sos_ret_t retval;
sos_luoffset_t pgoffs = SOS_OFFSET64_PAGE_ALIGN_INF(offset);
sos_luoffset_t endpos = offset + *len;
struct sos_fs_pagecache_entry * entry;
entry = pagecache_lookup_and_lock(pc, pgoffs);
if (NULL == entry)
return -SOS_ENOENT;
/* Great ! Found the entry in the cache ! */
/* Read only up to the end of the page */
if (endpos - pgoffs > SOS_PAGE_SIZE)
endpos = pgoffs + SOS_PAGE_SIZE;
/* Copy page contents to destination buffer */
retval = sos_memcpy_generic_to(dest_buf,
entry->kernel_vaddr + (offset - pgoffs),
endpos - offset);
pagecache_release(pc, entry);
if (retval < 0)
{
*len = 0;
return retval;
}
*len = retval;
if ((sos_luoffset_t)retval != endpos - offset)
return -SOS_EFAULT;
return SOS_OK;
}
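/*
 * Illustrative sketch (hypothetical usage example, kept under
 * "#if 0"): how a caller could use sos_fs_pagecache_read, loading the
 * missing page on -SOS_ENOENT before retrying. my_read and
 * my_fill_page_in_cache are hypothetical names; the latter stands for
 * a routine built upon sos_fs_pagecache_ref_page /
 * sos_fs_pagecache_unlock_page (see further below).
 */
#if 0
static sos_ret_t my_read(struct sos_fs_pagecache * pc,
                         sos_luoffset_t offset,
                         sos_genaddr_t dest_buf,
                         sos_size_t * /* in/out */ len)
{
  sos_ret_t ret = sos_fs_pagecache_read(pc, offset, dest_buf, len);
  if (-SOS_ENOENT != ret)
    return ret;

  /* The page is not in the cache yet: load it, then retry */
  ret = my_fill_page_in_cache(pc, SOS_OFFSET64_PAGE_ALIGN_INF(offset));
  if (SOS_OK != ret)
    return ret;
  return sos_fs_pagecache_read(pc, offset, dest_buf, len);
}
#endif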
sos_ret_t
sos_fs_pagecache_write(struct sos_fs_pagecache * pc,
sos_luoffset_t offset,
sos_genaddr_t src_buf,
sos_size_t * /* in/out */len,
sos_bool_t synchronous_write)
{
sos_ret_t retval;
sos_luoffset_t pgoffs = SOS_OFFSET64_PAGE_ALIGN_INF(offset);
sos_luoffset_t endpos = offset + *len;
struct sos_fs_pagecache_entry * entry;
entry = pagecache_lookup_and_lock(pc, pgoffs);
if (NULL == entry)
return -SOS_ENOENT;
/* Great ! Found the entry in the cache ! */
/* Write only up to the end of the page */
if (endpos - pgoffs > SOS_PAGE_SIZE)
endpos = pgoffs + SOS_PAGE_SIZE;
/* Copy the source buffer into the cached page */
retval = sos_memcpy_generic_from(entry->kernel_vaddr + (offset - pgoffs),
src_buf,
endpos - offset);
/* Transfer the entry to the dirty list if needed */
if (retval >= 0)
pagecache_set_rw_dirty(pc, entry);
if (retval < 0)
{
*len = 0;
pagecache_release(pc, entry);
return retval;
}
*len = retval;
if ((sos_luoffset_t)retval != endpos - offset)
retval = -SOS_EFAULT;
else
retval = SOS_OK;
/* Flush to disk if needed */
if (synchronous_write)
{
sos_ret_t ret = pagecache_sync_page(pc, entry);
if (SOS_OK == retval)
retval = ret;
}
pagecache_release(pc, entry);
return retval;
}
sos_ret_t sos_fs_pagecache_set_dirty(struct sos_fs_pagecache * pc,
sos_luoffset_t offset,
sos_bool_t sync_backing_store)
{
sos_luoffset_t pgoffs = SOS_OFFSET64_PAGE_ALIGN_INF(offset);
struct sos_fs_pagecache_entry * entry;
entry = pagecache_lookup_and_lock(pc, pgoffs);
if (NULL == entry)
return -SOS_ENOENT;
/* Great ! Found the entry in the cache ! */
pagecache_set_rw_dirty(pc, entry);
/* Synchronize to backing store if needed */
if (sync_backing_store)
pagecache_sync_page(pc, entry);
pagecache_release(pc, entry);
return SOS_OK;
}
struct sos_fs_pagecache_entry *
sos_fs_pagecache_ref_page(struct sos_fs_pagecache * pc,
sos_luoffset_t offset,
sos_vaddr_t * /* out */ kernel_vaddr,
sos_bool_t * /* out */ newly_allocated)
{
sos_luoffset_t pgoffs = SOS_OFFSET64_PAGE_ALIGN_INF(offset);
struct sos_fs_pagecache_entry * entry;
/* The offset is expected to be page-aligned */
if (pgoffs != offset)
return NULL;
entry = pagecache_lookup_and_lock(pc, pgoffs);
if (NULL != entry)
{
/* Found it ! No need to go further */
*newly_allocated = FALSE;
*kernel_vaddr = entry->kernel_vaddr;
return entry;
}
/*
* Need to allocate a new kernel page
*/
entry = (struct sos_fs_pagecache_entry*)
sos_kmem_cache_alloc(cache_of_pagecache_entry, 0);
if (NULL == entry)
return NULL;
if (SOS_OK != sos_kmutex_init(& entry->lock, "pagecache_entry",
SOS_KWQ_ORDER_FIFO))
{
sos_kmem_cache_free((sos_vaddr_t)entry);
return NULL;
}
/* The initial state of the page corresponds to an aborted fill:
the entry only becomes valid once it has been correctly filled */
entry->file_offset = pgoffs;
entry->initial_fill_aborted = TRUE;
entry->ref_cnt = 1;
/* Allocate the page */
entry->kernel_vaddr = sos_kmalloc(SOS_PAGE_SIZE, 0);
if (((sos_vaddr_t)NULL) == entry->kernel_vaddr)
{
sos_kmutex_dispose(& entry->lock);
sos_kmem_cache_free((sos_vaddr_t)entry);
return NULL;
}
/* Own the mutex */
SOS_ASSERT_FATAL(SOS_OK == sos_kmutex_lock(& entry->lock, NULL));
/* Try to insert it into the hash table. This might fail if the page
was inserted in the meantime, which is possible because the
allocation routines above might have blocked */
if (SOS_OK != sos_hash_insert(pc->lookup_table, entry))
{
/* An entry was inserted while we were allocating: discard our new entry */
sos_kmutex_unlock(& entry->lock);
sos_kmutex_dispose(& entry->lock);
sos_kfree(entry->kernel_vaddr);
sos_kmem_cache_free((sos_vaddr_t)entry);
/* Get the real entry */
entry = pagecache_lookup_and_lock(pc, offset);
SOS_ASSERT_FATAL(NULL != entry);
*kernel_vaddr = entry->kernel_vaddr;
*newly_allocated = FALSE;
return entry;
}
/* Now register the entry in the sync list */
entry->rw_dirty_order = 0;
list_add_head(pc->rw_sync_list, entry);
*newly_allocated = TRUE;
*kernel_vaddr = entry->kernel_vaddr;
return entry;
}
sos_ret_t
sos_fs_pagecache_unlock_page(struct sos_fs_pagecache * pc,
struct sos_fs_pagecache_entry * entry,
sos_bool_t initial_fill_aborted)
{
entry->initial_fill_aborted = initial_fill_aborted;
if (initial_fill_aborted)
return pagecache_release(pc, entry);
return sos_kmutex_unlock(& entry->lock);
}
sos_ret_t
sos_fs_pagecache_unref_page(struct sos_fs_pagecache * pc,
sos_luoffset_t offset)
{
sos_luoffset_t pgoffs = SOS_OFFSET64_PAGE_ALIGN_INF(offset);
struct sos_fs_pagecache_entry * entry;
/* The offset is expected to be page-aligned */
if (pgoffs != offset)
return -SOS_EINVAL;
entry
= (struct sos_fs_pagecache_entry*) sos_hash_lookup(pc->lookup_table,
& pgoffs);
SOS_ASSERT_FATAL(NULL != entry);
SOS_ASSERT_FATAL(SOS_OK == sos_kmutex_lock(& entry->lock, NULL));
return pagecache_release(pc, entry);
}
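/*
 * Illustrative sketch (hypothetical usage example, kept under
 * "#if 0"): typical life cycle of a cached page backing an mmap'ed
 * file/device. my_mmap_page_in and my_device_read_page are
 * hypothetical names.
 */
#if 0
static sos_ret_t my_mmap_page_in(struct sos_fs_pagecache * pc,
                                 sos_luoffset_t pg_offset,
                                 sos_vaddr_t * /* out */ kernel_vaddr)
{
  sos_bool_t newly_allocated;
  sos_ret_t ret = SOS_OK;
  struct sos_fs_pagecache_entry * entry
    = sos_fs_pagecache_ref_page(pc, pg_offset, kernel_vaddr,
                                & newly_allocated);
  if (NULL == entry)
    return -SOS_ENOMEM;

  /* Fill the page from the backing store only the first time it
     enters the cache */
  if (newly_allocated)
    ret = my_device_read_page(pg_offset, *kernel_vaddr);

  /* Unlock the entry; signalling an aborted fill removes it from the
     cache and drops our reference */
  sos_fs_pagecache_unlock_page(pc, entry, (SOS_OK != ret));

  /* On success, the reference taken above remains held as long as the
     page is mapped; the unmap path is expected to call
     sos_fs_pagecache_unref_page(pc, pg_offset) to release it */
  return ret;
}
#endif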
sos_ret_t
sos_fs_pagecache_sync(struct sos_fs_pagecache * pc)
{
sos_ret_t retval = SOS_OK;
int dummy = 0; /* Only needed to have a statement after the lookup_next_ent label */
sos_lcount_t rw_dirty_order = 0;
/** High watermark: pages that become dirty after this point will
not be taken into account by this pass */
sos_lcount_t top_rw_dirty_order = pc->top_rw_dirty_order;
if (list_is_empty(pc->rw_dirty_list))
return SOS_OK;
/* This scan is exhaustive and resilient to the addition/removal of
entries: we walk the dirty list from its tail (lowest, ie oldest,
rw_dirty_order) towards its head and use rw_dirty_order as a
cursor, so that the walk can safely be restarted whenever the list
changes while we block */
while (TRUE)
{
struct sos_fs_pagecache_entry * entry = NULL;
int ndirty;
/* As long as we don't block, we can safely access the
prev/next fields of the page descriptor */
list_foreach_backward(pc->rw_dirty_list, entry, ndirty)
{
sos_ret_t ret = SOS_OK;
struct sos_fs_pagecache_entry * prev_entry = NULL;
/* Reached the initial high watermark ? Don't take the
additional pages into account */
if (entry->rw_dirty_order > top_rw_dirty_order)
break;
if (entry->rw_dirty_order <= rw_dirty_order)
continue;
rw_dirty_order = entry->rw_dirty_order;
prev_entry = entry->prev;
SOS_ASSERT_FATAL(SOS_OK == pagecache_use(pc, entry));
if (! entry->initial_fill_aborted)
ret = pagecache_sync_page(pc, entry);
if (SOS_OK != ret)
retval = ret;
pagecache_release(pc, entry);
/* We must NOT go on with the inner loop, because the prev/next
page cache entries might have been removed or added while
pagecache_sync_page()/pagecache_release() blocked */
if (prev_entry != entry->prev)
goto lookup_next_ent;
}
/* Reached the end of the list */
break;
lookup_next_ent:
/* Loop over */
dummy ++;
}
return retval;
}
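/*
 * Illustrative sketch (hypothetical usage example, kept under
 * "#if 0"): flushing a cache before tearing it down, eg on unmount.
 * my_flush_and_delete_cache is a made-up name. Note that only the
 * pages dirtied through the read/write API are flushed; pages
 * modified solely through mmap'ed mappings are not tracked here (see
 * the rw_dirty_order comment above).
 */
#if 0
static sos_ret_t my_flush_and_delete_cache(struct sos_fs_pagecache * pc)
{
  /* Push the pages dirtied through the read/write API back to the
     backing store */
  sos_ret_t ret = sos_fs_pagecache_sync(pc);
  if (SOS_OK != ret)
    return ret;

  /* The cache may be deleted only once every page has been
     unreferenced, ie once both internal lists are empty */
  return sos_fs_pagecache_delete_cache(pc);
}
#endif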