/* Copyright (C) 2005,2006 David Decotigny

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License
   as published by the Free Software Foundation; either version 2
   of the License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
   02111-1307, USA.
*/

#include <sos/assert.h>
#include <sos/list.h>
#include <sos/hash.h>
#include <sos/physmem.h>    /* For SOS_PAGE_MASK */
#include <sos/kmem_slab.h>
#include <sos/kmalloc.h>
#include <sos/kmutex.h>
#include <sos/uaccess.h>

#include "fs_pagecache.h"

#define SOS_OFFSET64_PAGE_ALIGN_INF(offs64) \
  ( ((sos_luoffset_t)(offs64)) & (~ ((sos_luoffset_t)(SOS_PAGE_MASK))) )

#define SOS_OFFSET64_IS_PAGE_ALIGNED(offs64) \
  ( ( ((sos_luoffset_t)(offs64)) & (((sos_luoffset_t)(SOS_PAGE_MASK))) ) == 0 )
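
/*
 * Worked example for the two macros above (this assumes the usual
 * 4 kB SOS pages, ie SOS_PAGE_MASK == 0xfff):
 *
 *   SOS_OFFSET64_PAGE_ALIGN_INF(0x12345ULL)  == 0x12000
 *   SOS_OFFSET64_IS_PAGE_ALIGNED(0x12000ULL) is true
 *   SOS_OFFSET64_IS_PAGE_ALIGNED(0x12345ULL) is false
 */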

/**
 * Definition of an object holding a reference to a shared mapping of
 * a file/device-mapped cache page.
 *
 * @note This structure is huge. We could shrink it considerably by
 * removing the "name" field from the lock structure (32 bytes).
 */
struct sos_fs_pagecache_entry
{
  /** Offset of the cached page in the file or device */
  sos_luoffset_t file_offset;

  /** Address of the cached page for this offset */
  sos_vaddr_t    kernel_vaddr;

  struct sos_kmutex lock;
  sos_count_t       ref_cnt;

  sos_bool_t initial_fill_aborted; /**< True when the page could not
                                        be correctly filled */

  /**
   * When 0: the page is clean wrt read/write syscalls, ie the disk
   * contents reflect the contents of the page since the last
   * read/write operation. However, the disk may NOT be in sync wrt
   * mmap() operations: if mmap() operations occurred in the meantime,
   * the disk may NOT be up to date, and the pagecache entry may even
   * NOT be considered dirty. This is because we do not trace each of
   * the read/write MMU operations from every process (this would
   * require catching all writes, even on read/write mapped pages) and
   * we don't have a reverse mapping available to set the page
   * read-only in every mapping once it has been synched to disk (to
   * effectively trace the dirty state relative to mmap operations).
   *
   * When > 0: at least one process changed the contents of the page
   * through read/write syscalls since the last sync operation.
   *
   * @note A boolean is enough for 99% of the code. But we need a real
   * counter for the sos_fs_pagecache_sync operation to make sure we
   * don't iterate twice over the same page.
   */
  sos_lcount_t rw_dirty_order;
#define ENTRY_IS_RW_DIRTY(e) ((e)->rw_dirty_order > 0)

  /** Linkage structure to keep the cache entry in the hash map */
  struct sos_hash_linkage hlink;

  /** Links to insert the entry into the rw_sync/rw_dirty lists */
  struct sos_fs_pagecache_entry *prev, *next;
};


struct sos_fs_pagecache
{
  /** The operation used to synchronize the mapped pages with the
      backing store */
  sos_fs_pagecache_sync_function_t sync_fct;
  void * sync_fct_custom_data;

  /** The dictionary offset -> pagecache_entry */
  struct sos_hash_table * lookup_table;

  /* Lists to look into in order to free a node */
  struct sos_fs_pagecache_entry * rw_sync_list;  /**< Pages in sync with
                                                      the disk wrt the
                                                      read/write API
                                                      (LRU at end) */
  struct sos_fs_pagecache_entry * rw_dirty_list; /**< Dirty pages wrt the
                                                      read/write API
                                                      (LRU last) */

  /** The "timestamp" high watermark used to iterate over the dirty
      pages in the sync function */
  sos_lcount_t top_rw_dirty_order;
};


/** The slab cache for pagecaches */
static struct sos_kslab_cache * cache_of_pagecache;

/** The slab cache for pagecache entries */
static struct sos_kslab_cache * cache_of_pagecache_entry;


sos_ret_t sos_fs_pagecache_subsystem_setup()
{
  /* Allocate the slab caches for the page caches and page cache
     entries */
  cache_of_pagecache
    = sos_kmem_cache_create("pagecache",
                            sizeof(struct sos_fs_pagecache),
                            2, 0,
                            SOS_KSLAB_CREATE_MAP | SOS_KSLAB_CREATE_ZERO);
  if (NULL == cache_of_pagecache)
    return -SOS_ENOMEM;

  cache_of_pagecache_entry
    = sos_kmem_cache_create("pagecache_entry",
                            sizeof(struct sos_fs_pagecache_entry),
                            2, 0,
                            SOS_KSLAB_CREATE_MAP | SOS_KSLAB_CREATE_ZERO);
  if (NULL == cache_of_pagecache_entry)
    {
      sos_kmem_cache_destroy(cache_of_pagecache);
      return -SOS_ENOMEM;
    }

  return SOS_OK;
}


struct sos_fs_pagecache *
sos_fs_pagecache_new_cache(sos_fs_pagecache_sync_function_t sync_fct,
                           void * sync_fct_custom_data)
{
  struct sos_fs_pagecache * pagecache
    = (struct sos_fs_pagecache*) sos_kmem_cache_alloc(cache_of_pagecache, 0);
  if (NULL == pagecache)
    return NULL;

  pagecache->lookup_table = sos_hash_create("pagecache",
                                            struct sos_fs_pagecache_entry,
                                            sos_hash_ui64,
                                            sos_hash_key_eq_ui64,
                                            127,
                                            file_offset, hlink);
  if (NULL == pagecache->lookup_table)
    {
      sos_kmem_cache_free((sos_vaddr_t) pagecache);
      return NULL;
    }

  pagecache->sync_fct             = sync_fct;
  pagecache->sync_fct_custom_data = sync_fct_custom_data;
  pagecache->top_rw_dirty_order   = 0x24;

  return pagecache;
}


sos_ret_t sos_fs_pagecache_delete_cache(struct sos_fs_pagecache * pc)
{
  /* The cache is EXPECTED to be empty ! */
  if (!list_is_empty(pc->rw_dirty_list))
    SOS_FATAL_ERROR("Non empty dirty list");
  if (!list_is_empty(pc->rw_sync_list))
    SOS_FATAL_ERROR("Non empty sync list");

  sos_hash_dispose(pc->lookup_table);
  return sos_kmem_cache_free((sos_vaddr_t)pc);
}
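
/*
 * Usage sketch (not part of this file): a driver or file system
 * registers its "write this page back to the backing store" routine
 * when it creates its page cache. The names mydisk_sync_page(),
 * mydisk_write_page() and struct mydisk are hypothetical; only the
 * argument order (offset, kernel address, custom data) is imposed by
 * the call made in pagecache_sync_page() below:
 *
 *   static sos_ret_t mydisk_sync_page(sos_luoffset_t file_offset,
 *                                     sos_vaddr_t    kernel_vaddr,
 *                                     void         * custom_data)
 *   {
 *     struct mydisk * disk = (struct mydisk*) custom_data;
 *     return mydisk_write_page(disk, file_offset, kernel_vaddr);
 *   }
 *
 *   disk->pagecache = sos_fs_pagecache_new_cache(mydisk_sync_page, disk);
 *   if (NULL == disk->pagecache)
 *     return -SOS_ENOMEM;
 */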

/** Helper function to flush a page to disk. Expects the entry to be
    locked */
static sos_ret_t pagecache_sync_page(struct sos_fs_pagecache * pc,
                                     struct sos_fs_pagecache_entry * entry)
{
  sos_ret_t retval;

  if (! ENTRY_IS_RW_DIRTY(entry))
    return SOS_OK;

  /* Now do the real transfer to the backing store */
  retval = pc->sync_fct(entry->file_offset, entry->kernel_vaddr,
                        pc->sync_fct_custom_data);
  if (SOS_OK != retval)
    return retval;

  /* Transfer the page to the sync list */
  list_delete(pc->rw_dirty_list, entry);
  entry->rw_dirty_order = 0;
  list_add_head(pc->rw_sync_list, entry);

  return SOS_OK;
}


/** Helper function to correctly lock an entry */
static sos_ret_t pagecache_use(struct sos_fs_pagecache * pc,
                               struct sos_fs_pagecache_entry * entry)
{
  entry->ref_cnt ++;
  return sos_kmutex_lock(& entry->lock, NULL);
}


/**
 * Helper function to transfer a page to the dirty r/w list
 */
static sos_ret_t pagecache_set_rw_dirty(struct sos_fs_pagecache * pc,
                                        struct sos_fs_pagecache_entry * entry)
{
  if (ENTRY_IS_RW_DIRTY(entry))
    return SOS_OK; /* Nothing to do */

  list_delete(pc->rw_sync_list, entry);
  entry->rw_dirty_order = ++ pc->top_rw_dirty_order;
  list_add_head(pc->rw_dirty_list, entry);

  return SOS_OK;
}


/** Helper function to correctly unlock an entry, flushing it to disk
    if needed */
static sos_ret_t pagecache_release(struct sos_fs_pagecache * pc,
                                   struct sos_fs_pagecache_entry * entry)
{
  if (entry->ref_cnt > 1)
    {
      entry->ref_cnt --;
      sos_kmutex_unlock(& entry->lock);
      return SOS_OK;
    }

  /*
   * The cached page is now referenced ONLY by US, we can try to
   * remove it from the cache
   */

  /* Flush any change to disk, at least if we are sure that its
     contents are legal, ie that the page_in callback succeeded in
     filling it */
  if (! entry->initial_fill_aborted)
    pagecache_sync_page(pc, entry);

  /* Ok, now WE are not interested in this entry anymore */
  entry->ref_cnt --;

  /* During the blocking time, another thread could have asked for
     the entry. In this case, stop here */
  if (entry->ref_cnt > 0)
    {
      sos_kmutex_unlock(& entry->lock);
      return SOS_OK;
    }

  /* Remove it from the lists */
  sos_hash_remove(pc->lookup_table, entry);
  if (ENTRY_IS_RW_DIRTY(entry))
    list_delete(pc->rw_dirty_list, entry);
  else
    list_delete(pc->rw_sync_list, entry);

  /* We can safely erase it now ! */
  sos_kmutex_unlock(& entry->lock);
  SOS_ASSERT_FATAL(SOS_OK == sos_kmutex_dispose(& entry->lock));
                                                /* No thread is waiting */
  sos_kfree(entry->kernel_vaddr);
  sos_kmem_cache_free((sos_vaddr_t)entry);
  return SOS_OK;
}
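
/*
 * Summary of the internal protocol implemented by the helpers above
 * (a sketch, not extra API): every access to a cached page is
 * bracketed by pagecache_use()/pagecache_release(), and the release
 * side is also what evicts the page once the last reference is gone:
 *
 *   SOS_ASSERT_FATAL(SOS_OK == pagecache_use(pc, entry)); // ref_cnt++, lock
 *   ... read and/or modify the page at entry->kernel_vaddr ...
 *   pagecache_set_rw_dirty(pc, entry);  // only if the page was modified
 *   pagecache_release(pc, entry);       // unlock; flush + free on last ref
 */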

/**
 * Helper function to look up an entry from the cache and lock it. If
 * the entry does not exist (yet), return NULL.
 */
static struct sos_fs_pagecache_entry *
pagecache_lookup_and_lock(struct sos_fs_pagecache * pc,
                          sos_luoffset_t offset)
{
  sos_luoffset_t pgoffs = SOS_OFFSET64_PAGE_ALIGN_INF(offset);
  struct sos_fs_pagecache_entry * entry = NULL;

  while (TRUE)
    {
      entry = (struct sos_fs_pagecache_entry*)
        sos_hash_lookup(pc->lookup_table, & pgoffs);
      if (! entry)
        break;

      /* Lock it now */
      SOS_ASSERT_FATAL(SOS_OK == pagecache_use(pc, entry));

      /*
       * Entry is now locked
       */

      /* Make sure it contains legal contents: if we were blocked
         because of the page_in operation reading it from disk, an
         error could have occurred. In this case, we must consider
         that this entry is not yet inserted in the cache */
      if (entry->initial_fill_aborted)
        {
          pagecache_release(pc, entry);
          continue;
        }

      /* Ok, we have the entry and it is correctly initialized ! */
      break;
    }

  return entry;
}


sos_ret_t sos_fs_pagecache_read(struct sos_fs_pagecache * pc,
                                sos_luoffset_t offset,
                                sos_genaddr_t dest_buf,
                                sos_size_t * /* in/out */ len)
{
  sos_ret_t retval;
  sos_luoffset_t pgoffs = SOS_OFFSET64_PAGE_ALIGN_INF(offset);
  sos_luoffset_t endpos = offset + *len;
  struct sos_fs_pagecache_entry * entry;

  entry = pagecache_lookup_and_lock(pc, pgoffs);
  if (NULL == entry)
    return -SOS_ENOENT;

  /* Great ! Found the entry in the cache ! */

  /* Read only up to the end of the page */
  if (endpos - pgoffs > SOS_PAGE_SIZE)
    endpos = pgoffs + SOS_PAGE_SIZE;

  /* Copy the page contents to the destination buffer */
  retval = sos_memcpy_generic_to(dest_buf,
                                 entry->kernel_vaddr + (offset - pgoffs),
                                 endpos - offset);

  pagecache_release(pc, entry);

  if (retval < 0)
    {
      *len = 0;
      return retval;
    }

  *len = retval;
  if ((sos_luoffset_t)retval != endpos - offset)
    return -SOS_EFAULT;

  return SOS_OK;
}


sos_ret_t sos_fs_pagecache_write(struct sos_fs_pagecache * pc,
                                 sos_luoffset_t offset,
                                 sos_genaddr_t src_buf,
                                 sos_size_t * /* in/out */ len,
                                 sos_bool_t synchronous_write)
{
  sos_ret_t retval;
  sos_luoffset_t pgoffs = SOS_OFFSET64_PAGE_ALIGN_INF(offset);
  sos_luoffset_t endpos = offset + *len;
  struct sos_fs_pagecache_entry * entry;

  entry = pagecache_lookup_and_lock(pc, pgoffs);
  if (NULL == entry)
    return -SOS_ENOENT;

  /* Great ! Found the entry in the cache ! */

  /* Write only up to the end of the page */
  if (endpos - pgoffs > SOS_PAGE_SIZE)
    endpos = pgoffs + SOS_PAGE_SIZE;

  /* Copy the source buffer contents into the page */
  retval = sos_memcpy_generic_from(entry->kernel_vaddr + (offset - pgoffs),
                                   src_buf,
                                   endpos - offset);

  /* Transfer the entry to the dirty list if needed */
  if (retval >= 0)
    pagecache_set_rw_dirty(pc, entry);

  if (retval < 0)
    {
      *len = 0;
      pagecache_release(pc, entry);
      return retval;
    }

  *len = retval;
  if ((sos_luoffset_t)retval != endpos - offset)
    retval = -SOS_EFAULT;
  else
    retval = SOS_OK;

  /* Flush to disk if needed */
  if (synchronous_write)
    {
      sos_ret_t ret = pagecache_sync_page(pc, entry);
      if (SOS_OK == retval)
        retval = ret;
    }

  pagecache_release(pc, entry);
  return retval;
}


sos_ret_t sos_fs_pagecache_set_dirty(struct sos_fs_pagecache * pc,
                                     sos_luoffset_t offset,
                                     sos_bool_t sync_backing_store)
{
  sos_luoffset_t pgoffs = SOS_OFFSET64_PAGE_ALIGN_INF(offset);
  struct sos_fs_pagecache_entry * entry;

  entry = pagecache_lookup_and_lock(pc, pgoffs);
  if (NULL == entry)
    return -SOS_ENOENT;

  /* Great ! Found the entry in the cache ! */
  pagecache_set_rw_dirty(pc, entry);

  /* Synchronize to the backing store if needed */
  if (sync_backing_store)
    pagecache_sync_page(pc, entry);

  pagecache_release(pc, entry);
  return SOS_OK;
}
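
/*
 * Note on the in/out "len" convention of the read/write functions
 * above (sketch values, assuming 4 kB pages): a single call never
 * crosses a page boundary, so callers are expected to loop and, on
 * -SOS_ENOENT, to page the data in first (see
 * sos_fs_pagecache_ref_page below). For instance:
 *
 *   sos_size_t len = 10000;                        // requested
 *   sos_fs_pagecache_read(pc, 0x12345, buf, &len); // hypothetical buf
 *   // On success, len == 0xcbb (3259): the copy stopped at the end of
 *   // the page containing offset 0x12345, and the caller must issue
 *   // further calls for the remaining bytes.
 */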

struct sos_fs_pagecache_entry *
sos_fs_pagecache_ref_page(struct sos_fs_pagecache * pc,
                          sos_luoffset_t offset,
                          sos_vaddr_t * /* out */ kernel_vaddr,
                          sos_bool_t * /* out */ newly_allocated)
{
  sos_luoffset_t pgoffs = SOS_OFFSET64_PAGE_ALIGN_INF(offset);
  struct sos_fs_pagecache_entry * entry;

  /* The offset is expected to be page-aligned */
  if (pgoffs != offset)
    return NULL;

  entry = pagecache_lookup_and_lock(pc, pgoffs);
  if (NULL != entry)
    {
      /* Found it ! No need to go further */
      *newly_allocated = FALSE;
      *kernel_vaddr    = entry->kernel_vaddr;
      return entry;
    }

  /*
   * Need to allocate a new kernel page
   */
  entry = (struct sos_fs_pagecache_entry*)
    sos_kmem_cache_alloc(cache_of_pagecache_entry, 0);
  if (NULL == entry)
    return NULL;

  if (SOS_OK != sos_kmutex_init(& entry->lock, "pagecache_entry",
                                SOS_KWQ_ORDER_FIFO))
    {
      sos_kmem_cache_free((sos_vaddr_t)entry);
      return NULL;
    }

  /* The initial state of the page corresponds to an aborted
     initialization: it becomes valid only once it has been correctly
     filled and unlocked */
  entry->file_offset          = pgoffs;
  entry->initial_fill_aborted = TRUE;
  entry->ref_cnt              = 1;

  /* Allocate the page */
  entry->kernel_vaddr = sos_kmalloc(SOS_PAGE_SIZE, 0);
  if (((sos_vaddr_t)NULL) == entry->kernel_vaddr)
    {
      sos_kmutex_dispose(& entry->lock);
      sos_kmem_cache_free((sos_vaddr_t)entry);
      return NULL;
    }

  /* Own the mutex */
  SOS_ASSERT_FATAL(SOS_OK == sos_kmutex_lock(& entry->lock, NULL));

  /* Try to insert it into the hash table. This might fail if the page
     was already inserted in the meantime, which is possible because
     the allocation routines above might have blocked */
  if (SOS_OK != sos_hash_insert(pc->lookup_table, entry))
    {
      /* An entry was inserted while we were blocked in the allocation
         routines: undo our new entry */
      sos_kmutex_unlock(& entry->lock);
      sos_kmutex_dispose(& entry->lock);
      sos_kfree(entry->kernel_vaddr);
      sos_kmem_cache_free((sos_vaddr_t)entry);

      /* Get the real entry */
      entry = pagecache_lookup_and_lock(pc, offset);
      SOS_ASSERT_FATAL(NULL != entry);

      *kernel_vaddr    = entry->kernel_vaddr;
      *newly_allocated = FALSE;
      return entry;
    }

  /* Now register the entry in the sync list */
  entry->rw_dirty_order = 0;
  list_add_head(pc->rw_sync_list, entry);

  *newly_allocated = TRUE;
  *kernel_vaddr    = entry->kernel_vaddr;
  return entry;
}


sos_ret_t sos_fs_pagecache_unlock_page(struct sos_fs_pagecache * pc,
                                       struct sos_fs_pagecache_entry * entry,
                                       sos_bool_t initial_fill_aborted)
{
  entry->initial_fill_aborted = initial_fill_aborted;

  if (initial_fill_aborted)
    return pagecache_release(pc, entry);

  return sos_kmutex_unlock(& entry->lock);
}


sos_ret_t sos_fs_pagecache_unref_page(struct sos_fs_pagecache * pc,
                                      sos_luoffset_t offset)
{
  sos_luoffset_t pgoffs = SOS_OFFSET64_PAGE_ALIGN_INF(offset);
  struct sos_fs_pagecache_entry * entry;

  /* The offset is expected to be page-aligned */
  if (pgoffs != offset)
    return -SOS_EINVAL;

  entry = (struct sos_fs_pagecache_entry*)
    sos_hash_lookup(pc->lookup_table, & pgoffs);
  SOS_ASSERT_FATAL(NULL != entry);

  SOS_ASSERT_FATAL(SOS_OK == sos_kmutex_lock(& entry->lock, NULL));

  return pagecache_release(pc, entry);
}
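
/*
 * Page-in protocol expected by the three functions above (a sketch
 * following their semantics; read_page_from_disk() is a hypothetical
 * helper):
 *
 *   sos_bool_t  new_page;
 *   sos_vaddr_t vaddr;
 *   sos_ret_t   ret = SOS_OK;
 *   struct sos_fs_pagecache_entry * entry
 *     = sos_fs_pagecache_ref_page(pc, pgoffs, & vaddr, & new_page);
 *   if (NULL == entry)
 *     return -SOS_ENOMEM;
 *   if (new_page)
 *     ret = read_page_from_disk(pgoffs, vaddr); // fill the fresh page
 *   sos_fs_pagecache_unlock_page(pc, entry, (SOS_OK != ret));
 *   if (SOS_OK != ret)
 *     return ret; // the half-initialized page has already been released
 *   ... access the page mapped at vaddr ...
 *   sos_fs_pagecache_unref_page(pc, pgoffs);  // drop the reference
 */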

sos_ret_t sos_fs_pagecache_sync(struct sos_fs_pagecache * pc)
{
  sos_ret_t retval = SOS_OK;
  int dummy = 0;
  sos_lcount_t rw_dirty_order = 0;

  /** High watermark telling "don't take the pages added afterwards
      into account" */
  sos_lcount_t top_rw_dirty_order = pc->top_rw_dirty_order;

  if (list_is_empty(pc->rw_dirty_list))
    return SOS_OK;

  /* This scan is exhaustive and resilient to the addition/removal of
     entries as long as newly-dirtied entries are added at the head of
     the list (the scan runs backward, ie from tail to head, in
     increasing rw_dirty_order) */
  while (TRUE)
    {
      struct sos_fs_pagecache_entry * entry = NULL;
      int ndirty;

      /* As long as we don't block, we can safely access the
         prev/next fields of the page descriptor */
      list_foreach_backward(pc->rw_dirty_list, entry, ndirty)
        {
          sos_ret_t ret = SOS_OK;
          struct sos_fs_pagecache_entry * prev_entry = NULL;

          /* Reached the initial high watermark ? Don't take the
             additional pages into account */
          if (entry->rw_dirty_order > top_rw_dirty_order)
            break;

          /* Page already handled during this sync ? Skip it */
          if (entry->rw_dirty_order <= rw_dirty_order)
            continue;

          rw_dirty_order = entry->rw_dirty_order;
          prev_entry     = entry->prev;

          SOS_ASSERT_FATAL(SOS_OK == pagecache_use(pc, entry));
          if (! entry->initial_fill_aborted)
            ret = pagecache_sync_page(pc, entry);
          if (SOS_OK != ret)
            retval = ret;
          pagecache_release(pc, entry);

          /* We must NOT continue the loop because the prev/next page
             cache entries might have been removed or added (sync
             pages, by definition) ! */
          if (prev_entry != entry->prev)
            goto lookup_next_ent;
        }

      /* Reached the end of the list */
      break;

    lookup_next_ent:
      /* Loop over */
      dummy ++;
    }

  return retval;
}
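
/*
 * Teardown sketch (assuming the owner of the cache drives it this
 * way): flush the dirty pages, make sure every mapping has dropped
 * its reference (the last release of each page removes it from the
 * lists), then destroy the cache; sos_fs_pagecache_delete_cache()
 * triggers a fatal error if any page is still cached:
 *
 *   sos_fs_pagecache_sync(pc);         // write back the dirty pages
 *   ... unref every page still referenced ...
 *   sos_fs_pagecache_delete_cache(pc); // expects both lists to be empty
 */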