/* Copyright (C) 2005,2006  David Decotigny

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
   02111-1307, USA.
*/
#ifndef _SOS_FSPAGECACHE_H_
#define _SOS_FSPAGECACHE_H_

/**
 * @file fs_pagecache.h
 *
 * Simple page cache interface. Used to automate the synchronization
 * between the read/write operations and mmap. A "FS page cache" is
 * simply a set of pages mapping a file in memory. A file may not be
 * entirely mapped into memory: its pages are mapped only if any user
 * thread invoked an mmap and page-faulted inside the mapped region to
 * map these pages into memory. Contrary to some other caches in SOS
 * (eg the block cache), this one is not limited in size. As many
 * pages as needed will be allocated for it, as permitted by the
 * available RAM. With a pageout mechanism, this cache will be
 * shrunk when needed: some of its pages will be transferred back to
 * disk and unmapped.
 *
 * A page cache is used both to cache memory mapped files of an FS,
 * and memory mapped block devices. Hence:
 *  - there is one SINGLE page cache for each block device (proper to
 *    each disk, to each partition)
 *  - there is one SINGLE page cache for each file of a file system
 *
 * For block devices, the page cache automatically synchronizes the
 * pages with the block cache as long as these mapped pages are
 * accessed through the read/write API. However, NO automatic
 * and accurate synchronization between the in-memory modified
 * pages (accessed through the MMU) and the block cache is provided
 * because we have no way to collect the accurate list of pages
 * modified through MMU write accesses (this would require either to
 * catch all the MMU write operations [too inefficient !], or to have
 * a reverse-mapping system in order to look at the dirty bit of all
 * the mappings). Hence, to enforce blkcache/pagecache
 * synchronization, the msync/munmap API must be used manually. Thus,
 * the page cache is accurately synchronized with the block cache:
 *  - automatically: with the read/write/sync operations
 *  - manually: with the msync and munmap (and of course: exit) operations
 *
 * Nevertheless, from the viewpoint of "blkdev.c", the
 * blockdev_read/write operations are always in sync with the MMU
 * because the pagecache is accessed prior to the blkcache: any
 * divergence between the pagecache and the blkcache is hence
 * hidden. But keep in mind that if you want the disk to
 * accurately reflect the contents of the mapped pages, you have to
 * eventually call msync, munmap, or to destroy the address space (ie
 * exit the process).
 *
 * A side effect: if you map /dev/hda and /dev/hda1, both mappings
 * will be inconsistent and may also be inconsistent with read/write
 * accesses. This is because the partitions have their own page cache
 * while they share the block cache with the disk device. A solution
 * would be to share the page cache between the disk device and all
 * its partitions.
But, due to the fact that partitions are not * necessarily page-aligned in the disk, this would impose some pages * to not correspond to a page-aligned offset inside a partition, * requiring either to have an odd semantic of the mmap syscall (the * allowed device "offset" would depend on the disk partitioning) if * we want to share the mapped pages between the cache and userspace, * or to allocate other pages for the required userspace mappings and * keep them in sync with the page cache pages. Both solutions seem * ugly to me, and not worth implementing since the page cache is * aimed at being generic enough to be used for file mappings: files * don't have sub-files (as do disk devices that have partitions). So * solving the problem is non pertinent for files. And who will ever * need /dev/hda mappings to be consistent with those of /dev/hda1 ?... */ #include #include /** Opaque structure holding a page cache */ struct sos_fs_pagecache; /** Opaque structure holding a page of the cache */ struct sos_fs_pagecache_entry; sos_ret_t sos_fs_pagecache_subsystem_setup(void); /** * Function called to flush the dirty pages to backing store */ typedef sos_ret_t (*sos_fs_pagecache_sync_function_t)(sos_luoffset_t offset, sos_vaddr_t dirty_page, void * custom_data); /** * Create a new pagecache. * * @param sync_fct, the function used to flush the dirty pages to * backing store. may be NULL */ struct sos_fs_pagecache * sos_fs_pagecache_new_cache(sos_fs_pagecache_sync_function_t sync_fct, void * sync_fct_custom_data); /** * Delete the page cache. * * The page cache is expected to be already flushed to backing store */ sos_ret_t sos_fs_pagecache_delete_cache(struct sos_fs_pagecache * pc); /** * Read from the given offset from the cache, if present. 
* @return ENOENT when no page for the given offset is mapped, return * EFAULT when the contents could not be completely copied to * destination buffer */ sos_ret_t sos_fs_pagecache_read(struct sos_fs_pagecache * pc, sos_luoffset_t offset, sos_genaddr_t dest_buf, sos_size_t * /* in/out */len); /** * Write at the given offset from the cache, if present * @return ENOENT when no page for the given offset is mapped, return * EFAULT when the contents could not be completely copied from * source buffer */ sos_ret_t sos_fs_pagecache_write(struct sos_fs_pagecache * pc, sos_luoffset_t offset, sos_genaddr_t src_buf, sos_size_t * /* in/out */len, sos_bool_t synchronous_write); /** * Function reserved to blkdev.c and FS code: used by the msync * callback to mark a pagecache page dirty * * @param sync_backing_store When TRUE, then the page must be flushed * to backing store. */ sos_ret_t sos_fs_pagecache_set_dirty(struct sos_fs_pagecache * pc, sos_luoffset_t offset, sos_bool_t sync_backing_store); /** * Prepare a page to be mapped: get a NEW reference to the page * (kernel address) of the page to be mapped, which is also locked in * order to be used. If the page is not yet present in the cache, * allocate it and prepare it to be filled * * @param offset MUST be page-aligned * @param newly_allocated TRUE when the page was not already mapped by * someone: the contents of the page is then IRRELEVANT * * @return NULL on error * * @note The page is also LOCKED, use unlock to unlock it before * unreferencing it */ struct sos_fs_pagecache_entry * sos_fs_pagecache_ref_page(struct sos_fs_pagecache * pc, sos_luoffset_t offset, sos_vaddr_t * /* out */ kernel_vaddr, sos_bool_t * /* out */ newly_allocated); /** Called by the blkdev.c and FS page_in callback to unlock the entry after it has been initialized. 
*/ sos_ret_t sos_fs_pagecache_unlock_page(struct sos_fs_pagecache * pc, struct sos_fs_pagecache_entry * entry, sos_bool_t initial_fill_aborted); /** * Called when the page is unmapped from a user process space * @param offset MUST be page-aligned * * @note the page is expected to be present in the cache * @note the entry is expected NOT to be locked ! */ sos_ret_t sos_fs_pagecache_unref_page(struct sos_fs_pagecache * pc, sos_luoffset_t offset); /** Call the sync function on each dirty page */ sos_ret_t sos_fs_pagecache_sync(struct sos_fs_pagecache * pc); #endif