/* sos-code-article10/sos/fs_pagecache.h (215 lines, 7.8 KiB, C) */

/* Copyright (C) 2005,2006 David Decotigny
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
USA.
*/
#ifndef _SOS_FSPAGECACHE_H_
#define _SOS_FSPAGECACHE_H_
/**
* @file fs_pagecache.h
*
* Simple page cache interface. Used to automate the synchronization
between the read/write operations and mmap. A "FS page cache" is
* simply a set of pages mapping a file in memory. A file may not be
* entirely mapped into memory: its pages are mapped only if any user
* thread invoked an mmap and page-faulted inside the mapped region to
* map these pages into memory. Contrary to some other caches in SOS
* (eg the block cache), this one is not limited in size. As many
* pages as needed will be allocated for it, as permitted by the
* available RAM. With a pageout mechanism, this cache will be
shrunk when needed: some of its pages will be transferred back to
* disk and unmapped.
*
* A page cache is used both to cache memory mapped files of an FS,
* and memory mapped block devices. Hence:
* - there is one SINGLE page cache for each block device (proper to
* each disk, to each partition)
* - there is one SINGLE page cache for each file of a file system
*
* For block devices, the page cache automatically synchronizes the
* pages with the block cache as long as these mapped pages are
* accessed through the read/write API. However, <b>NO <i>automatic
* and accurate</i> synchronization</b> between the in-memory modified
* pages (accessed through the MMU) and the block cache is provided
* because we have no way to collect the accurate list of pages
* modified through MMU write accesses (this would require either to
* catch all the MMU write operations [too inefficient !], or to have
* a reverse-mapping system in order to look at the dirty bit of all
* the mappings). Hence, to enforce blkcache/pagecache
* synchronization, the msync/munmap API must be used manually. Thus,
* the page cache is accurately synchronized with the block cache:
* - automatically: with the read/write/sync operations
* - manually: with the msync and munmap (and of course: exit) operations
*
* Nevertheless, from the viewpoint of "blkdev.c", the
* blockdev_read/write operations are always in sync with the MMU
* because the pagecache is accessed prior to the blkcache: any
* divergence between the pagecache and the blkcache is hence
* <i>hidden</i>. But keep in mind that if you want the disk to
* accurately reflect the contents of the mapped pages, you have to
* eventually call msync, munmap, or to destroy the address space (ie
* exit the process).
*
* A side effect: if you map /dev/hda and /dev/hda1, both mappings
* will be inconsistent and may also be inconsistent with read/write
* accesses. This is because the partitions have their own page cache
* while they share the block cache with the disk device. A solution
* would be to share the page cache between the disk device and all
* its partitions. But, due to the fact that partitions are not
* necessarily page-aligned in the disk, this would impose some pages
* to not correspond to a page-aligned offset inside a partition,
* requiring either to have an odd semantic of the mmap syscall (the
* allowed device "offset" would depend on the disk partitioning) if
* we want to share the mapped pages between the cache and userspace,
* or to allocate other pages for the required userspace mappings and
* keep them in sync with the page cache pages. Both solutions seem
* ugly to me, and not worth implementing since the page cache is
* aimed at being generic enough to be used for file mappings: files
* don't have sub-files (as do disk devices that have partitions). So
* solving the problem is non pertinent for files. And who will ever
* need /dev/hda mappings to be consistent with those of /dev/hda1 ?...
*/
#include <sos/errno.h>
#include <sos/uaccess.h>
/** Opaque structure holding a page cache (one per file / block device) */
struct sos_fs_pagecache;
/** Opaque structure holding one page of a page cache */
struct sos_fs_pagecache_entry;
/**
 * Initialize the FS page cache subsystem. Expected to be called once
 * at kernel startup, before any other sos_fs_pagecache_* function
 * (NOTE(review): standard SOS subsystem convention — confirm in
 * fs_pagecache.c)
 */
sos_ret_t sos_fs_pagecache_subsystem_setup(void);
/**
 * Function called to flush the dirty pages to backing store
 *
 * @param offset Offset of the dirty page inside the file/device
 * @param dirty_page Kernel virtual address of the page to be flushed
 * @param custom_data The sync_fct_custom_data cookie registered at
 *   sos_fs_pagecache_new_cache time, passed back verbatim
 */
typedef sos_ret_t
(*sos_fs_pagecache_sync_function_t)(sos_luoffset_t offset,
sos_vaddr_t dirty_page,
void * custom_data);
/**
 * Create a new pagecache.
 *
 * @param sync_fct The function used to flush the dirty pages to
 *   backing store. May be NULL
 * @param sync_fct_custom_data Cookie passed back verbatim to each
 *   sync_fct invocation
 * @return The new page cache, presumably NULL on failure — confirm
 *   in fs_pagecache.c
 */
struct sos_fs_pagecache *
sos_fs_pagecache_new_cache(sos_fs_pagecache_sync_function_t sync_fct,
void * sync_fct_custom_data);
/**
 * Delete the page cache.
 *
 * The page cache is expected to be already flushed to backing store
 * (see sos_fs_pagecache_sync)
 *
 * @param pc The page cache to delete; must not be used afterwards
 */
sos_ret_t
sos_fs_pagecache_delete_cache(struct sos_fs_pagecache * pc);
/**
 * Read from the given offset from the cache, if present.
 *
 * @param pc The page cache to read from
 * @param offset Offset inside the file/device where the read starts
 * @param dest_buf Destination buffer (generic address: user or kernel)
 * @param len In/out: byte count of the transfer (in: requested,
 *   out: actually transferred — NOTE(review): inferred from the
 *   in/out annotation; confirm exact semantics in fs_pagecache.c)
 * @return ENOENT when no page for the given offset is mapped, return
 * EFAULT when the contents could not be completely copied to
 * destination buffer
 */
sos_ret_t
sos_fs_pagecache_read(struct sos_fs_pagecache * pc,
sos_luoffset_t offset,
sos_genaddr_t dest_buf,
sos_size_t * /* in/out */len);
/**
 * Write at the given offset from the cache, if present
 *
 * @param pc The page cache to write through
 * @param offset Offset inside the file/device where the write starts
 * @param src_buf Source buffer (generic address: user or kernel)
 * @param len In/out: byte count of the transfer (in: requested,
 *   out: actually transferred — NOTE(review): inferred from the
 *   in/out annotation; confirm exact semantics in fs_pagecache.c)
 * @param synchronous_write When TRUE, the modified page is flushed to
 *   backing store before returning (presumably via the cache's
 *   sync_fct — confirm in fs_pagecache.c)
 * @return ENOENT when no page for the given offset is mapped, return
 * EFAULT when the contents could not be completely copied from
 * source buffer
 */
sos_ret_t
sos_fs_pagecache_write(struct sos_fs_pagecache * pc,
sos_luoffset_t offset,
sos_genaddr_t src_buf,
sos_size_t * /* in/out */len,
sos_bool_t synchronous_write);
/**
 * Function reserved to blkdev.c and FS code: used by the msync
 * callback to mark a pagecache page dirty
 *
 * @param pc The page cache owning the page
 * @param offset Offset of the page to mark dirty
 * @param sync_backing_store When TRUE, then the page must be flushed
 * to backing store.
 */
sos_ret_t sos_fs_pagecache_set_dirty(struct sos_fs_pagecache * pc,
sos_luoffset_t offset,
sos_bool_t sync_backing_store);
/**
 * Prepare a page to be mapped: get a NEW reference to the page
 * (kernel address) of the page to be mapped, which is also locked in
 * order to be used. If the page is not yet present in the cache,
 * allocate it and prepare it to be filled
 *
 * @param pc The page cache to look up / allocate into
 * @param offset MUST be page-aligned
 * @param kernel_vaddr Out: kernel virtual address of the (locked)
 *   cache page
 * @param newly_allocated TRUE when the page was not already mapped by
 * someone: the contents of the page is then IRRELEVANT
 *
 * @return NULL on error
 *
 * @note The page is also LOCKED, use unlock to unlock it before
 * unreferencing it
 */
struct sos_fs_pagecache_entry *
sos_fs_pagecache_ref_page(struct sos_fs_pagecache * pc,
sos_luoffset_t offset,
sos_vaddr_t * /* out */ kernel_vaddr,
sos_bool_t * /* out */ newly_allocated);
/**
 * Called by the blkdev.c and FS page_in callback to unlock the entry
 * after it has been initialized.
 *
 * @param pc The page cache owning the entry
 * @param entry The locked entry returned by sos_fs_pagecache_ref_page
 * @param initial_fill_aborted TRUE when the initial filling of the
 *   page failed (presumably the entry is then discarded — confirm in
 *   fs_pagecache.c)
 */
sos_ret_t
sos_fs_pagecache_unlock_page(struct sos_fs_pagecache * pc,
struct sos_fs_pagecache_entry * entry,
sos_bool_t initial_fill_aborted);
/**
 * Called when the page is unmapped from a user process space
 * (drops the reference taken by sos_fs_pagecache_ref_page)
 *
 * @param pc The page cache owning the page
 * @param offset MUST be page-aligned
 *
 * @note the page is expected to be present in the cache
 * @note the entry is expected NOT to be locked !
 */
sos_ret_t
sos_fs_pagecache_unref_page(struct sos_fs_pagecache * pc,
sos_luoffset_t offset);
/**
 * Call the sync function on each dirty page
 *
 * @param pc The page cache to flush to backing store
 */
sos_ret_t
sos_fs_pagecache_sync(struct sos_fs_pagecache * pc);
#endif