--- linux/fs/buffer.c.~2.2 1995/06/03 01:36:25 +++ linux/fs/buffer.c 1995/06/03 04:12:00 @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -43,6 +44,10 @@ static struct buffer_head ** hash_table; struct buffer_head ** buffer_pages; static struct buffer_head * lru_list[NR_LIST] = {NULL, }; +/* next_to_age is an array of pointers into the lru lists, used to + cycle through the buffers aging their contents when deciding which + buffers to discard when more memory is needed */ +static struct buffer_head * next_to_age[NR_LIST] = {NULL, }; static struct buffer_head * free_list[NR_SIZES] = {NULL, }; static struct buffer_head * unused_list = NULL; static struct wait_queue * buffer_wait = NULL; @@ -297,8 +302,13 @@ if (lru_list[bh->b_list] == bh) lru_list[bh->b_list] = bh->b_next_free; - if(lru_list[bh->b_list] == bh) + if (lru_list[bh->b_list] == bh) lru_list[bh->b_list] = NULL; + if (next_to_age[bh->b_list] == bh) + next_to_age[bh->b_list] = bh->b_next_free; + if (next_to_age[bh->b_list] == bh) + next_to_age[bh->b_list] = NULL; + bh->b_next_free = bh->b_prev_free = NULL; } @@ -341,6 +351,8 @@ return; if (bh == lru_list[bh->b_list]) { lru_list[bh->b_list] = bh->b_next_free; + if (next_to_age[bh->b_list] == bh) + next_to_age[bh->b_list] = bh->b_next_free; return; } if(bh->b_dev == 0xffff) panic("Wrong block for lru list"); @@ -351,6 +363,8 @@ lru_list[bh->b_list] = bh; lru_list[bh->b_list]->b_prev_free = bh; }; + if (!next_to_age[bh->b_list]) + next_to_age[bh->b_list] = bh; bh->b_next_free = lru_list[bh->b_list]; bh->b_prev_free = lru_list[bh->b_list]->b_prev_free; @@ -384,14 +398,16 @@ { /* put at end of free list */ - if(bh->b_dev == 0xffff) { + if (bh->b_dev == 0xffff) { put_last_free(bh); return; }; - if(!lru_list[bh->b_list]) { + if (!lru_list[bh->b_list]) { lru_list[bh->b_list] = bh; bh->b_prev_free = bh; }; + if (!next_to_age[bh->b_list]) + next_to_age[bh->b_list] = bh; if (bh->b_next_free) panic("VFS: buffer LRU pointers corrupted"); 
bh->b_next_free = lru_list[bh->b_list]; bh->b_prev_free = lru_list[bh->b_list]->b_prev_free; @@ -662,7 +678,7 @@ /* Too bad, that was not enough. Try a little harder to grow some. */ - if (nr_free_pages > 5) { + if (nr_free_pages > min_free_pages + 5) { if (grow_buffers(GFP_BUFFER, size)) { needed -= PAGE_SIZE; goto repeat0; @@ -703,6 +719,7 @@ if (bh->b_uptodate && !bh->b_dirt) put_last_lru(bh); if(!bh->b_dirt) bh->b_flushtime = 0; + bh->b_touched = 1; return bh; } @@ -722,6 +739,8 @@ bh->b_uptodate=0; bh->b_flushtime = 0; bh->b_req=0; + bh->b_touched = 1; + bh->b_has_aged = 0; bh->b_dev=dev; bh->b_blocknr=block; insert_into_queues(bh); @@ -1220,7 +1239,8 @@ * try_to_free() checks if all the buffers on this particular page * are unused, and free's the page if so. */ -static int try_to_free(struct buffer_head * bh, struct buffer_head ** bhp) +static int try_to_free(struct buffer_head * bh, struct buffer_head ** bhp, + int priority) { unsigned long page; struct buffer_head * tmp, * p; @@ -1235,6 +1255,8 @@ return 0; if (tmp->b_count || tmp->b_dirt || tmp->b_lock || tmp->b_wait) return 0; + if (priority && tmp->b_touched) + return 0; tmp = tmp->b_this_page; } while (tmp != bh); tmp = bh; @@ -1259,6 +1281,39 @@ } +/* Age buffers on a given page, according to whether they have been + visited recently or not. */ +static inline void age_buffer(struct buffer_head *bh) +{ + struct buffer_head *tmp = bh; + int touched = 0; + + /* + * When we age a page, we mark all other buffers in the page + * with the "has_aged" flag. Then, when these aliased buffers + * come up for aging, we skip them until next pass. This + * ensures that a page full of multiple buffers only gets aged + * once per pass through the lru lists. 
+ */ + if (bh->b_has_aged) { + bh->b_has_aged = 0; + return; + } + + do { + touched |= tmp->b_touched; + tmp->b_touched = 0; + tmp = tmp->b_this_page; + tmp->b_has_aged = 1; + } while (tmp != bh); + bh->b_has_aged = 0; + + if (touched) + touch_page((unsigned long) bh->b_data); + else + age_page((unsigned long) bh->b_data); +} + /* * Consult the load average for buffers and decide whether or not * we should shrink the buffers of one size or not. If we decide yes, @@ -1308,11 +1363,12 @@ } return 0; } + /* * Try to free up some pages by shrinking the buffer-cache * * Priority tells the routine how hard to try to shrink the - * buffers: 3 means "don't bother too much", while a value + * buffers: 6 means "don't bother too much", while a value * of 0 means "we'd better get some free pages now". */ int shrink_buffers(unsigned int priority) @@ -1349,10 +1405,12 @@ for (i=0 ; !i || bh != free_list[isize]; bh = bh->b_next_free, i++) { if (bh->b_count || !bh->b_this_page) continue; - if (try_to_free(bh, &bh)) + if (!age_of((unsigned long) bh->b_data) && + try_to_free(bh, &bh, 6)) return 1; - if(!bh) break; /* Some interrupt must have used it after we - freed the page. No big deal - keep looking */ + if(!bh) break; + /* Some interrupt must have used it after we + freed the page. No big deal - keep looking */ } } @@ -1360,12 +1418,19 @@ for(nlist = 0; nlist < NR_LIST; nlist++) { repeat1: - if(priority > 3 && nlist == BUF_SHARED) continue; - bh = lru_list[nlist]; - if(!bh) continue; - i = 2*nr_buffers_type[nlist] >> priority; - for ( ; i-- > 0 ; bh = bh->b_next_free) { - /* We may have stalled while waiting for I/O to complete. */ + if(priority > 2 && nlist == BUF_SHARED) continue; + i = nr_buffers_type[nlist]; + i = ((BUFFEROUT_WEIGHT * i) >> 10) >> priority; + for ( ; i > 0; i-- ) { + bh = next_to_age[nlist]; + if (!bh) + break; + next_to_age[nlist] = bh->b_next_free; + + /* First, age the buffer. 
*/ + age_buffer(bh); + /* We may have stalled while waiting for I/O + to complete. */ if(bh->b_list != nlist) goto repeat1; if (bh->b_count || !bh->b_this_page) continue; @@ -1382,7 +1447,13 @@ bh->b_count--; continue; } - if (try_to_free(bh, &bh)) + /* At priority 6, only consider really old + (age==0) buffers for reclaiming. At + priority 0, consider any buffers. */ + if ((age_of((unsigned long) bh->b_data) >> + (6-priority)) > 0) + continue; + if (try_to_free(bh, &bh, 0)) return 1; if(!bh) break; } --- linux/init/main.c.~2.2 1995/06/03 01:36:25 +++ linux/init/main.c 1995/06/03 04:12:00 @@ -50,6 +50,8 @@ unsigned long net_dev_init(unsigned long, unsigned long); extern long bios32_init(long, long); +extern void swap_setup(char *str, int *ints); +extern void buff_setup(char *str, int *ints); extern void bmouse_setup(char *str, int *ints); extern void eth_setup(char *str, int *ints); extern void xd_setup(char *str, int *ints); @@ -132,6 +134,8 @@ } bootsetups[] = { { "reserve=", reserve_setup }, { "ramdisk=", ramdisk_setup }, + { "swap=", swap_setup }, + { "buff=", buff_setup }, #ifdef CONFIG_BUGi386 { "no-hlt", no_halt }, { "no387", no_387 }, --- linux/kernel/sys.c.~2.2 1995/06/03 01:36:25 +++ linux/kernel/sys.c 1995/06/03 04:12:00 @@ -734,6 +734,7 @@ r.ru_stime.tv_usec = CT_TO_USECS(p->stime); r.ru_minflt = p->mm->min_flt; r.ru_majflt = p->mm->maj_flt; + r.ru_nswap = p->mm->nswap; break; case RUSAGE_CHILDREN: r.ru_utime.tv_sec = CT_TO_SECS(p->cutime); @@ -742,6 +743,7 @@ r.ru_stime.tv_usec = CT_TO_USECS(p->cstime); r.ru_minflt = p->mm->cmin_flt; r.ru_majflt = p->mm->cmaj_flt; + r.ru_nswap = p->mm->cnswap; break; default: r.ru_utime.tv_sec = CT_TO_SECS(p->utime + p->cutime); @@ -750,6 +752,7 @@ r.ru_stime.tv_usec = CT_TO_USECS(p->stime + p->cstime); r.ru_minflt = p->mm->min_flt + p->mm->cmin_flt; r.ru_majflt = p->mm->maj_flt + p->mm->cmaj_flt; + r.ru_nswap = p->mm->nswap + p->mm->cnswap; break; } memcpy_tofs(ru, &r, sizeof(r)); --- linux/kernel/fork.c.~2.2 
1995/06/03 01:36:25 +++ linux/kernel/fork.c 1995/06/03 04:12:00 @@ -141,6 +141,7 @@ if (clone_flags & COPYVM) { p->mm->min_flt = p->mm->maj_flt = 0; p->mm->cmin_flt = p->mm->cmaj_flt = 0; + p->mm->nswap = p->mm->cnswap = 0; if (copy_page_tables(p)) return 1; return dup_mmap(p); --- linux/kernel/exit.c.~2.2 1995/06/03 01:36:25 +++ linux/kernel/exit.c 1995/06/03 04:12:00 @@ -527,6 +527,7 @@ current->cstime += p->stime + p->cstime; current->mm->cmin_flt += p->mm->min_flt + p->mm->cmin_flt; current->mm->cmaj_flt += p->mm->maj_flt + p->mm->cmaj_flt; + current->mm->cnswap += p->mm->nswap + p->mm->cnswap; if (ru != NULL) getrusage(p, RUSAGE_BOTH, ru); flag = p->pid; --- linux/kernel/Makefile.~2.0.2.2 1995/06/03 01:36:25 +++ linux/kernel/Makefile 1995/06/03 04:12:00 @@ -18,7 +18,7 @@ OBJS = sched.o dma.o fork.o exec_domain.o panic.o printk.o sys.o \ module.o exit.o signal.o itimer.o info.o time.o softirq.o \ - resource.o + resource.o sysctl.o SYMTAB_OBJS = ksyms.o --- linux/kernel/ksyms.c.~2.2 1995/06/03 01:36:25 +++ linux/kernel/ksyms.c 1995/06/03 04:12:00 @@ -33,6 +33,7 @@ #include #include #include +#include #ifdef CONFIG_NET #include @@ -202,6 +203,10 @@ X(lookup_exec_domain), X(register_exec_domain), X(unregister_exec_domain), + + /* sysctl table registration */ + X(register_sysctl_table), + X(unregister_sysctl_table), /* interrupt handling */ X(request_irq), --- linux/kernel/sysctl.c.~2.2 1995/06/03 01:36:25 +++ linux/kernel/sysctl.c 1995/06/03 04:12:00 @@ -0,0 +1,315 @@ +/* + * sysctl.c: General linux system control interface + * + * Begun 24 March 1995, Stephen Tweedie + */ + +#include +#include +#include +#include +#include + +/* For debugging... 
*/ +#define static + +static ctl_table root_table[]; +static struct ctl_table_header root_table_header = + {root_table, DNODE_SINGLE(&root_table_header)}; +DLIST(struct ctl_table_header) table_list = DLIST_SINGLE(&root_table_header); + +static ctl_handler no_handler; +#define do_ls_blockdevs no_handler +#define do_blockdev no_handler +#define do_ls_chardevs no_handler +#define do_chardev no_handler +#define do_ls_netdevs no_handler +#define do_netdev no_handler +#define do_ls_scsidevs no_handler +#define do_scsidev no_handler +#define do_ls_sounddevs no_handler +#define do_sounddev no_handler + +static ctl_handler do_kern; +static ctl_handler do_vm; +static ctl_handler do_kern_prof; + +static ctl_table kern_table[]; +static ctl_table vm_table[]; +static ctl_table kern_prof_table[]; + +static ctl_table root_table[] = { + {CTL_KERN, 0, kern_table}, + {CTL_VM, 0, vm_table}, + {0,0,0}, +}; + +static ctl_table kern_table[] = { + {KERN_PROF, 0, + ((ctl_table[]) { + {CTL_ANY, do_kern_prof, 0}, + })}, + {CTL_ANY, do_kern, 0}, +}; + +static ctl_table kern_prof_table[] = { + {CTL_ANY, do_kern_prof, 0}, +}; + +static ctl_table vm_table[] = { + {CTL_ANY, do_vm, 0}, +}; + +static ctl_table dev_table[] = { + {DEV_BLOCK, 0, + ((ctl_table[]) { + {DEV_LSDEVS, do_ls_blockdevs, 0}, + {CTL_ANY, do_blockdev, 0}, + })}, + {DEV_CHAR, 0, + ((ctl_table[]) { + {DEV_LSDEVS, do_ls_chardevs, 0}, + {CTL_ANY, do_chardev, 0}, + })}, + {DEV_NET, 0, + ((ctl_table[]) { + {DEV_LSDEVS, do_ls_netdevs, 0}, + {CTL_ANY, do_netdev, 0}, + })}, + {DEV_SCSI, 0, + ((ctl_table[]) { + {DEV_LSDEVS, do_ls_scsidevs, 0}, + {CTL_ANY, do_scsidev, 0}, + })}, + {DEV_SOUND, 0, + ((ctl_table[]) { + {DEV_LSDEVS, do_ls_sounddevs, 0}, + {CTL_ANY, do_sounddev, 0}, + })}, + {0, 0, 0}, +}; + +static int parse_table(int *name, int nlen, + void *oldval, size_t *oldlenp, + void *newval, size_t newlen, + ctl_table *table, ctl_context *context) +{ + int error; +repeat: + if (!nlen) + return -ENOTDIR; + + for ( ; table->ctl_name; 
table++) { + if (get_user_long(name) == table->ctl_name || + table->ctl_name == CTL_ANY) { + if (table->ctl_handler) { + error = table->ctl_handler(name, nlen, + oldval, oldlenp, + newval, newlen, + context); + if (error) + return error; + } + if (table->ctl_table) { + name++; + nlen--; + table = table->ctl_table; + goto repeat; + } + return 0; + } + }; + return -ENOTDIR; +} + + +int do_sysctl (int *name, int nlen, + void *oldval, size_t *oldlenp, + void *newval, size_t newlen) +{ + int error; + ctl_context context; + struct ctl_table_header *tmp; + + if (nlen == 0 || nlen >= CTL_MAXNAME) + return -ENOTDIR; + error = verify_area(VERIFY_READ,name,nlen*sizeof(int)); + if (error) return error; + if (oldval) { + if (!oldlenp) + return -EFAULT; + error = verify_area(VERIFY_WRITE,oldlenp,sizeof(size_t)); + if (error) return error; + error = verify_area(VERIFY_WRITE,oldval,get_user_long(oldlenp)); + if (error) return error; + } + if (newval) { + error = verify_area(VERIFY_READ,newval,newlen); + if (error) return error; + } + tmp = DLIST_FIRST(table_list); + do { + error = parse_table(name, nlen, oldval, oldlenp, + newval, newlen, + root_table, &context); + if (error != ENOTDIR) + return error; + tmp = tmp->DLIST_NEXT(ctl_entry); + } while (tmp != DLIST_FIRST(table_list)); + return -ENOTDIR; +} + +extern asmlinkage int sys_sysctl(int *buffer) +{ + int *name, nlen; + void *oldval, *newval; + size_t *oldlenp, newlen; + + int error; + error = verify_area(VERIFY_READ, buffer, 6 * sizeof(int)); + if (error) + return error; + name = (int *) get_user_long(buffer++); + nlen = (int) get_user_long(buffer++); + oldval = (void *) get_user_long(buffer++); + oldlenp = (size_t *) get_user_long(buffer++); + newval = (void *) get_user_long(buffer++); + newlen = (size_t) get_user_long(buffer++); + return do_sysctl(name, nlen, oldval, oldlenp, newval, newlen); +} + +struct ctl_table_header *register_sysctl_table(ctl_table * table, + int insert_at_head) +{ + struct ctl_table_header *tmp; + 
tmp = kmalloc(sizeof(*tmp), GFP_KERNEL); + if (!tmp) + return 0; + *tmp = ((struct ctl_table_header) {table, DLIST_NULL}); + if (insert_at_head) + DLIST_INSERT_HEAD(table_list, tmp, ctl_entry); + else + DLIST_INSERT_TAIL(table_list, tmp, ctl_entry); + return tmp; +} + +void unregister_sysctl_table(struct ctl_table_header * table) /* unlink a header obtained from register_sysctl_table() and release it */ +{ + DLIST_DELETE(table_list, table, ctl_entry); kfree(table); /* header was kmalloc'ed at registration; free it here or it leaks */ +} + +int no_handler (int *name, int nlen, void *oldval, size_t *oldlenp, + void *newval, size_t newlen, ctl_context *context) +{ + return -ENOSYS; +} + +int do_kern (int *name, int nlen, void *oldval, size_t *oldlenp, + void *newval, size_t newlen, ctl_context *context) +{ + int id = get_user_long(name++); + if (!id || id >= KERN_MAXID) + return -ENOTDIR; + switch (id) { + default: + return -ENOSYS; + } +} + +int do_kern_prof (int *name, int nlen, void *oldval, size_t *oldlenp, + void *newval, size_t newlen, ctl_context *context) +{ + return -ENOSYS; +} + +int do_vm (int *name, int nlen, void *oldval, size_t *oldlenp, + void *newval, size_t newlen, ctl_context *context) +{ + int id = get_user_long(name++); + if (!id || id >= VM_MAXID) + return -ENOTDIR; + /* First deal with names longer than 1 field */ + switch (id) { + case VM_SWAPCTL: /* from include/linux/swapctl.h */ + if (nlen != 2) + return -ENOTDIR; + switch (get_user_long(name++)) { + case SC_VERSION: + return do_struct(oldval, oldlenp, newval, newlen, + suser(), (void *) &swap_control, + sizeof(swap_control)); + default: + return -ENOTDIR; + } + default: + ; + } + /* OK, other names should be 1 field exactly: */ + if (nlen != 1) + return -ENOTDIR; + switch (id) { + default: + return -ENOTDIR; + } +} + +/* Support routines */ +int do_string ( + void *oldval, size_t *oldlenp, void *newval, size_t newlen, + int rdwr, char *data, size_t max) +{ + int l = strlen(data) + 1; + if (newval && !rdwr) + return -EPERM; + if (newval && newlen >= max) + return -EINVAL; + if (oldval) { + if (l > get_user_long(oldlenp)) + return -ENOMEM; +
put_user_long(l, oldlenp); + memcpy_tofs(oldval, data, l); + } + if (newval) { + memcpy_fromfs(data, newval, newlen); + data[newlen] = 0; + } + return 0; +} + +int do_int ( + void *oldval, size_t *oldlenp, void *newval, size_t newlen, + int rdwr, int *data) +{ + if (newval && !rdwr) + return -EPERM; + if (newval && newlen != sizeof(int)) + return -EINVAL; + if (oldval) { + if (get_user_long(oldlenp) < sizeof(int)) + return -ENOMEM; + put_user_long(sizeof(int), oldlenp); + memcpy_tofs(oldval, data, sizeof(int)); + } + if (newval) + memcpy_fromfs(data, newval, sizeof(int)); + return 0; +} + +int do_struct ( + void *oldval, size_t *oldlenp, void *newval, size_t newlen, + int rdwr, void *data, size_t len) +{ + if (newval && !rdwr) + return -EPERM; + if (newval && newlen != len) + return -EINVAL; + if (oldval) { + if (get_user_long(oldlenp) < len) + return -ENOMEM; + put_user_long(len, oldlenp); + memcpy_tofs(oldval, data, len); + } + if (newval) + memcpy_fromfs(data, newval, len); + return 0; +} --- linux/mm/swap.c.~2.2 1995/06/03 01:36:25 +++ linux/mm/swap.c 1995/06/03 04:12:10 @@ -7,6 +7,8 @@ /* * This file should contain most things doing the swapping from/to disk. * Started 18.12.91 + * + * Swap aging added 23.2.95, Stephen Tweedie. */ #include @@ -18,9 +20,11 @@ #include #include #include +#include #include #include /* for cli()/sti() */ +#include /* for memcpy_to/fromfs */ #include #include @@ -35,6 +39,22 @@ int min_free_pages = 20; +/* + * Constants for the page aging mechanism: the maximum age (actually, + * the maximum "youthfulness"); the quanta by which pages rejuvinate + * and age; and the initial age for new pages. 
+ */ + +swap_control_t swap_control = { + 20, 3, 1, 3, /* Page aging */ + 10, 2, 2, 0, /* Buffer aging */ + 32, 4, /* Aging cluster */ + 8192, 4096, /* Pageout and bufferout weights */ + -200, /* Buffer grace */ + 1, 1, /* Buffs/pages to free */ + RCL_ROUND_ROBIN /* Balancing policy */ +}; + static int nr_swapfiles = 0; static struct wait_queue * lock_queue = NULL; @@ -53,6 +73,7 @@ extern int shm_swap (int); unsigned long *swap_cache; +unsigned char *age_map; #ifdef SWAP_CACHE_INFO unsigned long swap_cache_add_total = 0; @@ -103,6 +124,62 @@ return (unsigned long) (swap_cache + swap_cache_size); } +/* General swap control */ + +/* Parse the kernel command line "swap=" option at load time: */ +void swap_setup(char *str, int *ints) +{ + int * swap_vars[8] = { + &MAX_PAGE_AGE, + &PAGE_ADVANCE, + &PAGE_DECLINE, + &PAGE_INITIAL_AGE, + &AGE_CLUSTER_FRACT, + &AGE_CLUSTER_MIN, + &PAGEOUT_WEIGHT, + &BUFFEROUT_WEIGHT + }; + int i; + for (i=0; i < ints[0] && i < 8; i++) { + if (ints[i+1]) + *(swap_vars[i]) = ints[i+1]; + } +} + +/* Parse the kernel command line "buff=" option at load time: */ +void buff_setup(char *str, int *ints) +{ + int * buff_vars[6] = { + &MAX_BUFF_AGE, + &BUFF_ADVANCE, + &BUFF_DECLINE, + &BUFF_INITIAL_AGE, + &BUFFEROUT_WEIGHT, + &BUFFERMEM_GRACE + }; + int i; + for (i=0; i < ints[0] && i < 6; i++) { + if (ints[i+1]) + *(buff_vars[i]) = ints[i+1]; + } +} + +/* Page aging */ + +static unsigned long init_age_map(unsigned long mem_start, + unsigned long mem_end) +{ + unsigned long age_map_size; + + printk ("kswap 2.2.1.3 (Exp 1995/06/03 04:10:43)\n"); + + mem_start = (mem_start + 15) & ~15; + age_map = (unsigned char *) mem_start; + age_map_size = MAP_NR(mem_end) ; + memset(age_map, 0, age_map_size); + return (unsigned long) (age_map + age_map_size); +} + void rw_swap_page(int rw, unsigned long entry, char * buf) { unsigned long type, offset; @@ -332,10 +409,18 @@ return 0; if (mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED) return 0; - if ((pte_dirty(pte) && 
delete_from_swap_cache(page)) || pte_young(pte)) { + /* Deal with page aging. Pages age from being unused; they + * rejuvinate on being accessed. Only swap old pages (age==0 + * is oldest). */ + if ((pte_dirty(pte) && delete_from_swap_cache(page)) + || pte_young(pte)) { *page_table = pte_mkold(pte); + touch_page(page); + return 0; + } + age_page(page); + if (age_of(page)) return 0; - } if (pte_dirty(pte)) { if (mem_map[MAP_NR(page)] != 1) return 0; @@ -348,6 +433,7 @@ vma->vm_task->mm->rss--; pte_val(*page_table) = entry; invalidate(); + vma->vm_task->mm->nswap++; write_swap_page(entry, (char *) page); } free_page(page); @@ -387,19 +473,6 @@ * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de */ -/* - * These are the minimum and maximum number of pages to swap from one process, - * before proceeding to the next: - */ -#define SWAP_MIN 4 -#define SWAP_MAX 32 - -/* - * The actual number of pages to swap is determined as: - * SWAP_RATIO / (number of recent major page faults) - */ -#define SWAP_RATIO 128 - static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end) { @@ -472,6 +545,11 @@ if (vma->vm_flags & VM_SHM) return 0; + /* Don't swap out areas like shared memory which have their + own separate swapping mechanism. */ + if (vma->vm_flags & VM_DONTSWAP) + return 0; + end = vma->vm_end; while (start < end) { int result = swap_out_pgd(vma, pgdir, start, end); @@ -489,8 +567,7 @@ struct vm_area_struct* vma; /* - * Go through process' page directory. - */ + * Go through process' page directory. */ address = p->mm->swap_address; p->mm->swap_address = 0; @@ -522,7 +599,7 @@ int loop, counter; struct task_struct *p; - counter = 6*nr_tasks >> priority; + counter = ((PAGEOUT_WEIGHT * nr_tasks) >> 10) >> priority; for(; counter >= 0; counter--) { /* * Check that swap_task is suitable for swapping. If not, look for @@ -549,16 +626,9 @@ * Determine the number of pages to swap from this process. 
*/ if (!p->mm->swap_cnt) { - p->mm->dec_flt = (p->mm->dec_flt * 3) / 4 + p->mm->maj_flt - p->mm->old_maj_flt; - p->mm->old_maj_flt = p->mm->maj_flt; - - if (p->mm->dec_flt >= SWAP_RATIO / SWAP_MIN) { - p->mm->dec_flt = SWAP_RATIO / SWAP_MIN; - p->mm->swap_cnt = SWAP_MIN; - } else if (p->mm->dec_flt <= SWAP_RATIO / SWAP_MAX) - p->mm->swap_cnt = SWAP_MAX; - else - p->mm->swap_cnt = SWAP_RATIO / p->mm->dec_flt; + /* Normalise the number of pages swapped by + multiplying by (RSS / 1MB) */ + p->mm->swap_cnt = AGE_CLUSTER_SIZE(p->mm->rss); } if (!--p->mm->swap_cnt) swap_task++; @@ -577,13 +647,9 @@ } /* - * we keep on shrinking one resource until it's considered "too hard", - * and then switch to the next one (priority being an indication on how - * hard we should try with the resource). - * - * This should automatically find the resource that can most easily be - * free'd, so hopefully we'll get reasonable behaviour even under very - * different circumstances. + * We are much more aggressive about trying to swap out than we used + * to be. This works out OK, because we now do proper aging on page + * contents. 
*/ static int try_to_free_page(int priority) { @@ -732,6 +798,7 @@ restore_flags(flags); \ addr = (struct mem_list *) (size + (unsigned long) addr); \ } mem_map[MAP_NR((unsigned long) addr)] = 1; \ + age_map[MAP_NR((unsigned long) addr)] = PAGE_INITIAL_AGE; \ } while (0) unsigned long __get_free_pages(int priority, unsigned long order) @@ -1213,6 +1280,7 @@ i = 16; min_free_pages = i; start_mem = init_swap_cache(start_mem, end_mem); + start_mem = init_age_map(start_mem, end_mem); mem_map = (mem_map_t *) start_mem; p = mem_map + MAP_NR(end_mem); start_mem = (unsigned long) p; --- linux/include/linux/fs.h.~2.2 1995/06/03 01:36:25 +++ linux/include/linux/fs.h 1995/06/03 04:12:02 @@ -137,12 +137,21 @@ unsigned char b_uptodate; unsigned char b_dirt; /* 0-clean,1-dirty */ unsigned char b_lock; /* 0 - ok, 1 -locked */ - unsigned char b_req; /* 0 if the buffer has been invalidated */ + unsigned char b_req; /* 0 if the buffer has been + * invalidated */ unsigned char b_list; /* List that this buffer appears */ unsigned char b_retain; /* Expected number of times this will - be used. Put on freelist when 0 */ - unsigned long b_flushtime; /* Time when this (dirty) buffer should be written */ - unsigned long b_lru_time; /* Time when this buffer was last used. */ + * be used. Put on freelist when 0 */ + unsigned char b_touched:1; /* True if the buffer has been + * accessed since it was last aged */ + unsigned char b_has_aged:1; /* True if the buffer has aged + * (by alias to another buffer + * on the same page) since it + * was last scanned for aging */ + unsigned long b_flushtime; /* Time when this (dirty) + * buffer should be written */ + unsigned long b_lru_time; /* Time when this buffer was + * last used. 
*/ struct wait_queue * b_wait; struct buffer_head * b_prev; /* doubly linked list of hash-queue */ struct buffer_head * b_next; --- linux/include/linux/mm.h.~2.2 1995/06/03 01:36:25 +++ linux/include/linux/mm.h 1995/06/03 04:12:02 @@ -75,6 +75,8 @@ #define VM_DENYWRITE 0x0800 /* ETXTBSY on write attempts.. */ #define VM_EXECUTABLE 0x1000 +#define VM_DONTSWAP 0x2000 /* Some vm types have their own + * hard-coded swap mechanism */ #define VM_STACK_FLAGS 0x0177 @@ -106,6 +108,7 @@ }; extern mem_map_t * mem_map; +extern unsigned char *age_map; /* planning stage.. */ #define P_DIRTY 0x0001 --- linux/include/linux/unistd.h.~2.2 1995/06/03 01:36:25 +++ linux/include/linux/unistd.h 1995/06/03 04:12:02 @@ -146,6 +146,7 @@ #define __NR_setfsuid 138 #define __NR_setfsgid 139 #define __NR__llseek 140 +#define __NR_sysctl 141 extern int errno; --- linux/include/linux/sched.h.~2.2 1995/06/03 01:36:25 +++ linux/include/linux/sched.h 1995/06/03 04:12:02 @@ -117,11 +117,9 @@ unsigned long start_brk, brk, start_stack, start_mmap; unsigned long arg_start, arg_end, env_start, env_end; unsigned long rss; - unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; + unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap; int swappable:1; unsigned long swap_address; - unsigned long old_maj_flt; /* old value of maj_flt */ - unsigned long dec_flt; /* page fault count of the last time */ unsigned long swap_cnt; /* number of pages to swap on next pass */ struct vm_area_struct * mmap; struct vm_area_struct * mmap_avl; @@ -131,11 +129,11 @@ 0, \ 0, 0, 0, \ 0, 0, 0, 0, \ - 0, 0, 0, 0, \ + 0, 0, 0, 0, 0, 0, \ 0, \ /* ?_flt */ 0, 0, 0, 0, \ 0, \ -/* swap */ 0, 0, 0, 0, \ +/* swap */ 0, 0, \ &init_mmap, &init_mmap } struct task_struct { --- linux/include/linux/swapctl.h.~2.2 1995/06/03 01:36:25 +++ linux/include/linux/swapctl.h 1995/06/03 04:12:02 @@ -0,0 +1,112 @@ +#ifndef _LINUX_SWAPCTL_H +#define _LINUX_SWAPCTL_H + +#include +#include + +/* Swap tuning control */ + +/* First, enumerate the 
different reclaim policies */ +enum RCL_POLICY {RCL_ROUND_ROBIN, RCL_BUFF_FIRST, RCL_PERSIST}; + +typedef struct swap_control_v5 +{ + int sc_max_page_age; + int sc_page_advance; + int sc_page_decline; + int sc_page_initial_age; + int sc_max_buff_age; + int sc_buff_advance; + int sc_buff_decline; + int sc_buff_initial_age; + int sc_age_cluster_fract; + int sc_age_cluster_min; + int sc_pageout_weight; + int sc_bufferout_weight; + int sc_buffer_grace; + int sc_nr_buffs_to_free; + int sc_nr_pages_to_free; + enum RCL_POLICY sc_policy; +} swap_control_v5; + +typedef struct swap_control_v5 swap_control_t; + +extern swap_control_t swap_control; + +#define SC_VERSION 1 +#define SC_MAX_VERSION 1 + +#ifdef __KERNEL__ + +/* Define the maximum (least urgent) priority for the page reclaim code */ +#define RCL_MAXPRI 6 +/* We use an extra priority in the swap accounting code to represent + failure to free a resource at any priority */ +#define RCL_FAILURE (RCL_MAXPRI + 1) + +#define RCL_POLICY (swap_control.sc_policy) +#define AGE_CLUSTER_FRACT (swap_control.sc_age_cluster_fract) +#define AGE_CLUSTER_MIN (swap_control.sc_age_cluster_min) +#define PAGEOUT_WEIGHT (swap_control.sc_pageout_weight) +#define BUFFEROUT_WEIGHT (swap_control.sc_bufferout_weight) + +#define NR_BUFFS_TO_FREE (swap_control.sc_nr_buffs_to_free) +#define NR_PAGES_TO_FREE (swap_control.sc_nr_pages_to_free) + +#define BUFFERMEM_GRACE (swap_control.sc_buffer_grace) + +/* Page aging (see mm/swap.c) */ + +#define MAX_PAGE_AGE (swap_control.sc_max_page_age) +#define PAGE_ADVANCE (swap_control.sc_page_advance) +#define PAGE_DECLINE (swap_control.sc_page_decline) +#define PAGE_INITIAL_AGE (swap_control.sc_page_initial_age) + +#define MAX_BUFF_AGE (swap_control.sc_max_buff_age) +#define BUFF_ADVANCE (swap_control.sc_buff_advance) +#define BUFF_DECLINE (swap_control.sc_buff_decline) +#define BUFF_INITIAL_AGE (swap_control.sc_buff_initial_age) + +/* Given a resource of N units (pages or buffers etc), we only try to + * 
age and reclaim AGE_CLUSTER_FRACT per 1024 resources each time we + * scan the resource list. */ +static inline int AGE_CLUSTER_SIZE(int resources) +{ + int n = (resources * AGE_CLUSTER_FRACT) >> 10; /* resources * fract / 1024 */ + if (n < AGE_CLUSTER_MIN) /* never return less than the configured minimum */ + return AGE_CLUSTER_MIN; + else + return n; +} + +static inline void touch_page(unsigned long addr) /* raise the page's age by PAGE_ADVANCE, capped at MAX_PAGE_AGE */ +{ + unsigned char *p = age_map + MAP_NR(addr); + if (*p < (MAX_PAGE_AGE - PAGE_ADVANCE)) + *p += PAGE_ADVANCE; + else + *p = MAX_PAGE_AGE; +} + +static inline void age_page(unsigned long addr) /* lower the page's age by PAGE_DECLINE, never below 0 */ +{ + unsigned char *p = age_map + MAP_NR(addr); + if (*p > PAGE_DECLINE) + *p -= PAGE_DECLINE; + else + *p = 0; +} + +static inline int age_of(unsigned long addr) /* current age_map entry for the page containing addr */ +{ + return age_map[MAP_NR(addr)]; +} + +static inline void set_page_new(unsigned long addr) /* reset the page's age to PAGE_INITIAL_AGE */ +{ + age_map[MAP_NR(addr)] = PAGE_INITIAL_AGE; +} + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_SWAPCTL_H */ --- linux/include/linux/sysctl.h.~2.2 1995/06/03 01:36:25 +++ linux/include/linux/sysctl.h 1995/06/03 04:12:02 @@ -0,0 +1,120 @@ +/* + * sysctl.h: General linux system control interface + * + * Begun 24 March 1995, Stephen Tweedie + */ + +#include + +#ifndef _LINUX_SYSCTL_H +#define _LINUX_SYSCTL_H + +#define CTL_MAXNAME 10 + +/* Define sysctl names first */ + +/* Top-level names: */ +#ifdef __KERNEL__ +#define CTL_ANY -1 /* Matches any name */ +#define CTL_NONE 0 +#endif +#define CTL_KERN 1 /* General kernel info and control */ +#define CTL_VM 2 /* VM management */ +#define CTL_NET 3 /* Networking */ +#define CTL_PROC 4 /* Process info */ +#define CTL_FS 5 /* Filesystems */ +#define CTL_DEBUG 6 /* Debugging */ +#define CTL_DEV 7 /* Devices */ +#define CTL_MAXID 8 + +/* CTL_KERN names: */ +#define KERN_OSTYTPE 1 /* string: system version; NOTE(review): name is probably a typo for KERN_OSTYPE */ +#define KERN_OSRELEASE 2 /* string: system release */ +#define KERN_OSREV 3 /* int: system revision */ +#define KERN_VERSION 4 /* string: compile time info */ +#define KERN_SECUREMASK 5 /* struct: maximum rights mask */ +#define KERN_PROF 6 /* table:
profiling information */ +#define KERN_MAXID 7 + +/* CTL_VM names: */ +#define VM_SWAPCTL 1 /* struct: Set vm swapping control */ +#define VM_MAXID 2 + +/* CTL_NET names: */ + +/* CTL_PROC names: */ + +/* CTL_FS names: */ + +/* CTL_DEBUG names: */ + +/* CTL_DEV names: */ + +#define DEV_BLOCK 1 +#define DEV_CHAR 2 +#define DEV_NET 3 +#define DEV_SCSI 4 +#define DEV_SOUND 5 +#define DEV_MAXID 6 +/* Sub-name of CTL_DEV: */ +#define DEV_LSDEVS -1 + +#ifdef __KERNEL__ + +extern asmlinkage int sys_sysctl(int *buffer); + +typedef struct ctl_table ctl_table; +typedef struct ctl_context ctl_context; +typedef int ctl_handler (int *, int, void *, size_t *, void *, size_t, + ctl_context *); + +extern int do_sysctl (int *name, int nlen, + void *oldval, size_t *oldlenp, + void *newval, size_t newlen); + +extern int do_string ( + void *oldval, size_t *oldlenp, void *newval, size_t newlen, + int rdwr, char *data, size_t max); +extern int do_int ( + void *oldval, size_t *oldlenp, void *newval, size_t newlen, + int rdwr, int *data); +extern int do_struct ( + void *oldval, size_t *oldlenp, void *newval, size_t newlen, + int rdwr, void *data, size_t len); + + +/* + * ctl_context is used in the parsing of sysctl names to store data + * indexed by previous components of the name. 
+ */ + +struct ctl_context +{ + union + { + struct task_struct *ctl_proc; + } u; +}; + +struct ctl_table +{ + int ctl_name; + ctl_handler * ctl_handler; + ctl_table * ctl_table; +}; + +struct ctl_table_header +{ + ctl_table *ctl_table; + DLNODE(struct ctl_table_header) ctl_entry; +}; + +struct ctl_table_header * register_sysctl_table(ctl_table * table, + int insert_at_head); +void unregister_sysctl_table(struct ctl_table_header * table); + +#else /* __KERNEL__ */ + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_SYSCTL_H */ --- linux/include/linux/lists.h.~2.2 1995/06/03 01:36:25 +++ linux/include/linux/lists.h 1995/06/03 04:12:02 @@ -0,0 +1,86 @@ +/* + * lists.h: Simple list macros for Linux + */ + +#define DLIST(ptype) \ + struct { \ + ptype *dl_first; \ + ptype *dl_last; \ + } + +#define DLIST_NULL {0,0} + +#define DLNODE(ptype) \ + struct { \ + ptype * dl_prev; \ + ptype * dl_next; \ + } + +#define DNODE_SINGLE(node) {(node),(node)} +#define DLIST_SINGLE(node) {(node),(node)} + +#define DLIST_INIT(listnam) \ + (listnam).dl_first = 0; \ + (listnam).dl_last = 0; + +#define DLIST_NEXT(listnam) listnam.dl_next +#define DLIST_PREV(listnam) listnam.dl_prev +#define DLIST_FIRST(head) (head).dl_first +#define DLIST_LAST(head) (head).dl_last + +/* Insert new at the front of the circular list: link it between the old last and old first node, then make it the head. */ +#define DLIST_INSERT_HEAD(head, new, listnam) do { \ + if ((head).dl_first) { \ + (new)->listnam.dl_prev = (head).dl_last; \ + (new)->listnam.dl_next = (head).dl_first; \ + (head).dl_first->listnam.dl_prev = new; \ + (head).dl_last->listnam.dl_next = (new); /* keep the ring closed */ \ + (head).dl_first = (new); /* new node becomes the first */ \ + } else { \ + (head).dl_first = (head).dl_last = new; \ + (new)->listnam.dl_prev = \ + (new)->listnam.dl_next = new; \ + }} while (0) + +/* Insert new at the back of the circular list: link it between the old last and old first node, then make it the tail. */ +#define DLIST_INSERT_TAIL(head, new, listnam) do { \ + if ((head).dl_first) { \ + (new)->listnam.dl_prev = (head).dl_last; \ + (new)->listnam.dl_next = (head).dl_first; \ + (head).dl_last->listnam.dl_next = new; \ + (head).dl_first->listnam.dl_prev = (new); /* keep the ring closed */ \ + (head).dl_last = (new); /* new node becomes the last */ \ + } else { \ + (head).dl_first = (head).dl_last = new; \ + (new)->listnam.dl_prev = \ + (new)->listnam.dl_next = new; \ + }} while (0) + +#define DLIST_INSERT_AFTER(node, new,
listnam) do { \ + (new)->listnam.dl_prev = (node); \ + (new)->listnam.dl_next = (node)->listnam.dl_next; \ + (node)->listnam.dl_next->listnam.dl_prev = (new); \ + (node)->listnam.dl_next = (new); \ + } while (0) + +#define DLIST_INSERT_BEFORE(node, new, listnam) do { \ + (new)->listnam.dl_next = (node); \ + (new)->listnam.dl_prev = (node)->listnam.dl_prev; \ + (node)->listnam.dl_prev->listnam.dl_next = (new); \ + (node)->listnam.dl_prev = (new); \ + } while (0) + +#define DLIST_DELETE(head, node, listnam) do { \ + node->listnam.dl_prev->listnam.dl_next = \ + node->listnam.dl_next; \ + node->listnam.dl_next->listnam.dl_prev = \ + node->listnam.dl_prev; \ + if ((head).dl_first == node) \ + (head).dl_first = node->listnam.dl_next; \ + if ((head).dl_first == node) \ + (head).dl_first = 0; \ + if ((head).dl_last == node) \ + (head).dl_last = node->listnam.dl_prev; \ + if ((head).dl_last == node) \ + (head).dl_last = 0; \ + } while (0) --- linux/ipc/shm.c.~2.2 1995/06/03 01:36:25 +++ linux/ipc/shm.c 1995/06/03 04:12:02 @@ -507,7 +507,7 @@ shmd->vm_end = addr + shp->shm_npages * PAGE_SIZE; shmd->vm_task = current; shmd->vm_page_prot = (shmflg & SHM_RDONLY) ? PAGE_READONLY : PAGE_SHARED; - shmd->vm_flags = VM_SHM | VM_MAYSHARE | VM_SHARED + shmd->vm_flags = VM_SHM | VM_MAYSHARE | VM_SHARED | VM_DONTSWAP | VM_MAYREAD | VM_MAYEXEC | VM_READ | VM_EXEC | ((shmflg & SHM_RDONLY) ? 
0 : VM_MAYWRITE | VM_WRITE); shmd->vm_next_share = shmd->vm_prev_share = NULL; @@ -674,7 +674,7 @@ unsigned long id, idx; int loop = 0, invalid = 0; int counter; - + counter = shm_rss >> prio; if (!counter || !(swap_nr = get_swap_page())) return 0; --- linux/arch/i386/kernel/entry.S.~2.2 1995/06/03 01:36:25 +++ linux/arch/i386/kernel/entry.S 1995/06/03 04:12:02 @@ -541,4 +541,5 @@ .long _sys_setfsuid .long _sys_setfsgid .long _sys_llseek /* 140 */ - .space (NR_syscalls-140)*4 + .long _sys_sysctl /* 141 */ + .space (NR_syscalls-141)*4 --- linux/arch/sparc/kernel/entry.S.~2.2 1995/06/03 01:36:25 +++ linux/arch/sparc/kernel/entry.S 1995/06/03 04:12:02 @@ -924,4 +924,5 @@ .long C_LABEL(sys_setfsuid) .long C_LABEL(sys_setfsgid) .long C_LABEL(sys_llseek) /* 140 */ + .long C_LABEL(sys_sysctl) /* 141 */ .align 4 --- linux/arch/mips/kernel/entry.S.~2.2 1995/06/03 01:36:25 +++ linux/arch/mips/kernel/entry.S 1995/06/03 04:12:02 @@ -965,7 +965,8 @@ .word _sys_setfsuid .word _sys_setfsgid .word _sys_llseek /* 140 */ - .space (NR_syscalls-140)*4 + .word _sys_sysctl /* 141 */ + .space (NR_syscalls-141)*4 .bss .globl _IRQ_vectors