patch-2.3.10 linux/mm/memory.c

diff -u --recursive --new-file v2.3.9/linux/mm/memory.c linux/mm/memory.c
@@ -36,7 +36,9 @@
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/swap.h>
+#include <linux/pagemap.h>
 #include <linux/smp_lock.h>
+#include <linux/swapctl.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -320,7 +322,7 @@
 	}
 }
 
-static inline int zap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
+static inline int zap_pte_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size)
 {
 	pte_t * pte;
 	int freed;
@@ -345,15 +347,15 @@
 		page = *pte;
 		pte++;
 		size--;
+		pte_clear(pte-1);
 		if (pte_none(page))
 			continue;
-		pte_clear(pte-1);
 		freed += free_pte(page);
 	}
 	return freed;
 }
 
-static inline int zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
+static inline int zap_pmd_range(struct mm_struct *mm, pgd_t * dir, unsigned long address, unsigned long size)
 {
 	pmd_t * pmd;
 	unsigned long end;
@@ -373,7 +375,7 @@
 		end = PGDIR_SIZE;
 	freed = 0;
 	do {
-		freed += zap_pte_range(pmd, address, end - address);
+		freed += zap_pte_range(mm, pmd, address, end - address);
 		address = (address + PMD_SIZE) & PMD_MASK; 
 		pmd++;
 	} while (address < end);
@@ -390,11 +392,21 @@
 	int freed = 0;
 
 	dir = pgd_offset(mm, address);
+
+	/*
+	 * This is a long-lived spinlock. That's fine.
+	 * There's no contention, because the page table
+	 * lock only protects against kswapd anyway, and
+	 * even if kswapd happened to be looking at this
+	 * process we _want_ it to get stuck.
+	 */
+	spin_lock(&mm->page_table_lock);
 	while (address < end) {
-		freed += zap_pmd_range(dir, address, end - address);
+		freed += zap_pmd_range(mm, dir, address, end - address);
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	}
+	spin_unlock(&mm->page_table_lock);
 	/*
 	 * Update rss for the mm_struct (not necessarily current->mm)
 	 */
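
As a rough illustration of what the new page_table_lock usage in zap_page_range() buys, here is a small standalone userspace sketch (plain C with pthreads; all names are invented and this is not kernel code): one lock is held across the whole teardown loop, so a concurrent scanner playing the role of kswapd can never observe a half-cleared range.

#include <pthread.h>
#include <stdio.h>

#define NPTES 16

/* Invented stand-in for an mm_struct with its page_table_lock. */
struct fake_mm {
	pthread_mutex_t page_table_lock;
	unsigned long pte[NPTES];		/* stands in for a page table */
};

static int zap_range(struct fake_mm *mm, int start, int end)
{
	int freed = 0;

	/* Held for the whole loop, like the "long-lived spinlock" above. */
	pthread_mutex_lock(&mm->page_table_lock);
	for (int i = start; i < end; i++) {
		if (mm->pte[i]) {
			mm->pte[i] = 0;		/* plays the role of pte_clear() */
			freed++;		/* free_pte() would happen here */
		}
	}
	pthread_mutex_unlock(&mm->page_table_lock);
	return freed;
}

int main(void)
{
	struct fake_mm mm = { .page_table_lock = PTHREAD_MUTEX_INITIALIZER };

	for (int i = 0; i < NPTES; i++)
		mm.pte[i] = i + 1;
	printf("freed %d entries\n", zap_range(&mm, 0, NPTES));
	return 0;
}

(Compile with -pthread; the kernel uses a spinlock, a mutex is used here only to keep the example portable.)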
@@ -599,17 +611,16 @@
  * We also mark the page dirty at this point even though the page will
  * change only once the write actually happens. This avoids a few races,
  * and potentially makes it more efficient.
+ *
+ * We enter with the page table read-lock held, and need to exit without
+ * it.
  */
 static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
 	unsigned long address, pte_t *page_table, pte_t pte)
 {
 	unsigned long old_page, new_page;
 	struct page * page;
-	
-	new_page = __get_free_page(GFP_USER);
-	/* Did swap_out() unmap the protected page while we slept? */
-	if (pte_val(*page_table) != pte_val(pte))
-		goto end_wp_page;
+
 	old_page = pte_page(pte);
 	if (MAP_NR(old_page) >= max_mapnr)
 		goto bad_wp_page;
@@ -634,44 +645,44 @@
 		/* FallThrough */
 	case 1:
 		flush_cache_page(vma, address);
-		set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
+		set_pte(page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte))));
 		flush_tlb_page(vma, address);
-end_wp_page:
-		/*
-		 * We can release the kernel lock now.. Now swap_out will see
-		 * a dirty page and so won't get confused and flush_tlb_page
-		 * won't SMP race. -Andrea
-		 */
-		unlock_kernel();
-
-		if (new_page)
-			free_page(new_page);
+		spin_unlock(&tsk->mm->page_table_lock);
 		return 1;
 	}
-		
+
+	/*
+	 * Ok, we need to copy. Oh, well..
+	 */
+	spin_unlock(&tsk->mm->page_table_lock);
+	new_page = __get_free_page(GFP_USER);
 	if (!new_page)
-		goto no_new_page;
+		return -1;
+	spin_lock(&tsk->mm->page_table_lock);
 
-	if (PageReserved(page))
-		++vma->vm_mm->rss;
-	copy_cow_page(old_page,new_page);
-	flush_page_to_ram(old_page);
-	flush_page_to_ram(new_page);
-	flush_cache_page(vma, address);
-	set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
-	flush_tlb_page(vma, address);
-	unlock_kernel();
-	__free_page(page);
+	/*
+	 * Re-check the pte - we dropped the lock
+	 */
+	if (pte_val(*page_table) == pte_val(pte)) {
+		if (PageReserved(page))
+			++vma->vm_mm->rss;
+		copy_cow_page(old_page,new_page);
+		flush_page_to_ram(old_page);
+		flush_page_to_ram(new_page);
+		flush_cache_page(vma, address);
+		set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
+		flush_tlb_page(vma, address);
+
+		/* Free the old page.. */
+		new_page = old_page;
+	}
+	spin_unlock(&tsk->mm->page_table_lock);
+	free_page(new_page);
 	return 1;
 
 bad_wp_page:
 	printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
-	send_sig(SIGKILL, tsk, 1);
-no_new_page:
-	unlock_kernel();
-	if (new_page)
-		free_page(new_page);
-	return 0;
+	return -1;
 }
 
 /*
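
The rewritten do_wp_page() above drops the page table lock around the page allocation (which may sleep) and re-checks the pte once the lock is retaken. A minimal userspace sketch of that unlock/allocate/relock/re-check dance, with invented names and malloc() standing in for __get_free_page(); it is not the kernel code itself:

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096

/* Invented stand-in: one "pte" plus the lock that guards it. */
struct fake_mm {
	pthread_mutex_t page_table_lock;
	void *pte;
};

/* Entered with page_table_lock held, returns with it released,
 * mirroring the convention the hunk above documents. */
static int cow_fault(struct fake_mm *mm, void *orig_pte)
{
	void *old_page = orig_pte, *new_page;

	/* The allocation can block, so drop the lock around it. */
	pthread_mutex_unlock(&mm->page_table_lock);
	new_page = malloc(PAGE_SIZE);
	if (!new_page)
		return -1;				/* out of memory */
	pthread_mutex_lock(&mm->page_table_lock);

	/* Re-check the pte - the lock was dropped, somebody else
	 * may have resolved the fault in the meantime. */
	if (mm->pte == orig_pte) {
		memcpy(new_page, old_page, PAGE_SIZE);	/* copy_cow_page() */
		mm->pte = new_page;			/* install the private copy */
		new_page = old_page;			/* ...and free the old page instead */
	}
	pthread_mutex_unlock(&mm->page_table_lock);
	free(new_page);
	return 1;
}

int main(void)
{
	struct fake_mm mm = { .page_table_lock = PTHREAD_MUTEX_INITIALIZER };
	void *orig = malloc(PAGE_SIZE);

	if (!orig)
		return 1;
	memset(orig, 0, PAGE_SIZE);
	mm.pte = orig;
	pthread_mutex_lock(&mm.page_table_lock);	/* caller enters locked */
	cow_fault(&mm, orig);
	free(mm.pte);					/* whatever page survived */
	return 0;
}

If the pte did change while the lock was dropped, the freshly allocated page is simply freed again, which mirrors the new_page = old_page trick in the hunk above.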
@@ -725,8 +736,9 @@
 	struct vm_area_struct * mpnt;
 
 	truncate_inode_pages(inode, offset);
+	spin_lock(&inode->i_shared_lock);
 	if (!inode->i_mmap)
-		return;
+		goto out_unlock;
 	mpnt = inode->i_mmap;
 	do {
 		struct mm_struct *mm = mpnt->vm_mm;
@@ -757,35 +769,81 @@
 		zap_page_range(mm, start, len);
 		flush_tlb_range(mm, start, end);
 	} while ((mpnt = mpnt->vm_next_share) != NULL);
+out_unlock:
+	spin_unlock(&inode->i_shared_lock);
 }
 
 
-/*
- * This is called with the kernel lock held, we need
- * to return without it.
+
+/* 
+ * Primitive swap readahead code. We simply read an aligned block of
+ * (1 << page_cluster) entries in the swap area. This method is chosen
+ * because it doesn't cost us any seek time.  We also make sure to queue
+ * the 'original' request together with the readahead ones...  
  */
-static int do_swap_page(struct task_struct * tsk, 
+static void swapin_readahead(unsigned long entry)
+{
+	int i;
+	struct page *new_page;
+	unsigned long offset = SWP_OFFSET(entry);
+	struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info;
+	
+	offset = (offset >> page_cluster) << page_cluster;
+
+	i = 1 << page_cluster;
+	do {
+		/* Don't read-ahead past the end of the swap area */
+		if (offset >= swapdev->max)
+			break;
+		/* Don't block on I/O for read-ahead */
+		if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster)
+			break;
+		/* Don't read in bad or busy pages */
+		if (!swapdev->swap_map[offset])
+			break;
+		if (swapdev->swap_map[offset] == SWAP_MAP_BAD)
+			break;
+
+		/* Ok, do the async read-ahead now */
+		new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), 0);
+		if (new_page != NULL)
+			__free_page(new_page);
+		offset++;
+	} while (--i);
+	return;
+}
+
+static int do_swap_page(struct task_struct * tsk,
 	struct vm_area_struct * vma, unsigned long address,
-	pte_t * page_table, pte_t entry, int write_access)
+	pte_t * page_table, unsigned long entry, int write_access)
 {
-	if (!vma->vm_ops || !vma->vm_ops->swapin) {
-		swap_in(tsk, vma, page_table, pte_val(entry), write_access);
-		flush_page_to_ram(pte_page(*page_table));
-	} else {
-		pte_t page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
-		if (pte_val(*page_table) != pte_val(entry)) {
-			free_page(pte_page(page));
-		} else {
-			if (page_count(mem_map + MAP_NR(pte_page(page))) > 1 &&
-			    !(vma->vm_flags & VM_SHARED))
-				page = pte_wrprotect(page);
-			++vma->vm_mm->rss;
-			++tsk->maj_flt;
-			flush_page_to_ram(pte_page(page));
-			set_pte(page_table, page);
-		}
+	struct page *page = lookup_swap_cache(entry);
+	pte_t pte;
+
+	if (!page) {
+		lock_kernel();
+		swapin_readahead(entry);
+		page = read_swap_cache(entry);
+		unlock_kernel();
+		if (!page)
+			return -1;
+
+		flush_page_to_ram(page_address(page));
+	}
+
+	vma->vm_mm->rss++;
+	tsk->min_flt++;
+	swap_free(entry);
+
+	pte = mk_pte(page_address(page), vma->vm_page_prot);
+
+	if (write_access && !is_page_shared(page)) {
+		delete_from_swap_cache(page);
+		pte = pte_mkwrite(pte_mkdirty(pte));
 	}
-	unlock_kernel();
+	set_pte(page_table, pte);
+	/* No need to invalidate - it was non-present before */
+	update_mmu_cache(vma, address, pte);
 	return 1;
 }
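
The read-ahead window in swapin_readahead() above is simply the faulting offset rounded down to a (1 << page_cluster) boundary, then walked forward for at most that many entries and stopped early at the end of the swap area. The following tiny standalone program (the values of page_cluster, the faulting offset and the swap-area size are made up) just prints the offsets such a window would cover:

#include <stdio.h>

int main(void)
{
	unsigned long page_cluster = 4;		/* assumed tunable: 16-entry clusters */
	unsigned long fault_offset = 1234;	/* example faulting swap offset */
	unsigned long swap_max = 2048;		/* pretend size of the swap area */

	/* Round down to the start of the cluster, as the patch does. */
	unsigned long offset = (fault_offset >> page_cluster) << page_cluster;
	unsigned long i = 1UL << page_cluster;

	do {
		if (offset >= swap_max)
			break;			/* never read past the swap area */
		printf("would queue read-ahead for swap offset %lu\n", offset);
		offset++;
	} while (--i);
	return 0;
}

The real function additionally skips free or SWAP_MAP_BAD entries and backs off once nr_async_pages reaches pager_daemon.swap_cluster.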
 
@@ -798,7 +856,7 @@
 	if (write_access) {
 		unsigned long page = __get_free_page(GFP_USER);
 		if (!page)
-			return 0;
+			return -1;
 		clear_page(page);
 		entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 		vma->vm_mm->rss++;
@@ -806,6 +864,8 @@
 		flush_page_to_ram(page);
 	}
 	set_pte(page_table, entry);
+	/* No need to invalidate - it was non-present before */
+	update_mmu_cache(vma, addr, entry);
 	return 1;
 }
 
@@ -827,23 +887,17 @@
 	unsigned long page;
 	pte_t entry;
 
-	if (!vma->vm_ops || !vma->vm_ops->nopage) {
-		unlock_kernel();
-		return do_anonymous_page(tsk, vma, page_table, write_access,
-		                         address);
-	}
+	if (!vma->vm_ops || !vma->vm_ops->nopage)
+		return do_anonymous_page(tsk, vma, page_table, write_access, address);
 
 	/*
 	 * The third argument is "no_share", which tells the low-level code
 	 * to copy, not share the page even if sharing is possible.  It's
 	 * essentially an early COW detection.
 	 */
-	page = vma->vm_ops->nopage(vma, address & PAGE_MASK,
-		(vma->vm_flags & VM_SHARED)?0:write_access);
-
-	unlock_kernel();
+	page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);
 	if (!page)
-		return 0;
+		return 0;	/* SIGBUS - but we _really_ should know whether it is OOM or SIGBUS */
 
 	++tsk->maj_flt;
 	++vma->vm_mm->rss;
@@ -866,6 +920,7 @@
 		entry = pte_wrprotect(entry);
 	set_pte(page_table, entry);
 	/* no need to invalidate: a not-present page shouldn't be cached */
+	update_mmu_cache(vma, address, entry);
 	return 1;
 }
 
@@ -877,6 +932,15 @@
  * There is also a hook called "update_mmu_cache()" that architectures
  * with external mmu caches can use to update those (ie the Sparc or
  * PowerPC hashed page tables that act as extended TLBs).
+ *
+ * Note the "page_table_lock". It is to protect against kswapd removing
+ * pages from under us. Note that kswapd only ever _removes_ pages, never
+ * adds them. As such, once we have noticed that the page is not present,
+ * we can drop the lock early.
+ *
+ * The adding of pages is protected by the MM semaphore (which we hold),
+ * so we don't need to worry about a page suddenly being added into
+ * our VM.
  */
 static inline int handle_pte_fault(struct task_struct *tsk,
 	struct vm_area_struct * vma, unsigned long address,
@@ -884,27 +948,32 @@
 {
 	pte_t entry;
 
-	lock_kernel();
 	entry = *pte;
-
 	if (!pte_present(entry)) {
 		if (pte_none(entry))
 			return do_no_page(tsk, vma, address, write_access, pte);
-		return do_swap_page(tsk, vma, address, pte, entry, write_access);
+		return do_swap_page(tsk, vma, address, pte, pte_val(entry), write_access);
 	}
 
-	entry = pte_mkyoung(entry);
-	set_pte(pte, entry);
-	flush_tlb_page(vma, address);
-	if (write_access) {
-		if (!pte_write(entry))
-			return do_wp_page(tsk, vma, address, pte, entry);
+	/*
+	 * Ok, the entry was present, we need to get the page table
+	 * lock to synchronize with kswapd, and verify that the entry
+	 * didn't change from under us..
+	 */
+	spin_lock(&tsk->mm->page_table_lock);
+	if (pte_val(entry) == pte_val(*pte)) {
+		if (write_access) {
+			if (!pte_write(entry))
+				return do_wp_page(tsk, vma, address, pte, entry);
 
-		entry = pte_mkdirty(entry);
+			entry = pte_mkdirty(entry);
+		}
+		entry = pte_mkyoung(entry);
 		set_pte(pte, entry);
 		flush_tlb_page(vma, address);
+		update_mmu_cache(vma, address, entry);
 	}
-	unlock_kernel();
+	spin_unlock(&tsk->mm->page_table_lock);
 	return 1;
 }
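
Taken together with the comment block above handle_pte_fault(), the new code follows an optimistic pattern: read the pte without the lock, handle the not-present case lock-free (kswapd only ever removes entries, and additions are serialized by the mm semaphore), and for a present entry take page_table_lock and re-validate before touching it. A compact userspace sketch of that pattern, with invented names and bit values, not the kernel code:

#include <pthread.h>

/* Invented stand-in for an mm_struct. */
struct fake_mm {
	pthread_mutex_t page_table_lock;
	unsigned long pte;			/* 0 means "not present" */
};

static int fault(struct fake_mm *mm, unsigned long *ptep, int write)
{
	unsigned long entry = *ptep;		/* peeked without the lock */

	if (!entry)				/* not present: only additions could race, */
		return 1;			/* and those are serialized elsewhere       */

	pthread_mutex_lock(&mm->page_table_lock);
	if (entry == *ptep) {			/* still the same? kswapd didn't zap it */
		if (write)
			*ptep |= 0x2;		/* plays the role of pte_mkdirty() */
		*ptep |= 0x4;			/* plays the role of pte_mkyoung() */
	}
	pthread_mutex_unlock(&mm->page_table_lock);
	return 1;
}

int main(void)
{
	struct fake_mm mm = { .page_table_lock = PTHREAD_MUTEX_INITIALIZER, .pte = 0x1 };

	return fault(&mm, &mm.pte, 1) == 1 ? 0 : 1;
}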
 
@@ -921,28 +990,27 @@
 	pmd = pmd_alloc(pgd, address);
 	if (pmd) {
 		pte_t * pte = pte_alloc(pmd, address);
-		if (pte) {
-			if (handle_pte_fault(tsk, vma, address, write_access, pte)) {
-				update_mmu_cache(vma, address, *pte);
-				return 1;
-			}
-		}
+		if (pte)
+			return handle_pte_fault(tsk, vma, address, write_access, pte);
 	}
-	return 0;
+	return -1;
 }
 
 /*
  * Simplistic page force-in..
  */
-void make_pages_present(unsigned long addr, unsigned long end)
+int make_pages_present(unsigned long addr, unsigned long end)
 {
 	int write;
+	struct task_struct *tsk = current;
 	struct vm_area_struct * vma;
 
-	vma = find_vma(current->mm, addr);
+	vma = find_vma(tsk->mm, addr);
 	write = (vma->vm_flags & VM_WRITE) != 0;
 	while (addr < end) {
-		handle_mm_fault(current, vma, addr, write);
+		if (handle_mm_fault(tsk, vma, addr, write) < 0)
+			return -1;
 		addr += PAGE_SIZE;
 	}
+	return 0;
 }
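
One behavioural change worth noting in the final hunks: handle_mm_fault() and its helpers now distinguish success (1), a SIGBUS-style failure (0) and out-of-memory (-1), and make_pages_present() propagates the failure instead of ignoring it. A hedged illustration of that convention, using made-up stand-in functions rather than the kernel ones:

#include <stdio.h>

/* Made-up helper using the same return convention as the patched
 * handle_mm_fault(): 1 = handled, 0 = SIGBUS-style error, -1 = OOM. */
static int fake_handle_fault(unsigned long addr)
{
	return addr ? 1 : -1;
}

static int fake_make_pages_present(unsigned long addr, unsigned long end)
{
	while (addr < end) {
		if (fake_handle_fault(addr) < 0)
			return -1;		/* give up early, as the patch now does */
		addr += 4096;
	}
	return 0;
}

int main(void)
{
	printf("result: %d\n", fake_make_pages_present(4096, 16384));
	return 0;
}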
