Home Home > GIT Browse > vanilla
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@penguin.transmeta.com>2002-02-27 21:18:04 -0800
committerLinus Torvalds <torvalds@penguin.transmeta.com>2002-02-27 21:18:04 -0800
commit141efd8bf4280a26a94fe6f5c81220bf11734651 (patch)
tree248cbd681295de109445c8016afbfe470c42a908
parent22e678470469b15ddf151694859673836b2abeae (diff)
parent9707269f502096831225b39a9c63431ea71555cd (diff)
Merge
-rw-r--r--Documentation/Changes19
-rw-r--r--Documentation/filesystems/00-INDEX2
-rw-r--r--Documentation/filesystems/changelog.jfs211
-rw-r--r--Documentation/filesystems/jfs.txt136
-rw-r--r--MAINTAINERS7
-rw-r--r--fs/Config.help19
-rw-r--r--fs/Config.in4
-rw-r--r--fs/Makefile1
-rw-r--r--fs/jfs/Makefile20
-rw-r--r--fs/jfs/endian24.h50
-rw-r--r--fs/jfs/file.c105
-rw-r--r--fs/jfs/inode.c314
-rw-r--r--fs/jfs/jfs_btree.h163
-rw-r--r--fs/jfs/jfs_debug.c145
-rw-r--r--fs/jfs/jfs_debug.h96
-rw-r--r--fs/jfs/jfs_defragfs.h55
-rw-r--r--fs/jfs/jfs_dinode.h157
-rw-r--r--fs/jfs/jfs_dmap.c4190
-rw-r--r--fs/jfs/jfs_dmap.h301
-rw-r--r--fs/jfs/jfs_dtree.c4539
-rw-r--r--fs/jfs/jfs_dtree.h284
-rw-r--r--fs/jfs/jfs_extendfs.h39
-rw-r--r--fs/jfs/jfs_extent.c637
-rw-r--r--fs/jfs/jfs_extent.h31
-rw-r--r--fs/jfs/jfs_filsys.h274
-rw-r--r--fs/jfs/jfs_imap.c3212
-rw-r--r--fs/jfs/jfs_imap.h161
-rw-r--r--fs/jfs/jfs_incore.h149
-rw-r--r--fs/jfs/jfs_inode.c132
-rw-r--r--fs/jfs/jfs_inode.h23
-rw-r--r--fs/jfs/jfs_lock.h106
-rw-r--r--fs/jfs/jfs_logmgr.c2490
-rw-r--r--fs/jfs/jfs_logmgr.h499
-rw-r--r--fs/jfs/jfs_metapage.c686
-rw-r--r--fs/jfs/jfs_metapage.h123
-rw-r--r--fs/jfs/jfs_mount.c541
-rw-r--r--fs/jfs/jfs_superblock.h143
-rw-r--r--fs/jfs/jfs_txnmgr.c3021
-rw-r--r--fs/jfs/jfs_txnmgr.h315
-rw-r--r--fs/jfs/jfs_types.h187
-rw-r--r--fs/jfs/jfs_umount.c158
-rw-r--r--fs/jfs/jfs_unicode.c110
-rw-r--r--fs/jfs/jfs_unicode.h143
-rw-r--r--fs/jfs/jfs_uniupr.c137
-rw-r--r--fs/jfs/jfs_xtree.c4444
-rw-r--r--fs/jfs/jfs_xtree.h143
-rw-r--r--fs/jfs/namei.c1499
-rw-r--r--fs/jfs/super.c501
-rw-r--r--fs/jfs/symlink.c47
-rw-r--r--fs/nls/Config.in2
50 files changed, 30768 insertions, 3 deletions
diff --git a/Documentation/Changes b/Documentation/Changes
index a4ac11940fb8..8d7aea65706e 100644
--- a/Documentation/Changes
+++ b/Documentation/Changes
@@ -54,6 +54,7 @@ o binutils 2.9.5.0.25 # ld -v
o util-linux 2.10o # fdformat --version
o modutils 2.4.2 # insmod -V
o e2fsprogs 1.25 # tune2fs
+o jfsutils 1.0.14 # fsck.jfs -V
o reiserfsprogs 3.x.0j # reiserfsck 2>&1|grep reiserfsprogs
o pcmcia-cs 3.1.21 # cardmgr -V
o PPP 2.4.0 # pppd --version
@@ -106,8 +107,8 @@ assembling the 16-bit boot code, removing the need for as86 to compile
your kernel. This change does, however, mean that you need a recent
release of binutils.
-System utilities
-================
+System utilities
+================
Architectural changes
---------------------
@@ -165,6 +166,16 @@ E2fsprogs
The latest version of e2fsprogs fixes several bugs in fsck and
debugfs. Obviously, it's a good idea to upgrade.
+JFSutils
+--------
+
+The jfsutils package contains the utilities for the file system.
+The following utilities are available:
+o fsck.jfs - initiate replay of the transaction log, and check
+ and repair a JFS formatted partition.
+o mkfs.jfs - create a JFS formatted partition.
+o other file system utilities are also available in this package.
+
Reiserfsprogs
-------------
@@ -303,6 +314,10 @@ E2fsprogs
---------
o <http://prdownloads.sourceforge.net/e2fsprogs/e2fsprogs-1.25.tar.gz>
+JFSutils
+--------
+o <http://oss.software.ibm.com/jfs>
+
Reiserfsprogs
-------------
o <ftp://ftp.namesys.com/pub/reiserfsprogs/reiserfsprogs-3.x.0j.tar.gz>
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index 18a3a4761a3c..f3c4cf5d2464 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -22,6 +22,8 @@ hpfs.txt
- info and mount options for the OS/2 HPFS.
isofs.txt
- info and mount options for the ISO 9660 (CDROM) filesystem.
+jfs.txt
+ - info and mount options for the JFS filesystem.
ncpfs.txt
- info on Novell Netware(tm) filesystem using NCP protocol.
ntfs.txt
diff --git a/Documentation/filesystems/changelog.jfs b/Documentation/filesystems/changelog.jfs
new file mode 100644
index 000000000000..9d2b03f1910c
--- /dev/null
+++ b/Documentation/filesystems/changelog.jfs
@@ -0,0 +1,211 @@
+IBM's Journaled File System (JFS) for Linux version 1.0.15
+Team members
+Steve Best sbest@us.ibm.com
+Dave Kleikamp shaggy@austin.ibm.com
+Barry Arndt barndt@us.ibm.com
+Christoph Hellwig hch@caldera.de
+
+
+Release February 15, 2002 (version 1.0.15)
+
+This is our fifty-third release of IBM's Enterprise JFS technology port to Linux.
+Beta 1 was release 0.1.0 on 12/8/2000, Beta 2 was release 0.2.0 on 3/7/2001,
+Beta 3 was release 0.3.0 on 4/30/2001, and release 1.0.0 on 6/28/2001.
+
+Function and Fixes in drop 53 (1.0.15)
+ - Fix trap when appending to very large file
+ - Moving jfs headers into fs/jfs at Linus' request
+ - Move up to linux-2.5.4
+ - Fix file size limit on 32-bit (Andi Kleen)
+ - make changelog more readable and include only 1.0.0 and above (Christoph Hellwig)
+ - Don't allocate metadata pages from high memory. JFS keeps them kmapped too long causing deadlock.
+ - Fix xtree corruption when creating file with >= 64 GB of physically contiguous dasd
+ - Replace semaphore with struct completion for thread startup/shutdown (Benedikt Spranger)
+ - cleanup Tx alloc/free (Christoph Hellwig)
+ - Move up to linux-2.5.3
+ - thread cleanups (Christoph Hellwig)
+ - First step toward making tblocks and tlocks dynamically allocated. Intro tid_t and lid_t to
+ insulate the majority of the code from future changes. Also hide TxBlock and TxLock arrays
+ by using macros to get from tids and lids to real structures.
+ - minor list-handling cleanup (Christoph Hellwig)
+ - Replace altnext and altprev with struct list_head
+ - Clean up the debugging code and add support for collecting statistics (Christoph Hellwig)
+
+
+Function and Fixes in drop 52 (1.0.14)
+ - Fix hang in invalidate_metapages when jfs.o is built as a module
+ - Fix anon_list removal logic in txLock
+
+Function and Fixes in drop 51 (1.0.13)
+ - chmod changes on newly created directories are lost after umount (bug 2535)
+ - Page locking race fixes
+ - Improve metapage locking
+ - Fix timing window. Lock page while metapage is active to avoid page going
+ away before the metadata is released. (Fixed crash during mount/umount testing)
+ - Make changes for 2.5.2 kernel
+ - Fix race condition truncating large files
+
+Function and Fixes in drop50 (1.0.12)
+ - Add O_DIRECT support
+ - Add support for 2.4.17 kernel
+ - Make sure COMMIT_STALE gets reset before the inode is unlocked. Fixing
+ this gets rid of XT_GETPAGE errors
+ - Remove invalid __exit keyword from metapage_exit and txExit.
+ - fix assert(log->cqueue.head == NULL) by waiting longer
+
+Function and Fixes in drop49 (1.0.11)
+ - Readdir was not handling multibyte codepages correctly.
+ - Make mount option parsing more robust.
+ - Add iocharset mount option.
+ - Journalling of symlinks incorrect, resulting in logredo failure of -265.
+ - Add jfsutils information to Changes file
+ - Improve recoverability of the file system when metadata corruption is detected.
+ - Fix kernel OOPS when root inode is corrupted
+
+Function and Fixes in drop48 (1.0.10)
+ - put inodes later on hash queues
+ - Fix boundary case in xtTruncate
+ - When invalidating metadata, try to flush the dirty buffers rather than sync them.
+ - Add another sanity check to avoid trapping when imap is corrupt
+ - Fix file truncate while removing large file (assert(cmp == 0))
+ - read_cache_page returns ERR_PTR, not NULL on error
+ - Add dtSearchNode and dtRelocate
+ - JFS needs to use generic_file_open & generic_file_llseek
+ - Remove lazyQwait, etc. It created an unnecessary bottleneck in TxBegin.
+
+Function and Fixes in drop47 (1.0.9)
+ - Fix data corruption problem when creating files while deleting others. (jitterbug 183)
+ - Make sure all metadata is written before finalizing the log
+ - Fix serialization problem in shutdown by setting i_size of directory sooner. (bugzilla #334)
+ - JFS should quit whining when special files are marked dirty during read-only mount.
+ - Must always check rc after DT_GETPAGE
+ - Add diExtendFS
+ - Removing defconfig from JFS source - not really needed
+
+Function and Fixes in drop46 (1.0.8)
+ - Synclist was being built backwards causing logredo to quit too early
+ - jfs_compat.h needs to include module.h
+ - uncomment EXPORTS_NO_SYMBOLS in super.c
+ - Minor code cleanup
+ - xtree of zero-truncated file not being logged
+ - Fix logging on file truncate
+ - remove unused metapage fields
+
+Function and Fixes in drop45 (1.0.7)
+ - cleanup remove IS_KIOBUFIO define.
+ - cleanup remove TRUNC_NO_TOSS define.
+ - have jFYI's use the name directly from dentry
+ - Remove nul _ALLOC and _FREE macros and also make spinlocks static.
+ - cleanup add externs where needed in the header files
+ - jfs_write_inode is a bad place to call iput. Also limit warnings.
+ - More truncate cleanup
+ - Truncate cleanup
+ - Add missing statics in jfs_metapage.c
+ - fsync fixes
+ - Clean up symlink code - use page_symlink_inode_operations
+ - unicode handling cleanup
+ - cleanup replace UniChar with wchar_t
+ - Get rid of CDLL_* macros - use list.h instead
+ - 2.4.11-prex mount problem Call new_inode instead of get_empty_inode
+ - use kernel min/max macros
+ - Add MODULE_LICENSE stub for older kernels
+ - IA64/gcc3 fixes
+ - Log Manager fixes, introduce __SLEEP_COND macro
+ - Mark superblock dirty when some errors detected (forcing fsck to be run).
+ - More robust remounting from r/o to r/w.
+ - Misc. cleanup add static where appropriate
+ - small cleanup in jfs_umount_rw
+ - add MODULE_ stuff
+ - Set *dropped_lock in alloc_metapage
+ - Get rid of unused log list
+ - cleanup jfs_imap.c to remove _OLD_STUFF and _NO_MORE_MOUNT_INODE defines
+ - Log manager cleanup
+ - Transaction manager cleanup
+ - correct memory allocations flags
+ - Better handling of iterative truncation
+ - Change continue to break, otherwise we don't re-acquire LAZY_LOCK
+
+Function and Fixes in drop44 (1.0.6)
+ - Create jfs_incore.h which merges linux/jfs_fs.h, linux/jfs_fs_i.h, and jfs_fs_sb.h
+ - Create a configuration option to handle JFS_DEBUG define
+ - Fixed a few cases where positive error codes were returned to the VFS.
+ - Replace jfs_dir_read by generic_read_dir.
+ - jfs_fsync_inode is only called by jfs_fsync_file, merge the two and rename to jfs_fsync.
+ - Add a bunch of missing externs.
+ - jfs_rwlock_lock is unused, nuke it.
+ - Always use atomic set/test_bit operations to protect jfs_ip->cflag
+ - Combine jfs_ip->flag with jfs_ip->cflag
+ - Fixed minor format errors reported by fsck
+ - cflags should be long so bitops always works correctly
+ - Use GFP_NOFS for runtime memory allocations
+ - Support VM changes in 2.4.10 of the kernel
+ - Remove ifdefs supporting older 2.4 kernels. JFS now requires at least 2.4.3 or 2.4.2-ac2
+ - Simplify and remove one use of IWRITE_TRYLOCK
+ - jfs_truncate was not passing tid to xtTruncate
+ - removed obsolete extent_page workaround
+ - correct recovery from failed diAlloc call (disk full)
+ - In write_metapage, don't call commit_write if prepare_write failed
+
+Function and Fixes in drop43 (1.0.5)
+ - Allow separate allocation of JFS-private superblock/inode data.
+ - Remove checks in namei.c that are already done by the VFS.
+ - Remove redundant mutex defines.
+ - Replace all occurrences of #include <linux/malloc.h> with #include <linux/slab.h>
+ - Work around race condition in remount -fixes OOPS during shutdown
+ - Truncate large files incrementally ( affects directories too)
+
+Function and Fixes in drop42 (1.0.4)
+ - Fixed compiler warnings in the FS when building on 64 bits systems
+ - Fixed deadlock where jfsCommit hung in hold_metapage
+ - Fixed problems with remount
+ - Reserve metapages for jfsCommit thread
+ - Get rid of buggy invalidate_metapage & use discard_metapage
+ - Don't hand metapages to jfsIOthread (too many context switches) (jitterbug 125, bugzilla 238)
+ - Fix error message in jfs_strtoUCS
+
+Function and Fixes in drop41 (1.0.3)
+ - Patch to move from previous release to latest release needs to update the version number in super.c
+ - Jitterbug problems (134,140,152) removing files have been fixed
+ - Set rc=ENOSPC if ialloc fails in jfs_create and jfs_mkdir
+ - Fixed jfs_txnmgr.c 775! assert
+ - Fixed jfs_txnmgr.c 884! assert(mp->nohomeok==0)
+ - Fix hang - prevent tblocks from being exhausted
+ - Fix oops trying to mount reiserfs
+ - Fail more gracefully in jfs_imap.c
+ - Print more information when char2uni fails
+ - Fix timing problem between Block map and metapage cache - jitterbug 139
+ - Code Cleanup (removed many ifdef's, obsolete code, ran code through indent) Mostly 2.4 tree
+ - Split source tree (Now have a separate source tree for 2.2, 2.4, and jfsutils)
+
+Function and Fixes in drop40 (1.0.2)
+ - Fixed multiple truncate hang
+ - Fixed hang on unlink a file and sync happening at the same time
+ - Improved handling of kmalloc error conditions
+ - Fixed hang in blk_get_queue and SMP deadlock: bh_end_io call generic_make_request
+ (jitterbug 145 and 146)
+ - stbl was not set correctly in dtDelete
+ - changed trap to printk in dbAllocAG to avoid system hang
+
+Function and Fixes in drop 39 (1.0.1)
+ - Fixed hang during copying files on 2.2.x series
+ - Fixed TxLock compile problem
+ - Fixed to correctly update the number of blocks for directories (this was causing the FS
+ to show fsck error after compiling mozilla).
+ - Fixed to prevent old data from being written to disk from the page cache.
+
+Function and Fixes in drop 38 (1.0.0)
+ - Fixed some general log problems
+
+Please send bugs, comments, cards and letters to linuxjfs@us.ibm.com.
+
+The JFS mailing list can be subscribed to by using the link labeled "Mail list Subscribe"
+at our web page http://oss.software.ibm.com/jfs/.
+
+
+
+
+
+
+
+
+
diff --git a/Documentation/filesystems/jfs.txt b/Documentation/filesystems/jfs.txt
new file mode 100644
index 000000000000..3f322b9db954
--- /dev/null
+++ b/Documentation/filesystems/jfs.txt
@@ -0,0 +1,136 @@
+IBM's Journaled File System (JFS) for Linux version 1.0.15
+Team members
+Steve Best sbest@us.ibm.com
+Dave Kleikamp shaggy@austin.ibm.com
+Barry Arndt barndt@us.ibm.com
+Christoph Hellwig hch@caldera.de
+
+
+Release February 15, 2002 (version 1.0.15)
+
+This is our fifty-third release of IBM's Enterprise JFS technology port to Linux.
+Beta 1 was release 0.1.0 on 12/8/2000, Beta 2 was release 0.2.0 on 3/7/2001,
+Beta 3 was release 0.3.0 on 4/30/2001, and release 1.0.0 on 6/28/2001.
+
+The changelog.jfs file contains detailed information of changes done in each source
+code drop.
+
+JFS has a source tree that can be built on 2.4.3 - 2.4.17 and 2.5.4 kernel.org
+source trees.
+
+Our current goal on the 2.5.x series of the kernel is to update to the latest
+2.5.x version and only support the latest version of this kernel.
+This will change when the distros start shipping the 2.5.x series of the kernel.
+
+Our current goal on the 2.4.x series of the kernel is to continue to support
+all of the kernels in this series as we do today.
+
+There is an anonymous cvs access available for the JFS tree. The steps below are
+what is needed to pull the JFS cvs tree from the oss.software.ibm.com server.
+
+id anoncvs
+password anoncvs
+
+To checkout 2.4.x series of the JFS files do the following:
+CVSROOT should be set to :pserver:anoncvs@oss.software.ibm.com:/usr/cvs/jfs
+cvs checkout linux24
+
+To checkout 2.5.x series of the JFS files do the following:
+CVSROOT should be set to :pserver:anoncvs@oss.software.ibm.com:/usr/cvs/jfs
+cvs checkout linux25
+
+To checkout the JFS utilities do the following:
+CVSROOT should be set to :pserver:anoncvs@oss.software.ibm.com:/usr/cvs/jfs
+cvs checkout jfsutils
+
+The cvs tree contains the latest changes being done to JFS. To receive notification
+of commits to the cvs tree, please send e-mail to linuxjfs@us.ibm.com stating that
+you would like notifications sent to you.
+
+The jfs-2.4-1.0.15-patch.tar.gz is the easiest way to get the latest file system
+source code on your system. There are also patch files that can move your jfs source
+code from one release to another. If you have release 1.0.14 and would like to move
+to release 1.0.15 the patch file named jfs-2.4-1_0_14-to-1_0_15-patch.gz will do that.
+
+The jfs-2.4-1.0.15-patch.tar.gz file contains a readme and patch files for different
+levels of the 2.4 kernel. Please see the README in the jfs-2.4-1.0.15-patch.tar.gz
+file for help on applying the two patch files.
+
+
+The following files in the kernel source tree have been changed so JFS can be built.
+The jfs-2.4-1.0.15.tar.gz source tar ball contains each of the files below with
+the extension of the kernel level it is associated with. As an example, there are now
+four Config.in files named Config.in-2.4.0, Config.in-2.4.5, Config.in-2.4.7 and
+Config.in-2.4.17.
+
+
+If you use the jfs-2.4-1.0.15.tar.gz to build JFS you must rename each of the
+kernel files to the file names listed below. The standard kernel from www.kernel.org
+is the source of the kernel files that are included in the jfs tar file.
+
+
+In sub dir fs Config.in, Makefile
+In sub dir fs/nls Config.in
+In sub dir Documentation Configure.help, Changes
+In sub dir Documentation/filesystems 00-INDEX
+In sub dir linux MAINTAINERS
+
+Please backup the above files before the JFS tar file is added to the kernel source
+tree. All JFS files are located in the include/linux/jfs or fs/jfs sub dirs.
+
+Our development team has used the Linux kernel levels 2.4.3 - 2.4.17 kernels
+with gcc version egcs-2.91.66 19990314/Linux (egcs-1.1.2 release)
+for our port so far. A goal of the JFS team is to have JFS run on all architectures
+that Linux supports; there is no architecture-specific code in JFS. JFS has been run
+on the following architectures (x86, PowerPC, Alpha, s/390, ARM) so far.
+
+To make JFS build, during the "make config" step of building the kernel answer y to
+the Prompt for development and/or incomplete code/drivers in the Code maturity level
+options section. In the Filesystems section use the m for the answer to
+JFS filesystem support (experimental) (CONFIG_JFS_FS) [Y/m/n?]
+
+
+Build in /usr/src/linux with the command:
+
+
+make modules
+make modules_install
+
+If you rebuild jfs.o after having mounted and unmounted a partition, "modprobe -r jfs"
+will unload the old module.
+
+File system debugging messages are written to /var/log/messages.
+
+Please see the readme in the utilities package for information about building
+the JFS utilities.
+
+JFS TODO list:
+
+Plans for our near term development items
+
+ - get defrag capabilities operational in the FS
+ - get extendfs capabilities operational in the FS
+ - test EXTENDFS utility, for growing JFS partitions
+ - test defrag utility, calls file system to defrag the file system.
+ - add support for block sizes (512,1024,2048)
+ - add support for logfile on dedicated partition
+
+
+Longer term work items
+
+ - get access control list functionality operational
+ - get extended attributes functionality operational
+ - add quota support
+
+Please send bugs, comments, cards and letters to linuxjfs@us.ibm.com.
+
+The JFS mailing list can be subscribed to by using the link labeled "Mail list Subscribe"
+at our web page http://oss.software.ibm.com/jfs/.
+
+
+
+
+
+
+
+
diff --git a/MAINTAINERS b/MAINTAINERS
index 72023516b6c9..9c92cf3da7d2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -856,6 +856,13 @@ L: jffs-dev@axis.com
W: http://sources.redhat.com/jffs2/
S: Maintained
+JFS FILESYSTEM
+P: Dave Kleikamp
+M: shaggy@austin.ibm.com
+L: jfs-discussion@oss.software.ibm.com
+W: http://oss.software.ibm.com/jfs/
+S: Supported
+
JOYSTICK DRIVER
P: Vojtech Pavlik
M: vojtech@suse.cz
diff --git a/fs/Config.help b/fs/Config.help
index 11a316b89638..3581b6fa3896 100644
--- a/fs/Config.help
+++ b/fs/Config.help
@@ -859,6 +859,25 @@ CONFIG_ADFS_FS_RW
hard drives and ADFS-formatted floppy disks. This is experimental
codes, so if you're unsure, say N.
+JFS filesystem support
+CONFIG_JFS_FS
+ This is a port of IBM's Journaled Filesystem. More information is
+ available in the file Documentation/filesystems/jfs.txt.
+
+ If you do not intend to use the JFS filesystem, say N.
+
+JFS Debugging
+CONFIG_JFS_DEBUG
+ If you are experiencing any problems with the JFS filesystem, say
+ Y here. This will result in additional debugging messages to be
+ written to the system log. Under normal circumstances, this
+ results in very little overhead.
+
+JFS Statistics
+CONFIG_JFS_STATISTICS
+ Enabling this option will cause statistics from the JFS file system
+ to be made available to the user in the /proc/fs/jfs/ directory.
+
CONFIG_DEVPTS_FS
You should say Y here if you said Y to "Unix98 PTY support" above.
You'll then get a virtual file system which can be mounted on
diff --git a/fs/Config.in b/fs/Config.in
index 8ff6fdf9fb05..4db680cd4812 100644
--- a/fs/Config.in
+++ b/fs/Config.in
@@ -54,6 +54,10 @@ tristate 'ISO 9660 CDROM file system support' CONFIG_ISO9660_FS
dep_mbool ' Microsoft Joliet CDROM extensions' CONFIG_JOLIET $CONFIG_ISO9660_FS
dep_mbool ' Transparent decompression extension' CONFIG_ZISOFS $CONFIG_ISO9660_FS
+tristate 'JFS filesystem support' CONFIG_JFS_FS
+dep_mbool ' JFS debugging' CONFIG_JFS_DEBUG $CONFIG_JFS_FS
+dep_mbool ' JFS statistics' CONFIG_JFS_STATISTICS $CONFIG_JFS_FS
+
tristate 'Minix fs support' CONFIG_MINIX_FS
tristate 'FreeVxFS file system support (VERITAS VxFS(TM) compatible)' CONFIG_VXFS_FS
diff --git a/fs/Makefile b/fs/Makefile
index 253c01ec4453..560fa8b317ba 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -67,6 +67,7 @@ subdir-$(CONFIG_ADFS_FS) += adfs
subdir-$(CONFIG_REISERFS_FS) += reiserfs
subdir-$(CONFIG_DEVPTS_FS) += devpts
subdir-$(CONFIG_SUN_OPENPROMFS) += openpromfs
+subdir-$(CONFIG_JFS_FS) += jfs
obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
diff --git a/fs/jfs/Makefile b/fs/jfs/Makefile
new file mode 100644
index 000000000000..5bcec98c97d3
--- /dev/null
+++ b/fs/jfs/Makefile
@@ -0,0 +1,20 @@
+#
+# Makefile for the Linux JFS filesystem routines.
+#
+# Note! Dependencies are done automagically by 'make dep', which also
+# removes any old dependencies. DON'T put your own dependencies here
+# unless it's something special (not a .c file).
+#
+# Note 2! The CFLAGS definitions are now in the main makefile.
+
+O_TARGET := jfs.o
+obj-y := super.o file.o inode.o namei.o jfs_mount.o jfs_umount.o \
+ jfs_xtree.o jfs_imap.o jfs_debug.o jfs_dmap.o \
+ jfs_unicode.o jfs_dtree.o jfs_inode.o \
+ jfs_extent.o symlink.o jfs_metapage.o \
+ jfs_logmgr.o jfs_txnmgr.o jfs_uniupr.o
+obj-m := $(O_TARGET)
+
+EXTRA_CFLAGS += -D_JFS_4K
+
+include $(TOPDIR)/Rules.make
diff --git a/fs/jfs/endian24.h b/fs/jfs/endian24.h
new file mode 100644
index 000000000000..fdebc8859c4e
--- /dev/null
+++ b/fs/jfs/endian24.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _H_ENDIAN24
+#define _H_ENDIAN24
+
+/*
+ * fs/jfs/endian24.h:
+ *
+ * Endian conversion for 24-bit data
+ *
+ */
+/*
+ * Reverse the byte order of the low 24 bits of x: bytes 0 and 2 trade
+ * places, byte 1 stays where it is, and bits 24-31 of the result are
+ * zero.  The argument is evaluated exactly once.
+ */
+#define __swab24(x) \
+({ \
+	__u32 __x = (x); \
+	__u32 __b0 = __x & (__u32)0x000000ffUL; \
+	__u32 __b1 = __x & (__u32)0x0000ff00UL; \
+	__u32 __b2 = (__x >> 16) & (__u32)0x000000ffUL; \
+	(__u32)((__b0 << 16) | __b1 | __b2); \
+})
+
+#if (defined(__KERNEL__) && defined(__LITTLE_ENDIAN)) || (defined(__BYTE_ORDER) && (__BYTE_ORDER == __LITTLE_ENDIAN))
+ #define __cpu_to_le24(x) ((__u32)(x)) /* host detected as little-endian: value is only cast */
+ #define __le24_to_cpu(x) ((__u32)(x))
+#else /* host not detected as little-endian: both directions go through __swab24 */
+ #define __cpu_to_le24(x) __swab24(x)
+ #define __le24_to_cpu(x) __swab24(x)
+#endif
+
+#ifdef __KERNEL__
+ #define cpu_to_le24 __cpu_to_le24 /* unprefixed aliases are defined for kernel builds only */
+ #define le24_to_cpu __le24_to_cpu
+#endif
+
+#endif /* !_H_ENDIAN24 */
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
new file mode 100644
index 000000000000..df0c299ee712
--- /dev/null
+++ b/fs/jfs/file.c
@@ -0,0 +1,105 @@
+/*
+ *
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include <linux/locks.h>
+#include "jfs_incore.h"
+#include "jfs_txnmgr.h"
+#include "jfs_debug.h"
+
+
+extern int generic_file_open(struct inode *, struct file *);
+extern loff_t generic_file_llseek(struct file *, loff_t, int origin);
+
+extern int jfs_commit_inode(struct inode *, int);
+
+/*
+ * jfs_fsync - write back a file's dirty data and, when needed, its inode
+ *
+ * file:	open file being synced (not referenced here)
+ * dentry:	dentry whose d_inode is synced
+ * datasync:	non-zero for an fdatasync-style request
+ *
+ * The data buffers are always written first.  The inode is then committed
+ * unless it is not dirty at all, or unless the caller asked for datasync
+ * and I_DIRTY_DATASYNC is not set.
+ *
+ * Returns the fsync_inode_data_buffers() result when the commit is skipped;
+ * otherwise 0 on success or -EIO if either step reported an error.
+ */
+int jfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+	struct inode *inode = dentry->d_inode;
+	int rc;
+
+	rc = fsync_inode_data_buffers(inode);
+
+	if (!(inode->i_state & I_DIRTY))
+		return rc;
+
+	/*
+	 * Only a datasync request may skip the commit, and only when
+	 * I_DIRTY_DATASYNC is clear.  A full fsync must always commit a
+	 * dirty inode, and a datasync request must still commit when
+	 * I_DIRTY_DATASYNC is set.
+	 */
+	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+		return rc;
+
+	IWRITE_LOCK(inode);
+	rc |= jfs_commit_inode(inode, 1);
+	IWRITE_UNLOCK(inode);
+
+	return rc ? -EIO : 0;
+}
+
+/*
+ * File operations installed on regular JFS files.  Every entry except
+ * fsync points at a generic_file_* helper.
+ */
+struct file_operations jfs_file_operations = {
+	read:	generic_file_read,
+	write:	generic_file_write,
+	mmap:	generic_file_mmap,
+	llseek:	generic_file_llseek,
+	open:	generic_file_open,
+	fsync:	jfs_fsync,
+};
+
+/*
+ * Guts of jfs_truncate. Called with locks already held. Can be called
+ * with directory for truncating directory index table.
+ *
+ * ip:     inode to truncate
+ * length: requested new size; asserted to be non-negative
+ *
+ * When the COMMIT_Nolink cflag is set on ip, a single xtTruncate() call
+ * with tid 0 and COMMIT_WMAP is made and no transaction is started.
+ * Otherwise every pass of the loop runs inside its own
+ * txBegin()/txCommit()/txEnd() transaction, and the loop repeats for as
+ * long as xtTruncate() reports a size still larger than length.  A
+ * negative xtTruncate() result ends the loop without a commit.  Nothing
+ * is returned to the caller in either case.
+ */
+void jfs_truncate_nolock(struct inode *ip, loff_t length)
+{
+ loff_t newsize;
+ tid_t tid;
+
+ ASSERT(length >= 0);
+
+ if (test_cflag(COMMIT_Nolink, ip)) {
+ xtTruncate(0, ip, length, COMMIT_WMAP); /* tid 0: no transaction; result is ignored */
+ return;
+ }
+
+ do {
+ tid = txBegin(ip->i_sb, 0);
+
+ newsize = xtTruncate(tid, ip, length,
+ COMMIT_TRUNCATE | COMMIT_PWMAP);
+ if (newsize < 0) { /* failure: close the transaction without committing */
+ txEnd(tid);
+ break;
+ }
+
+ ip->i_mtime = ip->i_ctime = CURRENT_TIME;
+ mark_inode_dirty(ip);
+
+ txCommit(tid, 1, &ip, 0); /* commits this one inode; return value is not checked */
+ txEnd(tid);
+ } while (newsize > length); /* Truncate isn't always atomic */
+}
+
+/*
+ * truncate inode operation: take the inode write lock and cut the file
+ * down to the size currently stored in ip->i_size.
+ */
+static void jfs_truncate(struct inode *ip)
+{
+	loff_t target;
+
+	jFYI(1, ("jfs_truncate: size = 0x%lx\n", (ulong) ip->i_size));
+
+	IWRITE_LOCK(ip);
+	/* i_size is sampled only once the lock is held */
+	target = ip->i_size;
+	jfs_truncate_nolock(ip, target);
+	IWRITE_UNLOCK(ip);
+}
+
+struct inode_operations jfs_file_inode_operations = {
+ truncate: jfs_truncate, /* only truncate is supplied; every other member is left unset */
+};
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
new file mode 100644
index 000000000000..ee3dfc090863
--- /dev/null
+++ b/fs/jfs/inode.c
@@ -0,0 +1,314 @@
+/*
+ *
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include <linux/locks.h>
+#include <linux/slab.h>
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_imap.h"
+#include "jfs_extent.h"
+#include "jfs_unicode.h"
+#include "jfs_debug.h"
+
+
+extern struct inode_operations jfs_dir_inode_operations;
+extern struct inode_operations jfs_file_inode_operations;
+extern struct inode_operations jfs_symlink_inode_operations;
+extern struct file_operations jfs_dir_operations;
+extern struct file_operations jfs_file_operations;
+struct address_space_operations jfs_aops;
+extern int freeZeroLink(struct inode *);
+
+void jfs_put_inode(struct inode *inode)
+{
+ jFYI(1, ("In jfs_put_inode, inode = 0x%p\n", inode)); /* trace message only; nothing else is done here */
+}
+
+void jfs_read_inode(struct inode *inode)
+{
+ jFYI(1, ("In jfs_read_inode, inode = 0x%p\n", inode));
+ /* A non-zero result from diRead() turns this into a bad inode. */
+ if (diRead(inode))
+ goto bad_inode;
+ /* Pick the operation tables according to the file type in i_mode. */
+ if (S_ISREG(inode->i_mode)) {
+ inode->i_op = &jfs_file_inode_operations;
+ inode->i_fop = &jfs_file_operations;
+ inode->i_mapping->a_ops = &jfs_aops;
+ } else if (S_ISDIR(inode->i_mode)) {
+ inode->i_op = &jfs_dir_inode_operations;
+ inode->i_fop = &jfs_dir_operations;
+ inode->i_mapping->a_ops = &jfs_aops;
+ inode->i_mapping->gfp_mask = GFP_NOFS; /* set for directories only */
+ } else if (S_ISLNK(inode->i_mode)) {
+ if (inode->i_size > IDATASIZE) { /* presumably too long to live inside the inode - TODO confirm */
+ inode->i_op = &page_symlink_inode_operations;
+ inode->i_mapping->a_ops = &jfs_aops;
+ } else
+ inode->i_op = &jfs_symlink_inode_operations;
+ } else { /* every other file type goes through init_special_inode() */
+ init_special_inode(inode, inode->i_mode,
+ kdev_t_to_nr(inode->i_rdev));
+ }
+
+ return;
+
+ bad_inode:
+ make_bad_inode(inode);
+}
+
+/* This define is from fs/open.c: true for char/block devices, FIFOs and sockets */
+#define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
+
+/*
+ * Workhorse of both fsync & write_inode
+ */
+int jfs_commit_inode(struct inode *inode, int wait)
+{
+ int rc = 0;
+ tid_t tid;
+ static int noisy = 5; /* number of read-only warnings still allowed */
+
+ jFYI(1, ("In jfs_commit_inode, inode = 0x%p\n", inode));
+
+ /*
+ * Don't commit if inode has been committed since last being
+ * marked dirty, or if it has been deleted.
+ */
+ if (test_cflag(COMMIT_Nolink, inode) ||
+ !test_cflag(COMMIT_Dirty, inode))
+ return 0;
+
+ if (isReadOnly(inode)) {
+ /* kernel allows writes to devices on read-only
+ * partitions and may think inode is dirty
+ */
+ if (!special_file(inode->i_mode) && noisy) {
+ jERROR(1, ("jfs_commit_inode(0x%p) called on "
+ "read-only volume\n", inode));
+ jERROR(1, ("Is remount racy?\n"));
+ noisy--;
+ }
+ return 0; /* nothing is committed on a read-only volume */
+ }
+ /* One transaction for this inode; wait selects COMMIT_SYNC. */
+ tid = txBegin(inode->i_sb, COMMIT_INODE);
+ rc = txCommit(tid, 1, &inode, wait ? COMMIT_SYNC : 0);
+ txEnd(tid);
+ return -rc; /* presumably txCommit() yields a positive error code - TODO confirm */
+}
+
+void jfs_write_inode(struct inode *inode, int wait)
+{
+ /*
+ * If COMMIT_Dirty is not set, the inode isn't really dirty.
+ * It has been committed since the last change, but was still
+ * on the dirty inode list.
+ */
+ if (test_cflag(COMMIT_Nolink, inode) ||
+ !test_cflag(COMMIT_Dirty, inode))
+ return;
+ /* Commit under the inode write lock; a failure is only logged. */
+ IWRITE_LOCK(inode);
+
+ if (jfs_commit_inode(inode, wait)) {
+ jERROR(1, ("jfs_write_inode: jfs_commit_inode failed!\n"));
+ }
+
+ IWRITE_UNLOCK(inode);
+}
+
+void jfs_delete_inode(struct inode *inode)
+{
+ jFYI(1, ("In jfs_delete_inode, inode = 0x%p\n", inode));
+ /* Under the inode write lock: freeZeroLink() if COMMIT_Freewmap, then diFree(). */
+ IWRITE_LOCK(inode);
+ if (test_cflag(COMMIT_Freewmap, inode))
+ freeZeroLink(inode);
+
+ diFree(inode);
+ IWRITE_UNLOCK(inode);
+ /* clear_inode() runs only after the lock has been released. */
+ clear_inode(inode);
+}
+
+void jfs_dirty_inode(struct inode *inode)
+{
+ static int warnings_left = 5; /* cap on read-only complaints */
+ if (!isReadOnly(inode)) {
+ set_cflag(COMMIT_Dirty, inode);
+ return;
+ }
+ /*
+ * The kernel allows writes to devices on read-only partitions and
+ * may try to mark such an inode dirty, so stay quiet for special
+ * files; other file types get a limited number of warnings.
+ */
+ if (special_file(inode->i_mode) || !warnings_left)
+ return;
+ jERROR(1, ("jfs_dirty_inode called on "
+ "read-only volume\n"));
+ jERROR(1, ("Is remount racy?\n"));
+ warnings_left--;
+}
+
+static int jfs_get_block(struct inode *ip, sector_t lblock,
+ struct buffer_head *bh_result, int create)
+{
+ s64 lblock64 = lblock;
+ int no_size_check = 0;
+ int rc = 0;
+ int take_locks;
+ xad_t xad;
+ s64 xaddr;
+ int xflag;
+ s32 xlen;
+ /* Block-mapping callback passed to the generic block_* helpers in this file. */
+ /*
+ * If this is a special inode (imap, dmap) or directory,
+ * the lock should already be taken
+ */
+ take_locks = ((JFS_IP(ip)->fileset != AGGREGATE_I) &&
+ !S_ISDIR(ip->i_mode));
+ /*
+ * Take appropriate lock on inode
+ */
+ if (take_locks) {
+ if (create) /* write lock when allocation is allowed, read lock otherwise */
+ IWRITE_LOCK(ip);
+ else
+ IREAD_LOCK(ip);
+ }
+
+ /*
+ * A directory's "data" is the inode index table, but i_size is the
+ * size of the d-tree, so don't check the offset against i_size
+ */
+ if (S_ISDIR(ip->i_mode))
+ no_size_check = 1;
+ /* First try to find an existing extent that covers lblock. */
+ if ((no_size_check ||
+ ((lblock64 << ip->i_sb->s_blocksize_bits) < ip->i_size)) &&
+ (xtLookup
+ (ip, lblock64, 1, &xflag, &xaddr, &xlen, no_size_check)
+ == 0) && xlen) { /* xtLookup() returned 0 with a non-zero length */
+ if (xflag & XAD_NOTRECORDED) {
+ if (!create)
+ /*
+ * Allocated but not recorded, read treats
+ * this as a hole
+ */
+ goto unlock;
+#ifdef _JFS_4K
+ XADoffset(&xad, lblock64);
+ XADlength(&xad, xlen);
+ XADaddress(&xad, xaddr);
+#else /* _JFS_4K */
+ /*
+ * As long as block size = 4K, this isn't a problem.
+ * We should mark the whole page not ABNR, but how
+ * will we know to mark the other blocks BH_New?
+ */
+ BUG();
+#endif /* _JFS_4K */
+ rc = extRecord(ip, &xad); /* xad is only filled in when _JFS_4K is defined */
+ if (rc)
+ goto unlock;
+ bh_result->b_state |= (1UL << BH_New);
+ }
+ /* Map the buffer to the address xtLookup() reported. */
+ map_bh(bh_result, ip->i_sb, xaddr);
+ goto unlock;
+ }
+ if (!create) /* nothing found and allocation not requested: leave bh_result unmapped */
+ goto unlock;
+
+ /*
+ * Allocate a new block
+ */
+#ifdef _JFS_4K
+ if ((rc =
+ extHint(ip, lblock64 << ip->i_sb->s_blocksize_bits, &xad)))
+ goto unlock;
+ rc = extAlloc(ip, 1, lblock64, &xad, FALSE);
+ if (rc)
+ goto unlock;
+ /* Flag the freshly allocated block as new and map it. */
+ bh_result->b_state |= (1UL << BH_New);
+ map_bh(bh_result, ip->i_sb, addressXAD(&xad));
+
+#else /* _JFS_4K */
+ /*
+ * We need to do whatever it takes to keep all but the last buffers
+ * in 4K pages - see jfs_write.c
+ */
+ BUG();
+#endif /* _JFS_4K */
+
+ unlock:
+ /*
+ * Release lock on inode
+ */
+ if (take_locks) {
+ if (create)
+ IWRITE_UNLOCK(ip);
+ else
+ IREAD_UNLOCK(ip);
+ }
+ return -rc; /* presumably rc is a positive JFS error code here - TODO confirm */
+}
+
+static int jfs_writepage(struct page *page)
+{
+ return block_write_full_page(page, jfs_get_block); /* generic helper, with jfs_get_block as the block mapper */
+}
+
+static int jfs_readpage(struct file *file, struct page *page)
+{
+ return block_read_full_page(page, jfs_get_block); /* file is unused; generic helper with jfs_get_block */
+}
+
+static int jfs_prepare_write(struct file *file,
+ struct page *page, unsigned from, unsigned to)
+{
+ return block_prepare_write(page, from, to, jfs_get_block); /* file is unused; generic helper with jfs_get_block */
+}
+
+static int jfs_bmap(struct address_space *mapping, long block)
+{
+ return generic_block_bmap(mapping, block, jfs_get_block); /* generic helper, with jfs_get_block as the block mapper */
+}
+
+static int jfs_direct_IO(int rw, struct inode *inode, struct kiobuf *iobuf,
+ unsigned long blocknr, int blocksize)
+{
+ return generic_direct_IO(rw, inode, iobuf, blocknr,
+ blocksize, jfs_get_block); /* all arguments forwarded; jfs_get_block maps the blocks */
+}
+
+struct address_space_operations jfs_aops = { /* installed as a_ops by jfs_read_inode() */
+ .readpage = jfs_readpage,
+ .writepage = jfs_writepage,
+ .sync_page = block_sync_page,
+ .prepare_write = jfs_prepare_write,
+ .commit_write = generic_commit_write,
+ .bmap = jfs_bmap,
+ .direct_IO = jfs_direct_IO,
+};
diff --git a/fs/jfs/jfs_btree.h b/fs/jfs/jfs_btree.h
new file mode 100644
index 000000000000..5b9ba459c774
--- /dev/null
+++ b/fs/jfs/jfs_btree.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#ifndef _H_JFS_BTREE
+#define _H_JFS_BTREE
+/*
+ * jfs_btree.h: B+-tree
+ *
+ * JFS B+-tree (dtree and xtree) common definitions
+ */
+
+/*
+ * basic btree page - btpage_t
+ */
+typedef struct {
+ s64 next; /* 8: right sibling bn */
+ s64 prev; /* 8: left sibling bn */
+
+ u8 flag; /* 1: BT_* page flag bits */
+ u8 rsrvd[7]; /* 7: type specific */
+ s64 self; /* 8: self address */
+
+ u8 entry[4064]; /* 4064: rest of the page after the 32-byte header */
+} btpage_t; /* (4096) */
+
+/* btpage_t flag */
+#define BT_TYPE 0x07 /* B+-tree index */
+#define BT_ROOT 0x01 /* root page */
+#define BT_LEAF 0x02 /* leaf page */
+#define BT_INTERNAL 0x04 /* internal page */
+#define BT_RIGHTMOST 0x10 /* rightmost page */
+#define BT_LEFTMOST 0x20 /* leftmost page */
+#define BT_SWAPPED 0x80 /* used by fsck for endian swapping */
+
+/* btorder (in inode) */
+#define BT_RANDOM 0x0000
+#define BT_SEQUENTIAL 0x0001
+#define BT_LOOKUP 0x0010
+#define BT_INSERT 0x0020
+#define BT_DELETE 0x0040
+
+/*
+ * btree page buffer cache access
+ */
+#define BT_IS_ROOT(MP) (((MP)->xflag & COMMIT_PAGE) == 0)
+
+/* get page from buffer page */
+#define BT_PAGE(IP, MP, TYPE, ROOT)\
+ (BT_IS_ROOT(MP) ? (TYPE *)&JFS_IP(IP)->ROOT : (TYPE *)(MP)->data)
+
+/* get the page buffer (MP) and page (P) for block address BN; RC is set to 0 or EIO */
+#define BT_GETPAGE(IP, BN, MP, TYPE, SIZE, P, RC, ROOT)\
+{\
+ if ((BN) == 0) /* block 0 selects the root kept in the in-memory inode */\
+ {\
+ MP = (metapage_t *)&JFS_IP(IP)->bxflag;\
+ P = (TYPE *)&JFS_IP(IP)->ROOT;\
+ RC = 0;\
+ jEVENT(0,("%d BT_GETPAGE returning root\n", __LINE__));\
+ }\
+ else\
+ {\
+ jEVENT(0,("%d BT_GETPAGE reading block %d\n", __LINE__,\
+ (int)BN));\
+ MP = read_metapage((IP), BN, SIZE, 1);\
+ if (MP) {\
+ RC = 0;\
+ P = (MP)->data;\
+ } else { /* read_metapage() returned NULL */\
+ P = NULL;\
+ jERROR(1,("bread failed!\n"));\
+ RC = EIO;\
+ }\
+ }\
+}
+
+#define BT_MARK_DIRTY(MP, IP)\
+{\
+ if (BT_IS_ROOT(MP))\
+ mark_inode_dirty(IP);\
+ else\
+ mark_metapage_dirty(MP);\
+}
+
+/* put the page buffer */
+#define BT_PUTPAGE(MP)\
+{\
+ if (! BT_IS_ROOT(MP)) \
+ release_metapage(MP); \
+}
+
+
+/*
+ * btree traversal stack
+ *
+ * records the path traversed during the search;
+ * the top frame records the leaf page/entry selected.
+ */
+#define MAXTREEHEIGHT 8
+typedef struct btframe { /* stack frame */
+ s64 bn; /* 8: */
+ s16 index; /* 2: */
+ s16 lastindex; /* 2: */
+ struct metapage *mp; /* 4: */
+} btframe_t; /* (16) */
+
+typedef struct btstack {
+ btframe_t *top; /* 4: */
+ int nsplit; /* 4: */
+ btframe_t stack[MAXTREEHEIGHT];
+} btstack_t;
+
+#define BT_CLR(btstack)\
+ (btstack)->top = (btstack)->stack
+
+#define BT_PUSH(BTSTACK, BN, INDEX) /* store BN/INDEX in the top frame, then advance top */\
+{\
+ (BTSTACK)->top->bn = BN;\
+ (BTSTACK)->top->index = INDEX;\
+ ++(BTSTACK)->top;\
+ assert((BTSTACK)->top != &((BTSTACK)->stack[MAXTREEHEIGHT])); /* trips once top is one past the last frame */\
+}
+
+#define BT_POP(btstack)\
+ ( (btstack)->top == (btstack)->stack ? NULL : --(btstack)->top )
+
+#define BT_STACK(btstack)\
+ ( (btstack)->top == (btstack)->stack ? NULL : (btstack)->top )
+
+/* retrieve search results */
+#define BT_GETSEARCH(IP, LEAF, BN, MP, TYPE, P, INDEX, ROOT)\
+{\
+ BN = (LEAF)->bn;\
+ MP = (LEAF)->mp;\
+ if (BN)\
+ P = (TYPE *)MP->data;\
+ else\
+ P = (TYPE *)&JFS_IP(IP)->ROOT;\
+ INDEX = (LEAF)->index;\
+}
+
+/* put the page buffer of search */
+#define BT_PUTSEARCH(BTSTACK)\
+{\
+ if (! BT_IS_ROOT((BTSTACK)->top->mp))\
+ release_metapage((BTSTACK)->top->mp);\
+}
+#endif /* _H_JFS_BTREE */
diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c
new file mode 100644
index 000000000000..4a5aaa771e70
--- /dev/null
+++ b/fs/jfs/jfs_debug.c
@@ -0,0 +1,145 @@
+/*
+ *
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/fs.h>
+#include <linux/ctype.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <asm/uaccess.h>
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_debug.h"
+
+#ifdef CONFIG_JFS_DEBUG
+void dump_mem(char *label, void *data, int length)
+{
+ int i, j;
+ int *intptr = data;
+ char *charptr = data;
+ char buf[10], line[80]; /* fullest row: 4 * 9 + 16 * 2 + NUL = 69 bytes */
+ /* Each row covers up to 16 bytes: up to four hex words, then one " c" pair per byte. */
+ printk("%s: dump of %d bytes of data at 0x%p\n\n", label, length,
+ data);
+ for (i = 0; i < length; i += 16) {
+ line[0] = 0;
+ for (j = 0; (j < 4) && (i + j * 4 < length); j++) { /* NOTE(review): reads whole ints, so up to 3 bytes past length when length % 4 != 0 */
+ sprintf(buf, " %08x", intptr[i / 4 + j]);
+ strcat(line, buf);
+ }
+ buf[0] = ' ';
+ buf[2] = 0;
+ for (j = 0; (j < 16) && (i + j < length); j++) {
+ buf[1] =
+ isprint(charptr[i + j]) ? charptr[i + j] : '.'; /* unprintable bytes are shown as '.' */
+ strcat(line, buf);
+ }
+ printk("%s\n", line);
+ }
+}
+
+#ifdef CONFIG_PROC_FS
+static int loglevel_read(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ /* bytes of the formatted level lying at or beyond offset 'off' */
+ int remaining = sprintf(page, "%d\n", jfsloglevel) - off;
+
+ *start = page + off;
+ if (remaining <= count) {
+ /* all that is left fits in this read: report end of file */
+ *eof = 1;
+ if (remaining < 0)
+ remaining = 0;
+ return remaining;
+ }
+
+ /* more is left than requested: return count, never below zero */
+ if (count < 0)
+ return 0;
+ return count;
+}
+
+static int loglevel_write(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{
+ char c;
+ /* Only the first byte of the user buffer is examined. */
+ if (get_user(c, buffer))
+ return -EFAULT;
+
+ /* yes, I know this is an ASCIIism. --hch */
+ if (c < '0' || c > '9')
+ return -EINVAL;
+ jfsloglevel = c - '0'; /* new level is the single digit 0..9 */
+ return count; /* the whole write is reported as consumed */
+}
+
+
+extern read_proc_t jfs_txanchor_read;
+#ifdef CONFIG_JFS_STATISTICS
+extern read_proc_t jfs_lmstats_read;
+extern read_proc_t jfs_xtstat_read;
+extern read_proc_t jfs_mpstat_read;
+#endif
+static struct proc_dir_entry *base;
+
+static struct {
+ const char *name;
+ read_proc_t *read_fn;
+ write_proc_t *write_fn;
+} Entries[] = { /* proc files that jfs_proc_init() creates under "jfs" */
+ { "TxAnchor", jfs_txanchor_read, }, /* write_fn omitted, so it is NULL */
+#ifdef CONFIG_JFS_STATISTICS
+ { "lmstats", jfs_lmstats_read, },
+ { "xtstat", jfs_xtstat_read, },
+ { "mpstat", jfs_mpstat_read, },
+#endif
+ { "loglevel", loglevel_read, loglevel_write } /* the only entry with a write handler */
+};
+#define NPROCENT (sizeof(Entries)/sizeof(Entries[0])) /* number of elements in Entries[] */
+
+void jfs_proc_init(void)
+{
+ int i;
+ /* Create the "jfs" directory under proc_root_fs; give up quietly on failure. */
+ if (!(base = proc_mkdir("jfs", proc_root_fs)))
+ return;
+ base->owner = THIS_MODULE;
+ /* One file per Entries[] element; an entry that fails to create is skipped. */
+ for (i = 0; i < NPROCENT; i++) {
+ struct proc_dir_entry *p;
+ if ((p = create_proc_entry(Entries[i].name, 0, base))) {
+ p->read_proc = Entries[i].read_fn;
+ p->write_proc = Entries[i].write_fn;
+ }
+ }
+}
+
+void jfs_proc_clean(void)
+{
+ int i;
+ /* Undo jfs_proc_init(): remove each file, then the "jfs" directory itself. */
+ if (base) {
+ for (i = 0; i < NPROCENT; i++)
+ remove_proc_entry(Entries[i].name, base);
+ remove_proc_entry("jfs", proc_root_fs); /* parent is where proc_mkdir() put it, not base */
+ }
+}
+
+#endif /* CONFIG_PROC_FS */
+#endif /* CONFIG_JFS_DEBUG */
diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h
new file mode 100644
index 000000000000..6974bc66448e
--- /dev/null
+++ b/fs/jfs/jfs_debug.h
@@ -0,0 +1,96 @@
+/*
+ *
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+*/
+#ifndef _H_JFS_DEBUG
+#define _H_JFS_DEBUG
+
+/*
+ * jfs_debug.h
+ *
+ * global debug message, data structure/macro definitions
+ * under control of CONFIG_JFS_DEBUG, CONFIG_JFS_STATISTICS;
+ */
+
+/*
+ * assert with traditional printf/panic
+ */
+#ifdef CONFIG_KERNEL_ASSERTS
+/* kgdb stuff */
+#define assert(p) KERNEL_ASSERT(#p, p)
+#else
+#define assert(p) {\
+if (!(p))\
+ {\
+ printk("assert(%s)\n",#p);\
+ BUG();\
+ }\
+}
+#endif
+
+/*
+ * debug ON
+ * --------
+ */
+#ifdef CONFIG_JFS_DEBUG
+#define ASSERT(p) assert(p)
+
+/* dump memory contents */
+extern void dump_mem(char *label, void *data, int length);
+extern int jfsloglevel;
+
+/* information message: e.g., configuration, major event; printed when button is set and jfsloglevel > 1 */
+#define jFYI(button, prspec) \
+ do { if (button && jfsloglevel > 1) printk prspec; } while (0)
+
+/* error event message: e.g., i/o error; printed when button is set and jfsloglevel > 0 */
+extern int jfsERROR;
+#define jERROR(button, prspec) \
+ do { if (button && jfsloglevel > 0) { printk prspec; } } while (0)
+
+/* debug event message: printed whenever button is set; prspec is a parenthesized printk argument list */
+#define jEVENT(button,prspec) \
+ do { if (button) printk prspec; } while (0)
+
+/*
+ * debug OFF
+ * ---------
+ */
+#else /* CONFIG_JFS_DEBUG */
+#define dump_mem(label,data,length)
+#define ASSERT(p)
+#define jEVENT(button,prspec)
+#define jERROR(button,prspec)
+#define jFYI(button,prspec)
+#endif /* CONFIG_JFS_DEBUG */
+
+/*
+ * statistics
+ * ----------
+ */
+#ifdef CONFIG_JFS_STATISTICS
+#define INCREMENT(x) ((x)++)
+#define DECREMENT(x) ((x)--)
+#define HIGHWATERMARK(x,y) ((x) = max((x), (y)))
+#else
+#define INCREMENT(x)
+#define DECREMENT(x)
+#define HIGHWATERMARK(x,y)
+#endif /* CONFIG_JFS_STATISTICS */
+
+#endif /* _H_JFS_DEBUG */
diff --git a/fs/jfs/jfs_defragfs.h b/fs/jfs/jfs_defragfs.h
new file mode 100644
index 000000000000..089e9dec7c2c
--- /dev/null
+++ b/fs/jfs/jfs_defragfs.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#ifndef _H_JFS_DEFRAGFS
+#define _H_JFS_DEFRAGFS
+
+/*
+ * jfs_defragfs.h
+ */
+/*
+ * defragfs parameter list
+ */
+typedef struct {
+ uint flag; /* 4: */
+ u8 dev; /* 1: */
+ u8 pad[3]; /* 3: */
+ s32 fileset; /* 4: */
+ u32 inostamp; /* 4: */
+ u32 ino; /* 4: */
+ u32 gen; /* 4: */
+ s64 xoff; /* 8: */
+ s64 old_xaddr; /* 8: */
+ s64 new_xaddr; /* 8: */
+ s32 xlen; /* 4: */
+} defragfs_t; /* (52) */
+
+/* plist flag */
+#define DEFRAGFS_SYNC 0x80000000
+#define DEFRAGFS_COMMIT 0x40000000
+#define DEFRAGFS_RELOCATE 0x10000000
+
+#define INODE_TYPE 0x0000F000 /* IFREG or IFDIR */
+
+#define EXTENT_TYPE 0x000000ff
+#define DTPAGE 0x00000001
+#define XTPAGE 0x00000002
+#define DATAEXT 0x00000004
+#define EAEXT 0x00000008
+
+#endif /* _H_JFS_DEFRAGFS */
diff --git a/fs/jfs/jfs_dinode.h b/fs/jfs/jfs_dinode.h
new file mode 100644
index 000000000000..465955f9e849
--- /dev/null
+++ b/fs/jfs/jfs_dinode.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#ifndef _H_JFS_DINODE
+#define _H_JFS_DINODE
+
+/*
+ * jfs_dinode.h: on-disk inode manager
+ *
+ */
+
+#define INODESLOTSIZE 128
+#define L2INODESLOTSIZE 7
+#define log2INODESIZE 9 /* log2(bytes per dinode) */
+
+
+/*
+ * on-disk inode (dinode_t): 512 bytes
+ *
+ * note: align 64-bit fields on 8-byte boundary.
+ */
+struct dinode {
+ /*
+ * I. base area (128 bytes)
+ * ------------------------
+ *
+ * define generic/POSIX attributes
+ */
+ u32 di_inostamp; /* 4: stamp to show inode belongs to fileset */
+ s32 di_fileset; /* 4: fileset number */
+ u32 di_number; /* 4: inode number, aka file serial number */
+ u32 di_gen; /* 4: inode generation number */
+
+ pxd_t di_ixpxd; /* 8: inode extent descriptor */
+
+ s64 di_size; /* 8: size */
+ s64 di_nblocks; /* 8: number of blocks allocated */
+
+ u32 di_nlink; /* 4: number of links to the object */
+
+ u32 di_uid; /* 4: user id of owner */
+ u32 di_gid; /* 4: group id of owner */
+
+ u32 di_mode; /* 4: attribute, format and permission */
+
+ struct timestruc_t di_atime; /* 8: time last data accessed */
+ struct timestruc_t di_ctime; /* 8: time last status changed */
+ struct timestruc_t di_mtime; /* 8: time last data modified */
+ struct timestruc_t di_otime; /* 8: time created */
+
+ dxd_t di_acl; /* 16: acl descriptor */
+
+ dxd_t di_ea; /* 16: ea descriptor */
+
+ u32 di_next_index; /* 4: Next available dir_table index */
+
+ s32 di_acltype; /* 4: Type of ACL */
+
+ /*
+ * Extension Areas.
+ *
+ * Historically, the inode was partitioned into 4 128-byte areas,
+ * the last 3 being defined as unions which could have multiple
+ * uses. The first 96 bytes had been completely unused until
+ * an index table was added to the directory. It is now more
+ * useful to describe the last 3/4 of the inode as a single
+ * union. We would probably be better off redesigning the
+ * entire structure from scratch, but we don't want to break
+ * commonality with OS/2's JFS at this time.
+ */
+ union {
+ struct {
+ /*
+ * This table contains the information needed to
+ * find a directory entry from a 32-bit index.
+ * If the index is small enough, the table is inline,
+ * otherwise, an x-tree root overlays this table
+ */
+ dir_table_slot_t _table[12]; /* 96: inline */
+
+ dtroot_t _dtroot; /* 288: dtree root */
+ } _dir; /* (384) */
+#define di_dirtable u._dir._table
+#define di_dtroot u._dir._dtroot
+#define di_parent di_dtroot.header.idotdot
+#define di_DASD di_dtroot.header.DASD
+
+ struct {
+ union {
+ u8 _data[96]; /* 96: unused */
+ struct {
+ void *_imap; /* 4: unused */
+ u32 _gengen; /* 4: generator */
+ } _imap;
+ } _u1; /* 96: */
+#define di_gengen u._file._u1._imap._gengen
+
+ union {
+ xtpage_t _xtroot;
+ struct {
+ u8 unused[16]; /* 16: */
+ dxd_t _dxd; /* 16: */
+ union {
+ u32 _rdev; /* 4: */
+ u8 _fastsymlink[128];
+ } _u;
+ u8 _inlineea[128];
+ } _special;
+ } _u2;
+ } _file;
+#define di_xtroot u._file._u2._xtroot
+#define di_dxd u._file._u2._special._dxd
+#define di_btroot di_xtroot
+#define di_inlinedata u._file._u2._special._u
+#define di_rdev u._file._u2._special._u._rdev
+#define di_fastsymlink u._file._u2._special._u._fastsymlink
+#define di_inlineea u._file._u2._special._inlineea
+ } u;
+};
+
+typedef struct dinode dinode_t;
+
+
+/* extended mode bits (on-disk inode di_mode) */
+#define IFJOURNAL 0x00010000 /* journalled file */
+#define ISPARSE 0x00020000 /* sparse file enabled */
+#define INLINEEA 0x00040000 /* inline EA area free */
+#define ISWAPFILE 0x00800000 /* file open for pager swap space */
+
+/* more extended mode bits: attributes for OS/2 */
+#define IREADONLY 0x02000000 /* no write access to file */
+#define IARCHIVE 0x40000000 /* file archive bit */
+#define ISYSTEM 0x08000000 /* system file */
+#define IHIDDEN 0x04000000 /* hidden file */
+#define IRASH 0x4E000000 /* mask for changeable attributes */
+#define INEWNAME 0x80000000 /* non-8.3 filename format */
+#define IDIRECTORY 0x20000000 /* directory (shadow of real bit) */
+#define ATTRSHIFT 25 /* bits to shift to move attribute
+ specification to mode position */
+
+#endif /*_H_JFS_DINODE */
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
new file mode 100644
index 000000000000..f8fd8de1feeb
--- /dev/null
+++ b/fs/jfs/jfs_dmap.c
@@ -0,0 +1,4190 @@
+/*
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ *
+ * MODULE_NAME: jfs_dmap.c
+ *
+ * COMPONENT_NAME: sysjfs
+ *
+ * FUNCTION: block allocation map manager
+ *
+*/
+
+/*
+ * Change History :
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "jfs_incore.h"
+#include "jfs_dmap.h"
+#include "jfs_imap.h"
+#include "jfs_lock.h"
+#include "jfs_metapage.h"
+#include "jfs_debug.h"
+
+/*
+ * Debug code for double-checking block map.
+ *
+ * The DB* macros below forward to the DB* checking routines only when
+ * _JFS_DEBUG_DMAP is defined (uncomment the define to enable them);
+ * otherwise they expand to nothing, so their call sites compile away.
+ */
+/* #define _JFS_DEBUG_DMAP 1 */
+
+#ifdef _JFS_DEBUG_DMAP
+#define DBINITMAP(size,ipbmap,results) \
+ DBinitmap(size,ipbmap,results)
+#define DBALLOC(dbmap,mapsize,blkno,nblocks) \
+ DBAlloc(dbmap,mapsize,blkno,nblocks)
+#define DBFREE(dbmap,mapsize,blkno,nblocks) \
+ DBFree(dbmap,mapsize,blkno,nblocks)
+#define DBALLOCCK(dbmap,mapsize,blkno,nblocks) \
+ DBAllocCK(dbmap,mapsize,blkno,nblocks)
+#define DBFREECK(dbmap,mapsize,blkno,nblocks) \
+ DBFreeCK(dbmap,mapsize,blkno,nblocks)
+
+static void DBinitmap(s64, struct inode *, u32 **);
+static void DBAlloc(uint *, s64, s64, s64);
+static void DBFree(uint *, s64, s64, s64);
+static void DBAllocCK(uint *, s64, s64, s64);
+static void DBFreeCK(uint *, s64, s64, s64);
+#else
+/* debugging disabled: every check is a no-op */
+#define DBINITMAP(size,ipbmap,results)
+#define DBALLOC(dbmap, mapsize, blkno, nblocks)
+#define DBFREE(dbmap, mapsize, blkno, nblocks)
+#define DBALLOCCK(dbmap, mapsize, blkno, nblocks)
+#define DBFREECK(dbmap, mapsize, blkno, nblocks)
+#endif /* _JFS_DEBUG_DMAP */
+
+/*
+ * SERIALIZATION of the Block Allocation Map.
+ *
+ * the working state of the block allocation map is accessed in
+ * two directions:
+ *
+ * 1) allocation and free requests that start at the dmap
+ * level and move up through the dmap control pages (i.e.
+ * the vast majority of requests).
+ *
+ * 2) allocation requests that start at dmap control page
+ * level and work down towards the dmaps.
+ *
+ * the serialization scheme used here is as follows.
+ *
+ * requests which start at the bottom are serialized against each
+ * other through buffers, and each request holds onto its buffers
+ * as it works its way up from a single dmap to the required level
+ * of dmap control page.
+ * requests that start at the top are serialized against each other
+ * and against requests that start from the bottom by the multiple
+ * read/single write inode lock of the bmap inode. requests starting
+ * at the top take this lock in write mode while requests starting at
+ * the bottom take the lock in read mode. a single top-down request
+ * may proceed exclusively while multiple bottom-up requests may
+ * proceed simultaneously (under the protection of busy buffers).
+ *
+ * in addition to information found in dmaps and dmap control pages,
+ * the working state of the block allocation map also includes read/
+ * write information maintained in the bmap descriptor (i.e. total
+ * free block count, allocation group level free block counts).
+ * a single exclusive lock (BMAP_LOCK) is used to guard this information
+ * in the face of multiple concurrent bottom-up requests.
+ * (lock ordering: IREAD_LOCK first, then BMAP_LOCK)
+ *
+ * accesses to the persistent state of the block allocation map (limited
+ * to the persistent bitmaps in dmaps) is guarded by (busy) buffers.
+ */
+
+/*
+ * BMAP_LOCK helpers: initialize, take and drop the bmap descriptor lock
+ * (db_bmaplock).  the macro argument is parenthesized so that any
+ * pointer-valued expression, not just a plain identifier, can be passed.
+ */
+#define BMAP_LOCK_INIT(bmp)	init_MUTEX(&(bmp)->db_bmaplock)
+#define BMAP_LOCK(bmp)		down(&(bmp)->db_bmaplock)
+#define BMAP_UNLOCK(bmp)	up(&(bmp)->db_bmaplock)
+
+/*
+ * forward references
+ */
+static void dbAllocBits(bmap_t * bmp, dmap_t * dp, s64 blkno, int nblocks);
+static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval);
+static void dbBackSplit(dmtree_t * tp, int leafno);
+static void dbJoin(dmtree_t * tp, int leafno, int newval);
+static void dbAdjTree(dmtree_t * tp, int leafno, int newval);
+static int dbAdjCtl(bmap_t * bmp, s64 blkno, int newval, int alloc,
+ int level);
+static int dbAllocAny(bmap_t * bmp, s64 nblocks, int l2nb, s64 * results);
+static int dbAllocNext(bmap_t * bmp, dmap_t * dp, s64 blkno, int nblocks);
+static int dbAllocNear(bmap_t * bmp, dmap_t * dp, s64 blkno, int nblocks,
+ int l2nb, s64 * results);
+static int dbAllocDmap(bmap_t * bmp, dmap_t * dp, s64 blkno, int nblocks);
+static int dbAllocDmapLev(bmap_t * bmp, dmap_t * dp, int nblocks, int l2nb,
+ s64 * results);
+static int dbAllocAG(bmap_t * bmp, int agno, s64 nblocks, int l2nb,
+ s64 * results);
+static int dbAllocCtl(bmap_t * bmp, s64 nblocks, int l2nb, s64 blkno,
+ s64 * results);
+int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks);
+static int dbFindBits(u32 word, int l2nb);
+static int dbFindCtl(bmap_t * bmp, int l2nb, int level, s64 * blkno);
+static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx);
+static void dbFreeBits(bmap_t * bmp, dmap_t * dp, s64 blkno, int nblocks);
+static int dbFreeDmap(bmap_t * bmp, dmap_t * dp, s64 blkno, int nblocks);
+static int dbMaxBud(u8 * cp);
+s64 dbMapFileSizeToMapSize(struct inode *ipbmap);
+int blkstol2(s64 nb);
+void fsDirty(void);
+
+int cntlz(u32 value);
+int cnttz(u32 word);
+
+static int dbAllocDmapBU(bmap_t * bmp, dmap_t * dp, s64 blkno,
+ int nblocks);
+static int dbInitDmap(dmap_t * dp, s64 blkno, int nblocks);
+static int dbInitDmapTree(dmap_t * dp);
+static int dbInitTree(dmaptree_t * dtp);
+static int dbInitDmapCtl(dmapctl_t * dcp, int level, int i);
+static int dbGetL2AGSize(s64 nblocks);
+
+/*
+ * buddy table
+ *
+ * table used for determining buddy sizes within characters of
+ * dmap bitmap words. the characters themselves serve as indexes
+ * into the table, with the table elements yielding the log2 size of
+ * the maximum binary buddy of free bits within the character
+ * (3 for 0x00, i.e. all eight bits free; -1 for 0xff, i.e. none free).
+ */
+signed char budtab[256] = {
+ 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+ 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, -1
+};
+
+
+/*
+ * NAME:	dbMount()
+ *
+ * FUNCTION:	initialize the block allocation map at mount time.
+ *
+ *		an in-core bmap descriptor is allocated, filled in from
+ *		the little-endian on-disk descriptor found in the bmap
+ *		control page, and attached to the superblock info.
+ *
+ * PARAMETERS:
+ *	ipbmap	- pointer to in-core inode for the block map.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	ENOMEM	- insufficient memory
+ *	EIO	- i/o error
+ */
+int dbMount(struct inode *ipbmap)
+{
+	bmap_t *desc;
+	dbmap_t *disk;
+	metapage_t *ctlpage;
+	int ag;
+
+	/* get memory for the in-core descriptor before touching the disk. */
+	desc = kmalloc(sizeof(bmap_t), GFP_KERNEL);
+	if (!desc)
+		return (ENOMEM);
+
+	/* bring in the page that holds the on-disk descriptor. */
+	ctlpage = read_metapage(ipbmap,
+				BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage,
+				PSIZE, 0);
+	if (!ctlpage) {
+		kfree(desc);
+		return (EIO);
+	}
+	disk = (dbmap_t *) ctlpage->data;
+
+	/* 64-bit fields of the descriptor */
+	desc->db_mapsize = le64_to_cpu(disk->dn_mapsize);
+	desc->db_nfree = le64_to_cpu(disk->dn_nfree);
+	desc->db_agsize = le64_to_cpu(disk->dn_agsize);
+	for (ag = 0; ag < MAXAG; ag++)
+		desc->db_agfree[ag] = le64_to_cpu(disk->dn_agfree[ag]);
+
+	/* 32-bit fields of the descriptor */
+	desc->db_l2nbperpage = le32_to_cpu(disk->dn_l2nbperpage);
+	desc->db_numag = le32_to_cpu(disk->dn_numag);
+	desc->db_maxlevel = le32_to_cpu(disk->dn_maxlevel);
+	desc->db_maxag = le32_to_cpu(disk->dn_maxag);
+	desc->db_agpref = le32_to_cpu(disk->dn_agpref);
+	desc->db_aglevel = le32_to_cpu(disk->dn_aglevel);
+	desc->db_agheigth = le32_to_cpu(disk->dn_agheigth);
+	desc->db_agwidth = le32_to_cpu(disk->dn_agwidth);
+	desc->db_agstart = le32_to_cpu(disk->dn_agstart);
+	desc->db_agl2size = le32_to_cpu(disk->dn_agl2size);
+
+	/* copied without any byte-order conversion */
+	desc->db_maxfreebud = disk->dn_maxfreebud;
+
+	/* the on-disk page is no longer needed. */
+	release_metapage(ctlpage);
+
+	/* tie the bmap inode and its descriptor together. */
+	desc->db_ipbmap = ipbmap;
+	JFS_SBI(ipbmap->i_sb)->bmap = desc;
+
+	DBINITMAP(desc->db_mapsize, ipbmap, &desc->db_DBmap);
+
+	/* finally, set up the lock guarding the descriptor. */
+	BMAP_LOCK_INIT(desc);
+
+	return (0);
+}
+
+
+/*
+ * NAME:	dbUnmount()
+ *
+ * FUNCTION:	tear down the block allocation map for file system unmount.
+ *
+ *		unless the mount failed or the map is read-only, the
+ *		in-core bmap descriptor is first written out via dbSync().
+ *		the cached pages of the map inode are then dropped and
+ *		the descriptor memory is freed.
+ *
+ * PARAMETERS:
+ *	ipbmap		- pointer to in-core inode for the block map.
+ *	mounterror	- non-zero if we are unwinding a failed mount.
+ *
+ * RETURN VALUES:
+ *	0	- always; the result of dbSync() is not propagated.
+ */
+int dbUnmount(struct inode *ipbmap, int mounterror)
+{
+	bmap_t *desc = JFS_SBI(ipbmap->i_sb)->bmap;
+
+	/* only a successfully mounted, writable map is synced. */
+	if (!mounterror && !isReadOnly(ipbmap))
+		dbSync(ipbmap);
+
+	/* throw away every page cached for the map inode. */
+	truncate_inode_pages(ipbmap->i_mapping, 0);
+
+	/* give back the in-core descriptor. */
+	kfree(desc);
+
+	return (0);
+}
+
+/*
+ * NAME:	dbSync()
+ *
+ * FUNCTION:	write the in-core bmap descriptor back to the bmap control
+ *		page (in little-endian form), flush the data buffers of
+ *		the map inode and write the map inode itself.
+ *
+ * PARAMETERS:
+ *	ipbmap	- pointer to in-core inode for the block map.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	EIO	- the control page could not be read
+ */
+int dbSync(struct inode *ipbmap)
+{
+	bmap_t *desc = JFS_SBI(ipbmap->i_sb)->bmap;
+	dbmap_t *disk;
+	metapage_t *ctlpage;
+	int ag;
+
+	/* fetch the page that carries the on-disk descriptor. */
+	ctlpage = read_metapage(ipbmap,
+				BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage,
+				PSIZE, 0);
+	if (!ctlpage) {
+		jERROR(1,("dbSync: read_metapage failed!\n"));
+		return (EIO);
+	}
+	disk = (dbmap_t *) ctlpage->data;
+
+	/* 64-bit fields of the descriptor */
+	disk->dn_mapsize = cpu_to_le64(desc->db_mapsize);
+	disk->dn_nfree = cpu_to_le64(desc->db_nfree);
+	disk->dn_agsize = cpu_to_le64(desc->db_agsize);
+	for (ag = 0; ag < MAXAG; ag++)
+		disk->dn_agfree[ag] = cpu_to_le64(desc->db_agfree[ag]);
+
+	/* 32-bit fields of the descriptor */
+	disk->dn_l2nbperpage = cpu_to_le32(desc->db_l2nbperpage);
+	disk->dn_numag = cpu_to_le32(desc->db_numag);
+	disk->dn_maxlevel = cpu_to_le32(desc->db_maxlevel);
+	disk->dn_maxag = cpu_to_le32(desc->db_maxag);
+	disk->dn_agpref = cpu_to_le32(desc->db_agpref);
+	disk->dn_aglevel = cpu_to_le32(desc->db_aglevel);
+	disk->dn_agheigth = cpu_to_le32(desc->db_agheigth);
+	disk->dn_agwidth = cpu_to_le32(desc->db_agwidth);
+	disk->dn_agstart = cpu_to_le32(desc->db_agstart);
+	disk->dn_agl2size = cpu_to_le32(desc->db_agl2size);
+
+	/* copied without any byte-order conversion */
+	disk->dn_maxfreebud = desc->db_maxfreebud;
+
+	/* hand the updated control page back for writing. */
+	write_metapage(ctlpage);
+
+	/* push out the remaining dirty pages of the map ... */
+	fsync_inode_data_buffers(ipbmap);
+
+	/* ... and then the map inode itself. */
+	ipbmap->i_state |= I_DIRTY;
+	diWriteSpecial(ipbmap);
+
+	return (0);
+}
+
+
+/*
+ * NAME:	dbFree()
+ *
+ * FUNCTION:	free the specified block range from the working block
+ *		allocation map.
+ *
+ *		the blocks will be freed from the working map one dmap
+ *		at a time.
+ *
+ * PARAMETERS:
+ *	ip	- pointer to in-core inode;
+ *	blkno	- starting block number to be freed.
+ *	nblocks	- number of blocks to be freed.
+ *
+ * RETURN VALUES:
+ *	0	- success (also when nblocks is not positive, in which
+ *		  case nothing is read or written)
+ *	EIO	- i/o error
+ *
+ * serialization: takes IREAD_LOCK(ipbmap) for the duration of the call.
+ */
+int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
+{
+	metapage_t *mp;
+	dmap_t *dp;
+	int nb, rc;
+	s64 lblkno, rem;
+	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
+	bmap_t *bmp = JFS_SBI(ip->i_sb)->bmap;
+
+	IREAD_LOCK(ipbmap);
+
+	/* block to be freed better be within the mapsize. */
+	assert(blkno + nblocks <= bmp->db_mapsize);
+
+	/*
+	 * free the blocks a dmap at a time.
+	 */
+	mp = NULL;
+	for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) {
+		/* write out the dmap updated by the previous pass, if any */
+		if (mp) {
+			write_metapage(mp);
+		}
+
+		/* get the buffer for the current dmap. */
+		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
+		mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
+		if (mp == NULL) {
+			IREAD_UNLOCK(ipbmap);
+			return (EIO);
+		}
+		dp = (dmap_t *) mp->data;
+
+		/* determine the number of blocks to be freed from
+		 * this dmap: the rest of the request, capped at the
+		 * end of the dmap that contains blkno.
+		 */
+		nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1)));
+
+		DBALLOCCK(bmp->db_DBmap, bmp->db_mapsize, blkno, nb);
+
+		/* free the blocks. */
+		if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) {
+			release_metapage(mp);
+			IREAD_UNLOCK(ipbmap);
+			return (rc);
+		}
+
+		DBFREE(bmp->db_DBmap, bmp->db_mapsize, blkno, nb);
+	}
+
+	/* write the last buffer.  no buffer was ever read when the loop
+	 * did not run (nblocks <= 0), so there is nothing to write then.
+	 */
+	if (mp) {
+		write_metapage(mp);
+	}
+
+	IREAD_UNLOCK(ipbmap);
+
+	return (0);
+}
+
+
+/*
+ * NAME:	dbUpdatePMap()
+ *
+ * FUNCTION:	update the allocation state (free or allocate) of the
+ *		specified block range in the persistent block allocation map.
+ *
+ *		the blocks will be updated in the persistent map one
+ *		dmap at a time.  each dmap page touched is also linked
+ *		into the log sync list behind the transaction block and
+ *		has its lsn/clsn merged with those of the transaction.
+ *
+ * PARAMETERS:
+ *	ipbmap	- pointer to in-core inode for the block map.
+ *	free	- TRUE if block range is to be freed from the persistent
+ *		  map; FALSE if it is to be allocated.
+ *	blkno	- starting block number of the range.
+ *	nblocks	- number of contiguous blocks in the range.
+ *	tblk	- transaction block;
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	EIO	- i/o error
+ */
+int
+dbUpdatePMap(struct inode *ipbmap,
+	     int free, s64 blkno, s64 nblocks, tblock_t * tblk)
+{
+	int nblks, dbitno, wbitno, rbits;
+	int word, nbits, nwords;
+	bmap_t *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
+	s64 lblkno, rem, lastlblkno;
+	u32 mask;
+	dmap_t *dp;
+	metapage_t *mp;
+	log_t *log;
+	int lsn, difft, diffp, diffc;
+
+	/* the blocks better be within the mapsize. */
+	assert(blkno + nblocks <= bmp->db_mapsize);
+
+	/* compute delta of transaction lsn from log syncpt.  difft is
+	 * computed once here and must stay valid for every dmap visited
+	 * by the loop below.
+	 */
+	lsn = tblk->lsn;
+	log = (log_t *) JFS_SBI(tblk->sb)->log;
+	logdiff(difft, lsn, log);
+
+	/*
+	 * update the block state a dmap at a time.
+	 */
+	mp = NULL;
+	lastlblkno = 0;
+	for (rem = nblocks; rem > 0; rem -= nblks, blkno += nblks) {
+		/* get the buffer for the current dmap, writing out the
+		 * previous one when we move on to a different dmap.
+		 */
+		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
+		if (lblkno != lastlblkno) {
+			if (mp) {
+				write_metapage(mp);
+			}
+
+			mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE,
+					   0);
+			if (mp == NULL)
+				return (EIO);
+		}
+		dp = (dmap_t *) mp->data;
+
+		/* determine the bit number and word within the dmap of
+		 * the starting block.  also determine how many blocks
+		 * are to be updated within this dmap.
+		 */
+		dbitno = blkno & (BPERDMAP - 1);
+		word = dbitno >> L2DBWORD;
+		nblks = min(rem, (s64)BPERDMAP - dbitno);
+
+		/* update the bits of the dmap words.  the first and last
+		 * words may only have a subset of their bits updated.  if
+		 * this is the case, we'll work against that word (i.e.
+		 * partial first and/or last) only in a single pass.  a
+		 * single pass will also be used to update all words that
+		 * are to have all their bits updated.
+		 */
+		for (rbits = nblks; rbits > 0;
+		     rbits -= nbits, dbitno += nbits) {
+			/* determine the bit number within the word and
+			 * the number of bits within the word.
+			 */
+			wbitno = dbitno & (DBWORD - 1);
+			nbits = min(rbits, DBWORD - wbitno);
+
+			/* check if only part of the word is to be updated. */
+			if (nbits < DBWORD) {
+				/* build a mask of nbits bits that starts
+				 * wbitno bits below the top of the word,
+				 * then clear (free) or set (allocate) them.
+				 */
+				mask =
+				    (ONES << (DBWORD - nbits) >> wbitno);
+				if (free)
+					dp->pmap[word] &=
+					    cpu_to_le32(~mask);
+				else
+					dp->pmap[word] |=
+					    cpu_to_le32(mask);
+
+				word += 1;
+			} else {
+				/* one or more words are to have all
+				 * their bits updated.  determine how
+				 * many words and how many bits.
+				 */
+				nwords = rbits >> L2DBWORD;
+				nbits = nwords << L2DBWORD;
+
+				/* update (free or allocate) the bits
+				 * in these words (4 bytes per word).
+				 */
+				if (free)
+					memset(&dp->pmap[word], 0,
+					       nwords * 4);
+				else
+					memset(&dp->pmap[word], (int) ONES,
+					       nwords * 4);
+
+				word += nwords;
+			}
+		}
+
+		/*
+		 * update dmap lsn; only needed once per dmap page.
+		 */
+		if (lblkno == lastlblkno)
+			continue;
+
+		lastlblkno = lblkno;
+
+		if (mp->lsn != 0) {
+			/* inherit older/smaller lsn */
+			logdiff(diffp, mp->lsn, log);
+			if (difft < diffp) {
+				mp->lsn = lsn;
+
+				/* move bp after tblock in logsync list */
+				LOGSYNC_LOCK(log);
+				list_del(&mp->synclist);
+				list_add(&mp->synclist, &tblk->synclist);
+				LOGSYNC_UNLOCK(log);
+			}
+
+			/* inherit younger/larger clsn.  the clsn delta of
+			 * the transaction goes into its own variable so
+			 * that difft (the lsn delta) is not clobbered for
+			 * the dmaps still to be processed.
+			 */
+			LOGSYNC_LOCK(log);
+			logdiff(diffc, tblk->clsn, log);
+			logdiff(diffp, mp->clsn, log);
+			if (diffc > diffp)
+				mp->clsn = tblk->clsn;
+			LOGSYNC_UNLOCK(log);
+		} else {
+			/* page not yet on the logsync list */
+			mp->log = log;
+			mp->lsn = lsn;
+
+			/* insert bp after tblock in logsync list */
+			LOGSYNC_LOCK(log);
+
+			log->count++;
+			list_add(&mp->synclist, &tblk->synclist);
+
+			mp->clsn = tblk->clsn;
+			LOGSYNC_UNLOCK(log);
+		}
+	}
+
+	/* write the last buffer. */
+	if (mp) {
+		write_metapage(mp);
+	}
+
+	return (0);
+}
+
+
+/*
+ * NAME:	dbNextAG()
+ *
+ * FUNCTION:	find the preferred allocation group for new allocations.
+ *
+ *		we try to keep the trailing (rightmost) allocation groups
+ *		free for large allocations.  we try to do this by targeting
+ *		new inode allocations towards the leftmost or 'active'
+ *		allocation groups while keeping the rightmost or 'inactive'
+ *		allocation groups free.  once the active allocation groups
+ *		have dropped to a certain percentage of free space, we add
+ *		the leftmost inactive allocation group to the active set.
+ *
+ *		within the active allocation groups, we maintain a preferred
+ *		allocation group which consists of a group with at least
+ *		average free space over the active set.  it is the preferred
+ *		group that we target new inode allocation towards.  the
+ *		tie-in between inode allocation and block allocation occurs
+ *		as we allocate the first (data) block of an inode and specify
+ *		the inode (block) as the allocation hint for this block.
+ *
+ * PARAMETERS:
+ *	ipbmap	- pointer to in-core inode for the block map.
+ *
+ * RETURN VALUES:
+ *	the preferred allocation group number.
+ *
+ * serialization: BMAP_LOCK is taken and released internally.
+ *
+ * note: only called by dbAlloc();
+ */
+int dbNextAG(struct inode *ipbmap)
+{
+	s64 avgfree, inactfree, actfree, rem;
+	int actags, inactags, l2agsize, agpref;
+	bmap_t *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
+
+	BMAP_LOCK(bmp);
+
+	/* determine the number of active allocation groups (i.e.
+	 * the number of allocation groups up to and including
+	 * the rightmost allocation group with blocks allocated
+	 * in it).
+	 */
+	actags = bmp->db_maxag + 1;
+	assert(actags <= bmp->db_numag);
+
+	/* get the number of inactive allocation groups (i.e. the
+	 * number of allocation groups following the rightmost group
+	 * with allocation in it).
+	 */
+	inactags = bmp->db_numag - actags;
+
+	/* determine how many blocks are in the inactive allocation
+	 * groups.  in doing this, we must account for the fact that
+	 * the rightmost group might be a partial group (i.e. file
+	 * system size is not a multiple of the group size).
+	 *
+	 * the group counts are widened to s64 before shifting so the
+	 * shift cannot overflow an int for large allocation groups.
+	 */
+	l2agsize = bmp->db_agl2size;
+	rem = bmp->db_mapsize & (bmp->db_agsize - 1);
+	if (inactags && rem)
+		inactfree = ((s64) (inactags - 1) << l2agsize) + rem;
+	else
+		inactfree = (s64) inactags << l2agsize;
+
+	/* now determine how many free blocks are in the active
+	 * allocation groups plus the average number of free blocks
+	 * within the active ags.
+	 *
+	 * NOTE(review): the division is done on values truncated to
+	 * 32 bits, presumably to avoid a 64-bit divide; the average is
+	 * wrong if actfree does not fit in a u32 - confirm.
+	 */
+	actfree = bmp->db_nfree - inactfree;
+	avgfree = (u32) actfree / (u32) actags;
+
+	/* check if not all of the allocation groups are active.
+	 */
+	if (actags < bmp->db_numag) {
+		/* not all of the allocation groups are active.  determine
+		 * if we should extend the active set by 1 (i.e. add the
+		 * group following the current active set).  we do so if
+		 * the number of free blocks within the active set is less
+		 * than the allocation group size and average free within
+		 * the active set is less than 60%.  we activate a new group
+		 * by setting the allocation group preference to the new
+		 * group.
+		 */
+		if (actfree < bmp->db_agsize &&
+		    ((avgfree * 100) >> l2agsize) < 60)
+			bmp->db_agpref = actags;
+	} else {
+		/* all allocation groups are in the active set.  check if
+		 * the preferred allocation group has average free space.
+		 * if not, re-establish the preferred group as the leftmost
+		 * group with at least average free space.
+		 */
+		if (bmp->db_agfree[bmp->db_agpref] < avgfree) {
+			for (bmp->db_agpref = 0; bmp->db_agpref < actags;
+			     bmp->db_agpref++) {
+				if (bmp->db_agfree[bmp->db_agpref] >=
+				    avgfree)
+					break;
+			}
+			assert(bmp->db_agpref < bmp->db_numag);
+		}
+	}
+
+	/* pick up the result while the descriptor is still locked. */
+	agpref = bmp->db_agpref;
+
+	BMAP_UNLOCK(bmp);
+
+	/* return the preferred group.
+	 */
+	return (agpref);
+}
+
+
+/*
+ * NAME: dbAlloc()
+ *
+ * FUNCTION: attempt to allocate a specified number of contiguous free
+ * blocks from the working allocation block map.
+ *
+ * the block allocation policy uses hints and a multi-step
+ * approach.
+ *
+ * for allocation requests smaller than the number of blocks
+ * per dmap, we first try to allocate the new blocks
+ * immediately following the hint. if these blocks are not
+ * available, we try to allocate blocks near the hint. if
+ * no blocks near the hint are available, we next try to
+ * allocate within the same dmap as contains the hint.
+ *
+ * if no blocks are available in the dmap or the allocation
+ * request is larger than the dmap size, we try to allocate
+ * within the same allocation group as contains the hint. if
+ * this does not succeed, we finally try to allocate anywhere
+ * within the aggregate.
+ *
+ * we also try to allocate anywhere within the aggregate
+ * for allocation requests larger than the allocation group
+ * size or requests that specify no hint value.
+ *
+ * PARAMETERS:
+ * ip - pointer to in-core inode;
+ * hint - allocation hint (0 means no hint).
+ * nblocks - number of contiguous blocks in the range.
+ * results - on successful return, set to the starting block number
+ * of the newly allocated contiguous range.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * ENOSPC - insufficient disk resources
+ * EIO - i/o error
+ *
+ * serialization: the single-dmap attempts run under IREAD_LOCK(ipbmap);
+ * the allocation group / aggregate wide attempts run under
+ * IWRITE_LOCK(ipbmap).
+ */
+int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
+{
+ int rc, agno;
+ struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
+ bmap_t *bmp;
+ metapage_t *mp;
+ s64 lblkno, blkno;
+ dmap_t *dp;
+ int l2nb;
+ s64 mapSize;
+
+ /* assert that nblocks is valid */
+ assert(nblocks > 0);
+
+#ifdef _STILL_TO_PORT
+ /* DASD limit check F226941 */
+ if (OVER_LIMIT(ip, nblocks))
+ return ENOSPC;
+#endif /* _STILL_TO_PORT */
+
+ /* get the log2 number of blocks to be allocated.
+ * if the number of blocks is not a log2 multiple,
+ * it will be rounded up to the next log2 multiple.
+ */
+ l2nb = BLKSTOL2(nblocks);
+
+ bmp = JFS_SBI(ip->i_sb)->bmap;
+
+//retry: /* serialize w.r.t.extendfs() */
+ mapSize = bmp->db_mapsize;
+
+ /* the hint should be within the map */
+ assert(hint < mapSize);
+
+ /* if no hint was specified or the number of blocks to be
+ * allocated is greater than the allocation group size, try
+ * to allocate anywhere.
+ */
+ if (hint == 0 || l2nb > bmp->db_agl2size) {
+ IWRITE_LOCK(ipbmap);
+
+ rc = dbAllocAny(bmp, nblocks, l2nb, results);
+ if (rc == 0) {
+ DBALLOC(bmp->db_DBmap, bmp->db_mapsize, *results,
+ nblocks);
+ }
+
+ IWRITE_UNLOCK(ipbmap);
+ return (rc);
+ }
+
+ /* we would like to allocate close to the hint. adjust the
+ * hint to the block following the hint since the allocators
+ * will start looking for free space starting at this point.
+ * if the hint was the last block of the file system, try to
+ * allocate in the same allocation group as the hint.
+ */
+ blkno = hint + 1;
+ if (blkno >= bmp->db_mapsize) {
+ blkno--;
+ goto tryag;
+ }
+
+ /* check if blkno crosses over into a new allocation group.
+ * if so, check if we should allow allocations within this
+ * allocation group. we try to keep the trailing (rightmost)
+ * allocation groups of the file system free for large
+ * allocations and may want to prevent this allocation from
+ * spilling over into this space.
+ */
+ if ((blkno & (bmp->db_agsize - 1)) == 0) {
+ /* check if the AG is beyond the rightmost AG with
+ * allocations in it. if so, call dbNextAG() to
+ * determine if the allocation should be allowed
+ * to proceed within this AG or should be targeted
+ * to another AG.
+ */
+ agno = blkno >> bmp->db_agl2size;
+ if (agno > bmp->db_maxag) {
+ agno = dbNextAG(ipbmap);
+ blkno = (s64) agno << bmp->db_agl2size;
+ goto tryag;
+ }
+ }
+
+ /* check if the allocation request size can be satisfied from a
+ * single dmap. if so, try to allocate from the dmap containing
+ * the hint using a tiered strategy.
+ */
+ if (nblocks <= BPERDMAP) {
+ IREAD_LOCK(ipbmap);
+
+ /* get the buffer for the dmap containing the hint.
+ */
+ lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
+ mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
+ if (mp == NULL) {
+ IREAD_UNLOCK(ipbmap);
+ return (EIO);
+ }
+ dp = (dmap_t *) mp->data;
+
+ /* first, try to satisfy the allocation request with the
+ * blocks beginning at the hint. any result other than
+ * ENOSPC (success or i/o error) ends the search here.
+ */
+ if ((rc =
+ dbAllocNext(bmp, dp, blkno,
+ (int) nblocks)) != ENOSPC) {
+ if (rc == 0) {
+ *results = blkno;
+ DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
+ *results, nblocks);
+ write_metapage(mp);
+ } else {
+ assert(rc == EIO);
+ release_metapage(mp);
+ }
+
+ IREAD_UNLOCK(ipbmap);
+ return (rc);
+ }
+
+ /* next, try to satisfy the allocation request with blocks
+ * near the hint.
+ */
+ if ((rc =
+ dbAllocNear(bmp, dp, blkno, (int) nblocks, l2nb,
+ results))
+ != ENOSPC) {
+ if (rc == 0) {
+ DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
+ *results, nblocks);
+ mark_metapage_dirty(mp);
+ }
+ release_metapage(mp);
+
+ IREAD_UNLOCK(ipbmap);
+ return (rc);
+ }
+
+ /* try to satisfy the allocation request with blocks within
+ * the same dmap as the hint.
+ */
+ if ((rc =
+ dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results))
+ != ENOSPC) {
+ if (rc == 0) {
+ DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
+ *results, nblocks);
+ mark_metapage_dirty(mp);
+ }
+ release_metapage(mp);
+
+ IREAD_UNLOCK(ipbmap);
+ return (rc);
+ }
+
+ /* nothing in this dmap: drop the buffer and the read lock
+ * before retrying below under the write lock.
+ */
+ release_metapage(mp);
+ IREAD_UNLOCK(ipbmap);
+ }
+
+ tryag:
+ IWRITE_LOCK(ipbmap);
+
+ /* determine the allocation group number of the hint and try to
+ * allocate within this allocation group. if that fails, try to
+ * allocate anywhere in the map.
+ */
+ agno = blkno >> bmp->db_agl2size;
+ if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) == ENOSPC)
+ rc = dbAllocAny(bmp, nblocks, l2nb, results);
+ if (rc == 0) {
+ DBALLOC(bmp->db_DBmap, bmp->db_mapsize, *results, nblocks);
+ }
+
+ IWRITE_UNLOCK(ipbmap);
+
+ return (rc);
+}
+
+
+/*
+ * NAME:	dbAllocExact()
+ *
+ * FUNCTION:	try to allocate exactly the requested extent;
+ *
+ * PARAMETERS:
+ *	ip	- pointer to in-core inode;
+ *	blkno	- extent address;
+ *	nblocks	- extent length;
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	EINVAL	- extent length not in 1..BPERDMAP, or blkno beyond the map
+ *	ENOSPC	- insufficient disk resources
+ *	EIO	- i/o error
+ */
+int dbAllocExact(struct inode *ip, s64 blkno, int nblocks)
+{
+	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
+	bmap_t *bmp = JFS_SBI(ip->i_sb)->bmap;
+	metapage_t *dmap_page;
+	s64 dmap_lblk;
+	int status;
+
+	IREAD_LOCK(ipbmap);
+
+	/*
+	 * validate extent request:
+	 *
+	 * note: defragfs policy:
+	 *  max 64 blocks will be moved.
+	 *  allocation request size must be satisfied from a single dmap.
+	 */
+	if (nblocks <= 0 || nblocks > BPERDMAP || blkno >= bmp->db_mapsize)
+		status = EINVAL;
+	else if (nblocks > ((s64) 1 << bmp->db_maxfreebud))
+		/* the free space is no longer available */
+		status = ENOSPC;
+	else
+		status = 0;
+
+	if (status) {
+		IREAD_UNLOCK(ipbmap);
+		return status;
+	}
+
+	/* bring in the dmap that covers the start of the extent. */
+	dmap_lblk = BLKTODMAP(blkno, bmp->db_l2nbperpage);
+	dmap_page = read_metapage(ipbmap, dmap_lblk, PSIZE, 0);
+	if (!dmap_page) {
+		IREAD_UNLOCK(ipbmap);
+		return (EIO);
+	}
+
+	/* attempt the allocation at precisely the requested address. */
+	status = dbAllocNext(bmp, (dmap_t *) dmap_page->data, blkno, nblocks);
+
+	IREAD_UNLOCK(ipbmap);
+
+	/* the dmap only needs writing if it was actually changed. */
+	if (status == 0) {
+		DBALLOC(bmp->db_DBmap, bmp->db_mapsize, blkno, nblocks);
+		mark_metapage_dirty(dmap_page);
+	}
+	release_metapage(dmap_page);
+
+	return (status);
+}
+
+
+/*
+ * NAME:	dbReAlloc()
+ *
+ * FUNCTION:	grow an existing allocation by a given number of blocks.
+ *
+ *		the extension is first attempted in place, i.e. by
+ *		allocating the blocks immediately following the current
+ *		allocation (dbExtend()).  if that fails for lack of
+ *		space, a brand new contiguous range big enough for the
+ *		existing allocation plus the additional blocks is
+ *		requested from dbAlloc(), using the last block of the
+ *		current allocation as the hint.
+ *
+ * PARAMETERS:
+ *	ip	    - pointer to in-core inode requiring allocation.
+ *	blkno	    - starting block of the current allocation.
+ *	nblocks	    - number of contiguous blocks within the current
+ *		      allocation.
+ *	addnblocks  - number of blocks to add to the allocation.
+ *	results	    - on successful return, set to the starting block
+ *		      number of the existing allocation if it was
+ *		      extended in place, or to that of the newly
+ *		      allocated contiguous range otherwise.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	ENOSPC	- insufficient disk resources
+ *	EIO	- i/o error
+ */
+int
+dbReAlloc(struct inode *ip,
+	  s64 blkno, s64 nblocks, s64 addnblocks, s64 * results)
+{
+	int status = dbExtend(ip, blkno, nblocks, addnblocks);
+
+	/* extended in place: the allocation still starts at blkno. */
+	if (status == 0) {
+		*results = blkno;
+		return (0);
+	}
+
+	/* anything other than "no room" is a hard failure. */
+	if (status != ENOSPC)
+		return (status);
+
+	/* no room behind the current allocation; look for a new home
+	 * large enough for the old blocks and the additional ones.
+	 */
+	return (dbAlloc(ip, blkno + nblocks - 1, addnblocks + nblocks,
+			results));
+}
+
+
+/*
+ * NAME:	dbExtend()
+ *
+ * FUNCTION:	try to grow an existing allocation in place, i.e. by
+ *		allocating the blocks that immediately follow it.
+ *
+ * PARAMETERS:
+ *	ip	    - pointer to in-core inode requiring allocation.
+ *	blkno	    - starting block of the current allocation.
+ *	nblocks	    - number of contiguous blocks within the current
+ *		      allocation.
+ *	addnblocks  - number of blocks to add to the allocation.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	ENOSPC	- insufficient disk resources
+ *	EIO	- i/o error
+ */
+int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
+{
+	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+	struct inode *ipbmap = sbi->ipbmap;
+	bmap_t *bmp;
+	metapage_t *dmap_page;
+	s64 dmap_lblk, last, next;
+	uint page_off;
+	int status;
+
+	/*
+	 * an extent that does not start on a page boundary must not be
+	 * grown across one.
+	 */
+	page_off = blkno & (sbi->nbperpage - 1);
+	if (page_off && page_off + nblocks + addnblocks > sbi->nbperpage)
+		return (ENOSPC);
+
+	/* last block currently owned, and the first one we want to add. */
+	last = blkno + nblocks - 1;
+	next = last + 1;
+
+	IREAD_LOCK(ipbmap);
+
+	/* the current allocation must lie inside the file system. */
+	bmp = sbi->bmap;
+	assert(last >= 0 && last < bmp->db_mapsize);
+
+	/* growing in place is only attempted when the additional blocks
+	 * fit into one dmap, the current allocation does not end at the
+	 * last block of the file system, and the extension would not
+	 * begin on an allocation group boundary.
+	 */
+	if (addnblocks > BPERDMAP)
+		status = ENOSPC;
+	else if (next >= bmp->db_mapsize)
+		status = ENOSPC;
+	else if ((next & (bmp->db_agsize - 1)) == 0)
+		status = ENOSPC;
+	else
+		status = 0;
+
+	if (status) {
+		IREAD_UNLOCK(ipbmap);
+		return (status);
+	}
+
+	/* read the dmap that holds the first block of the extension. */
+	dmap_lblk = BLKTODMAP(next, bmp->db_l2nbperpage);
+	dmap_page = read_metapage(ipbmap, dmap_lblk, PSIZE, 0);
+	if (!dmap_page) {
+		IREAD_UNLOCK(ipbmap);
+		return (EIO);
+	}
+
+	DBALLOCCK(bmp->db_DBmap, bmp->db_mapsize, blkno, nblocks);
+
+	/* go for the blocks right behind the current allocation. */
+	status = dbAllocNext(bmp, (dmap_t *) dmap_page->data, next,
+			     (int) addnblocks);
+
+	IREAD_UNLOCK(ipbmap);
+
+	if (status != 0) {
+		/* nothing was changed, just let go of the dmap. */
+		release_metapage(dmap_page);
+		assert(status == ENOSPC || status == EIO);
+		return (status);
+	}
+
+	/* the dmap was updated and has to be written. */
+	DBALLOC(bmp->db_DBmap, bmp->db_mapsize, next, addnblocks);
+	write_metapage(dmap_page);
+
+	return (status);
+}
+
+
+/*
+ * NAME: dbAllocNext()
+ *
+ * FUNCTION: attempt to allocate the blocks of the specified block
+ * range within a dmap.
+ *
+ * PARAMETERS:
+ * bmp - pointer to bmap descriptor
+ * dp - pointer to dmap.
+ * blkno - starting block number of the range.
+ * nblocks - number of contiguous free blocks of the range.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * ENOSPC - insufficient disk resources
+ * EIO - i/o error
+ *
+ * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
+ */
+static int dbAllocNext(bmap_t * bmp, dmap_t * dp, s64 blkno, int nblocks)
+{
+ int dbitno, word, rembits, nb, nwords, wbitno, nw;
+ int l2size;
+ s8 *leaf;
+ u32 mask;
+
+ /* pick up a pointer to the leaves of the dmap tree.
+ */
+ leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx);
+
+ /* determine the bit number and word within the dmap of the
+ * starting block.
+ */
+ dbitno = blkno & (BPERDMAP - 1);
+ word = dbitno >> L2DBWORD;
+
+ /* check if the specified block range is contained within
+ * this dmap. a range that spills into the next dmap is
+ * rejected with ENOSPC rather than split.
+ */
+ if (dbitno + nblocks > BPERDMAP)
+ return (ENOSPC);
+
+ /* check if the starting leaf indicates that anything
+ * is free.
+ */
+ if (leaf[word] == NOFREE)
+ return (ENOSPC);
+
+ /* check the dmaps words corresponding to block range to see
+ * if the block range is free. not all bits of the first and
+ * last words may be contained within the block range. if this
+ * is the case, we'll work against those words (i.e. partial first
+ * and/or last) on an individual basis (a single pass) and examine
+ * the actual bits to determine if they are free. a single pass
+ * will be used for all dmap words fully contained within the
+ * specified range. within this pass, the leaves of the dmap
+ * tree will be examined to determine if the blocks are free. a
+ * single leaf may describe the free space of multiple dmap
+ * words, so we may visit only a subset of the actual leaves
+ * corresponding to the dmap words of the block range.
+ *
+ * nothing is modified while checking; the allocation itself
+ * happens only after the whole range has been verified.
+ */
+ for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
+ /* determine the bit number within the word and
+ * the number of bits within the word.
+ */
+ wbitno = dbitno & (DBWORD - 1);
+ nb = min(rembits, DBWORD - wbitno);
+
+ /* check if only part of the word is to be examined.
+ */
+ if (nb < DBWORD) {
+ /* check if the bits are free: the mask selects nb
+ * bits starting at bit offset wbitno of the word
+ * (assuming ONES is an all-ones 32-bit value), and
+ * the range is free only if none of the selected
+ * bits is set in the working map word.
+ */
+ mask = (ONES << (DBWORD - nb) >> wbitno);
+ if ((mask & ~le32_to_cpu(dp->wmap[word])) != mask)
+ return (ENOSPC);
+
+ word += 1;
+ } else {
+ /* one or more dmap words are fully contained
+ * within the block range. determine how many
+ * words and how many bits. nb is recomputed so
+ * that the loop update steps over all of these
+ * words at once.
+ */
+ nwords = rembits >> L2DBWORD;
+ nb = nwords << L2DBWORD;
+
+ /* now examine the appropriate leaves to determine
+ * if the blocks are free.
+ */
+ while (nwords > 0) {
+ /* does the leaf describe any free space ?
+ * a value below BUDMIN means the word is
+ * not entirely free.
+ */
+ if (leaf[word] < BUDMIN)
+ return (ENOSPC);
+
+ /* determine the l2 number of bits provided
+ * by this leaf, capped by what is still
+ * needed.
+ */
+ l2size =
+ min((int)leaf[word], NLSTOL2BSZ(nwords));
+
+ /* determine how many words were handled.
+ */
+ nw = BUDSIZE(l2size, BUDMIN);
+
+ nwords -= nw;
+ word += nw;
+ }
+ }
+ }
+
+ /* allocate the blocks.
+ */
+ return (dbAllocDmap(bmp, dp, blkno, nblocks));
+}
+
+
+/*
+ * NAME: dbAllocNear()
+ *
+ * FUNCTION: try to allocate nblocks contiguous free blocks close to a
+ * hint block inside a single dmap.
+ *
+ * the leaf covering the hint and up to three leaves after it
+ * (never past the last leaf of the dmap) are inspected in
+ * order. the first leaf advertising at least l2nb worth of
+ * free space is used for the allocation; later leaves are
+ * not tried, even if that allocation attempt fails.
+ *
+ * PARAMETERS:
+ * bmp - pointer to bmap descriptor
+ * dp - pointer to dmap.
+ * blkno - block number to allocate near (the hint).
+ * nblocks - actual number of contiguous free blocks desired.
+ * l2nb - log2 number of contiguous free blocks desired.
+ * results - on successful return, set to the starting block number
+ * of the newly allocated range.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * ENOSPC - none of the examined leaves has sufficient free space
+ * EIO - i/o error
+ *
+ * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
+ */
+static int
+dbAllocNear(bmap_t * bmp,
+	    dmap_t * dp, s64 blkno, int nblocks, int l2nb, s64 * results)
+{
+	s8 *leaves = dp->tree.stree + le32_to_cpu(dp->tree.leafidx);
+	int first, limit, w, rc;
+
+	/* the search window: the dmap word holding the hint and the
+	 * following words, at most four in total and clipped at the
+	 * end of the dmap.
+	 */
+	first = (blkno & (BPERDMAP - 1)) >> L2DBWORD;
+	limit = min(first + 4, LPERDMAP);
+
+	/* skip leaves that do not describe enough free space.
+	 */
+	w = first;
+	while (w < limit && leaves[w] < l2nb)
+		w++;
+
+	/* window exhausted without a suitable leaf.
+	 */
+	if (w >= limit)
+		return (ENOSPC);
+
+	/* file system block number of the first block described by
+	 * the chosen dmap word.
+	 */
+	blkno = le64_to_cpu(dp->start) + (w << L2DBWORD);
+
+	/* when the word is only partially free, locate the start of a
+	 * suitable run of free bits inside it and offset the block
+	 * number accordingly.
+	 */
+	if (leaves[w] < BUDMIN)
+		blkno += dbFindBits(le32_to_cpu(dp->wmap[w]), l2nb);
+
+	/* allocate the blocks and report where they start.
+	 */
+	rc = dbAllocDmap(bmp, dp, blkno, nblocks);
+	if (rc == 0)
+		*results = blkno;
+
+	return (rc);
+}
+
+
+/*
+ * NAME: dbAllocAG()
+ *
+ * FUNCTION: attempt to allocate the specified number of contiguous
+ * free blocks within the specified allocation group.
+ *
+ * unless the allocation group size is equal to the number
+ * of blocks per dmap, the dmap control pages will be used to
+ * find the required free space, if available. we start the
+ * search at the highest dmap control page level which
+ * distinctly describes the allocation group's free space
+ * (i.e. the highest level at which the allocation group's
+ * free space is not mixed in with that of any other group).
+ * in addition, we start the search within this level at a
+ * height of the dmapctl dmtree at which the nodes distinctly
+ * describe the allocation group's free space. at this height,
+ * the allocation group's free space may be represented by 1
+ * or two sub-trees, depending on the allocation group size.
+ * we search the top nodes of these subtrees left to right for
+ * sufficient free space. if sufficient free space is found,
+ * the subtree is searched to find the leftmost leaf that
+ * has free space. once we have made it to the leaf, we
+ * move the search to the next lower level dmap control page
+ * corresponding to this leaf. we continue down the dmap control
+ * pages until we find the dmap that contains or starts the
+ * sufficient free space and we allocate at this dmap.
+ *
+ * if the allocation group size is equal to the dmap size,
+ * we'll start at the dmap corresponding to the allocation
+ * group and attempt the allocation at this level.
+ *
+ * the dmap control page search is also not performed if the
+ * allocation group is completely free and we go to the first
+ * dmap of the allocation group to do the allocation. this is
+ * done because the allocation group may be part (not the first
+ * part) of a larger binary buddy system, causing the dmap
+ * control pages to indicate no free space (NOFREE) within
+ * the allocation group.
+ *
+ * PARAMETERS:
+ * bmp - pointer to bmap descriptor
+ * agno - allocation group number.
+ * nblocks - actual number of contiguous free blocks desired.
+ * l2nb - log2 number of contiguous free blocks desired.
+ * results - on successful return, set to the starting block number
+ * of the newly allocated range.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * ENOSPC - insufficient disk resources
+ * EIO - i/o error
+ *
+ * note: IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static int
+dbAllocAG(bmap_t * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
+{
+ metapage_t *mp;
+ dmapctl_t *dcp;
+ int rc, ti, i, k, m, n, agperlev;
+ s64 blkno, lblkno;
+ int budmin;
+
+ /* allocation request should not be for more than the
+ * allocation group size.
+ */
+ assert(l2nb <= bmp->db_agl2size);
+
+ /* determine the starting block number of the allocation
+ * group.
+ */
+ blkno = (s64) agno << bmp->db_agl2size;
+
+ /* check if the allocation group size is the minimum allocation
+ * group size or if the allocation group is completely free. if
+ * the allocation group size is the minimum size of BPERDMAP (i.e.
+ * 1 dmap), there is no need to search the dmap control page (below)
+ * that fully describes the allocation group since the allocation
+ * group is already fully described by a dmap. in this case, we
+ * just call dbAllocCtl() to search the dmap tree and allocate the
+ * required space if available.
+ *
+ * if the allocation group is completely free, dbAllocCtl() is
+ * also called to allocate the required space. this is done for
+ * two reasons. first, it makes no sense searching the dmap control
+ * pages for free space when we know that free space exists. second,
+ * the dmap control pages may indicate that the allocation group
+ * has no free space if the allocation group is part (not the first
+ * part) of a larger binary buddy system.
+ */
+ if (bmp->db_agsize == BPERDMAP
+ || bmp->db_agfree[agno] == bmp->db_agsize) {
+ rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
+ /* assert(!(rc == ENOSPC && bmp->db_agfree[agno] == bmp->db_agsize)); */
+ if ((rc == ENOSPC) &&
+ (bmp->db_agfree[agno] == bmp->db_agsize)) {
+ jERROR(1,
+ ("dbAllocAG: removed assert, but still need to debug here\nblkno = 0x%Lx, nblocks = 0x%Lx\n",
+ (unsigned long long) blkno,
+ (unsigned long long) nblocks));
+ }
+ return (rc);
+ }
+
+ /* the buffer for the dmap control page that fully describes the
+ * allocation group. budmin is copied out of the page here because
+ * it is still needed after the page has been released below.
+ */
+ lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, bmp->db_aglevel);
+ mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
+ if (mp == NULL)
+ return (EIO);
+ dcp = (dmapctl_t *) mp->data;
+ budmin = dcp->budmin;
+
+ /* search the subtree(s) of the dmap control page that describes
+ * the allocation group, looking for sufficient free space. to begin,
+ * determine how many allocation groups are represented in a dmap
+ * control page at the control page level (i.e. L0, L1, L2) that
+ * fully describes an allocation group. next, determine the starting
+ * tree index of this allocation group within the control page.
+ */
+ agperlev =
+ (1 << (L2LPERCTL - (bmp->db_agheigth << 1))) / bmp->db_agwidth;
+ ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1));
+
+ /* dmap control page trees fan-out by 4 and a single allocation
+ * group may be described by 1 or 2 subtrees within the ag level
+ * dmap control page, depending upon the ag size. examine the ag's
+ * subtrees for sufficient free space, starting with the leftmost
+ * subtree. the first subtree with enough space ends the function:
+ * every path through the rest of the loop body returns.
+ */
+ for (i = 0; i < bmp->db_agwidth; i++, ti++) {
+ /* is there sufficient free space ?
+ */
+ if (l2nb > dcp->stree[ti])
+ continue;
+
+ /* sufficient free space found in a subtree. now search down
+ * the subtree to find the leftmost leaf that describes this
+ * free space. at each level the four children of node ti
+ * start at index (ti << 2) + 1, and one of them is asserted
+ * to satisfy the request.
+ */
+ for (k = bmp->db_agheigth; k > 0; k--) {
+ for (n = 0, m = (ti << 2) + 1; n < 4; n++) {
+ if (l2nb <= dcp->stree[m + n]) {
+ ti = m + n;
+ break;
+ }
+ }
+ assert(n < 4);
+ }
+
+ /* determine the block number within the file system
+ * that corresponds to this leaf: round blkno down to the
+ * start of the area covered by this control page, then add
+ * the offset of the leaf.
+ */
+ if (bmp->db_aglevel == 2)
+ blkno = 0;
+ else if (bmp->db_aglevel == 1)
+ blkno &= ~(MAXL1SIZE - 1);
+ else /* bmp->db_aglevel == 0 */
+ blkno &= ~(MAXL0SIZE - 1);
+
+ blkno +=
+ ((s64) (ti - le32_to_cpu(dcp->leafidx))) << budmin;
+
+ /* release the buffer in preparation for going down
+ * the next level of dmap control pages. dcp must not be
+ * used past this point.
+ */
+ release_metapage(mp);
+
+ /* check if we need to continue to search down the lower
+ * level dmap control pages. we need to if the number of
+ * blocks required is less than maximum number of blocks
+ * described at the next lower level.
+ */
+ if (l2nb < budmin) {
+
+ /* search the lower level dmap control pages to get
+ * the starting block number of the dmap that
+ * contains or starts off the free space.
+ */
+ if ((rc =
+ dbFindCtl(bmp, l2nb, bmp->db_aglevel - 1,
+ &blkno))) {
+ assert(rc != ENOSPC);
+ return (rc);
+ }
+ }
+
+ /* allocate the blocks.
+ */
+ rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
+ assert(rc != ENOSPC);
+ return (rc);
+ }
+
+ /* no space in the allocation group. release the buffer and
+ * return ENOSPC.
+ */
+ release_metapage(mp);
+
+ return (ENOSPC);
+}
+
+
+/*
+ * NAME: dbAllocAny()
+ *
+ * FUNCTION: try to allocate nblocks contiguous free blocks from any
+ * part of the file system.
+ *
+ * the dmap control pages are searched from the top level
+ * (bmp->db_maxlevel) downwards, starting at block 0, for
+ * free space of at least l2nb size. when the search
+ * succeeds, the allocation is carried out starting at the
+ * dmap it identified.
+ *
+ * PARAMETERS:
+ * bmp - pointer to bmap descriptor
+ * nblocks - actual number of contiguous free blocks desired.
+ * l2nb - log2 number of contiguous free blocks desired.
+ * results - on successful return, set to the starting block number
+ * of the newly allocated range.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * ENOSPC - insufficient disk resources
+ * EIO - i/o error
+ *
+ * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static int dbAllocAny(bmap_t * bmp, s64 nblocks, int l2nb, s64 * results)
+{
+	s64 dmapblk = 0;
+	int rc;
+
+	/* walk the control levels from the top. on success dmapblk
+	 * holds the first block of the dmap that contains or starts
+	 * the free range; any failure is passed straight back.
+	 */
+	rc = dbFindCtl(bmp, l2nb, bmp->db_maxlevel, &dmapblk);
+	if (rc != 0)
+		return (rc);
+
+	/* the control pages promised the space, so running out of
+	 * space here is not expected.
+	 */
+	rc = dbAllocCtl(bmp, nblocks, l2nb, dmapblk, results);
+	assert(rc != ENOSPC);
+
+	return (rc);
+}
+
+
+/*
+ * NAME: dbFindCtl()
+ *
+ * FUNCTION: starting at a specified dmap control page level and block
+ * number, search down the dmap control levels for a range of
+ * contiguous free blocks large enough to satisfy an allocation
+ * request for the specified number of free blocks.
+ *
+ * if sufficient contiguous free blocks are found, this routine
+ * returns the starting block number within a dmap page that
+ * contains or starts a range of contiguous free blocks that
+ * is sufficient in size.
+ *
+ * PARAMETERS:
+ * bmp - pointer to bmap descriptor
+ * level - starting dmap control page level.
+ * l2nb - log2 number of contiguous free blocks desired.
+ * *blkno - on entry, starting block number for conducting the search.
+ * on successful return, the first block within a dmap page
+ * that contains or starts a range of contiguous free blocks.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * ENOSPC - insufficient disk resources
+ * EIO - i/o error
+ *
+ * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static int dbFindCtl(bmap_t * bmp, int l2nb, int level, s64 * blkno)
+{
+ int rc, leafidx, lev;
+ s64 b, lblkno;
+ dmapctl_t *dcp;
+ int budmin;
+ metapage_t *mp;
+
+ /* starting at the specified dmap control page level and block
+ * number, search down the dmap control levels for the starting
+ * block number of a dmap page that contains or starts off
+ * sufficient free blocks. b accumulates the block number and is
+ * only stored back through blkno on success.
+ */
+ for (lev = level, b = *blkno; lev >= 0; lev--) {
+ /* get the buffer of the dmap control page for the block
+ * number and level (i.e. L0, L1, L2).
+ */
+ lblkno = BLKTOCTL(b, bmp->db_l2nbperpage, lev);
+ mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
+ if (mp == NULL)
+ return (EIO);
+ dcp = (dmapctl_t *) mp->data;
+ budmin = dcp->budmin;
+
+ /* search the tree within the dmap control page for
+ * sufficient free space. if sufficient free space is found,
+ * dbFindLeaf() returns the index of the leaf at which
+ * free space was found.
+ */
+ rc = dbFindLeaf((dmtree_t *) dcp, l2nb, &leafidx);
+
+ /* release the buffer. budmin was copied out above, so the
+ * page contents are not needed any more.
+ */
+ release_metapage(mp);
+
+ /* space found ? a miss is only expected at the starting
+ * level (asserted) and is reported as ENOSPC.
+ */
+ if (rc) {
+ assert(lev == level);
+ return (ENOSPC);
+ }
+
+ /* adjust the block number to reflect the location within
+ * the dmap control page (i.e. the leaf) at which free
+ * space was found.
+ */
+ b += (((s64) leafidx) << budmin);
+
+ /* we stop the search at this dmap control page level if
+ * the number of blocks required is greater than or equal
+ * to the maximum number of blocks described at the next
+ * (lower) level.
+ */
+ if (l2nb >= budmin)
+ break;
+ }
+
+ *blkno = b;
+ return (0);
+}
+
+
+/*
+ * NAME: dbAllocCtl()
+ *
+ * FUNCTION: attempt to allocate a specified number of contiguous
+ * blocks starting within a specific dmap.
+ *
+ * this routine is called by higher level routines that search
+ * the dmap control pages above the actual dmaps for contiguous
+ * free space. the result of successful searches by these
+ * routines are the starting block numbers within dmaps, with
+ * the dmaps themselves containing the desired contiguous free
+ * space or starting a contiguous free space of desired size
+ * that is made up of the blocks of one or more dmaps. these
+ * calls should not fail due to insufficient resources.
+ *
+ * this routine is called in some cases where it is not known
+ * whether it will fail due to insufficient resources. more
+ * specifically, this occurs when allocating from an allocation
+ * group whose size is equal to the number of blocks per dmap.
+ * in this case, the dmap control pages are not examined prior
+ * to calling this routine (to save pathlength) and the call
+ * might fail.
+ *
+ * for a request size that fits within a dmap, this routine relies
+ * upon the dmap's dmtree to find the requested contiguous free
+ * space. for request sizes that are larger than a dmap, the
+ * requested free space will start at the first block of the
+ * first dmap (i.e. blkno).
+ *
+ * PARAMETERS:
+ * bmp - pointer to bmap descriptor
+ * nblocks - actual number of contiguous free blocks to allocate.
+ * l2nb - log2 number of contiguous free blocks to allocate.
+ * blkno - starting block number of the dmap to start the allocation
+ * from.
+ * results - on successful return, set to the starting block number
+ * of the newly allocated range.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * ENOSPC - insufficient disk resources
+ * EIO - i/o error
+ *
+ * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static int
+dbAllocCtl(bmap_t * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
+{
+ int rc, nb;
+ s64 b, lblkno, n;
+ metapage_t *mp;
+ dmap_t *dp;
+
+ /* check if the allocation request is confined to a single dmap.
+ */
+ if (l2nb <= L2BPERDMAP) {
+ /* get the buffer for the dmap.
+ */
+ lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
+ mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
+ if (mp == NULL)
+ return (EIO);
+ dp = (dmap_t *) mp->data;
+
+ /* try to allocate the blocks. the page is marked dirty
+ * only when the allocation succeeded, and is released
+ * in either case.
+ */
+ rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results);
+ if (rc == 0)
+ mark_metapage_dirty(mp);
+
+ release_metapage(mp);
+
+ return (rc);
+ }
+
+ /* allocation request involving multiple dmaps. it must start on
+ * a dmap boundary.
+ */
+ assert((blkno & (BPERDMAP - 1)) == 0);
+
+ /* allocate the blocks dmap by dmap. n counts the blocks still
+ * to be allocated and b is the first block of the current dmap.
+ */
+ for (n = nblocks, b = blkno; n > 0; n -= nb, b += nb) {
+ /* get the buffer for the dmap.
+ */
+ lblkno = BLKTODMAP(b, bmp->db_l2nbperpage);
+ mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
+ if (mp == NULL) {
+ rc = EIO;
+ goto backout;
+ }
+ dp = (dmap_t *) mp->data;
+
+ /* the dmap better be all free.
+ */
+ assert(dp->tree.stree[ROOT] == L2BPERDMAP);
+
+ /* determine how many blocks to allocate from this dmap.
+ * only the last dmap of the request can be partial.
+ */
+ nb = min(n, (s64)BPERDMAP);
+
+ /* allocate the blocks from the dmap.
+ */
+ if ((rc = dbAllocDmap(bmp, dp, b, nb))) {
+ release_metapage(mp);
+ goto backout;
+ }
+
+ /* write the buffer.
+ */
+ write_metapage(mp);
+ }
+
+ /* set the results (starting block number) and return.
+ */
+ *results = blkno;
+ return (0);
+
+ /* something failed in handling an allocation request involving
+ * multiple dmaps. we'll try to clean up by backing out any
+ * allocation that has already happened for this request. if
+ * we fail in backing out the allocation, we'll mark the file
+ * system to indicate that blocks have been leaked.
+ */
+ backout:
+
+ /* try to backout the allocations dmap by dmap. on entry n is
+ * the number of blocks that were not yet allocated, so
+ * nblocks - n is the amount to give back; every dmap handled
+ * before the failure was allocated in full, hence BPERDMAP
+ * blocks are freed per dmap. rc keeps the original error.
+ */
+ for (n = nblocks - n, b = blkno; n > 0;
+ n -= BPERDMAP, b += BPERDMAP) {
+ /* get the buffer for this dmap.
+ */
+ lblkno = BLKTODMAP(b, bmp->db_l2nbperpage);
+ mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
+ if (mp == NULL) {
+ /* could not back out. mark the file system
+ * to indicate that we have leaked blocks.
+ */
+ fsDirty(); /* !!! */
+ jERROR(1,
+ ("dbAllocCtl: I/O Error: Block Leakage.\n"));
+ continue;
+ }
+ dp = (dmap_t *) mp->data;
+
+ /* free the blocks in this dmap.
+ */
+ if (dbFreeDmap(bmp, dp, b, BPERDMAP)) {
+ /* could not back out. mark the file system
+ * to indicate that we have leaked blocks.
+ */
+ release_metapage(mp);
+ fsDirty(); /* !!! */
+ jERROR(1, ("dbAllocCtl: Block Leakage.\n"));
+ continue;
+ }
+
+ /* write the buffer.
+ */
+ write_metapage(mp);
+ }
+
+ return (rc);
+}
+
+
+/*
+ * NAME: dbAllocDmapLev()
+ *
+ * FUNCTION: try to allocate nblocks contiguous blocks somewhere in the
+ * given dmap.
+ *
+ * the dmap's own tree is searched for a leaf describing at
+ * least l2nb worth of free space. if there is none, ENOSPC
+ * is returned; otherwise the blocks are allocated starting
+ * at the position found.
+ *
+ * PARAMETERS:
+ * bmp - pointer to bmap descriptor
+ * dp - pointer to dmap to attempt to allocate blocks from.
+ * nblocks - actual number of contiguous block desired.
+ * l2nb - log2 number of contiguous block desired.
+ * results - on successful return, set to the starting block number
+ * of the newly allocated range.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * ENOSPC - insufficient disk resources
+ * EIO - i/o error
+ *
+ * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or
+ * IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit;
+ */
+static int
+dbAllocDmapLev(bmap_t * bmp,
+	       dmap_t * dp, int nblocks, int l2nb, s64 * results)
+{
+	int leafidx, rc;
+	s64 start;
+
+	/* a single dmap cannot satisfy more than a dmap's worth */
+	assert(l2nb <= L2BPERDMAP);
+
+	/* look through the dmap's tree for a leaf with enough free
+	 * space; on success leafidx is the index of that leaf.
+	 */
+	if (dbFindLeaf((dmtree_t *) & dp->tree, l2nb, &leafidx) != 0)
+		return (ENOSPC);
+
+	/* translate the leaf into a file system block number.
+	 */
+	start = le64_to_cpu(dp->start) + (leafidx << L2DBWORD);
+
+	/* a leaf value below BUDMIN means the dmap word is only
+	 * partially free: find where the suitable run of free bits
+	 * begins inside the word and move the start there.
+	 */
+	if (dp->tree.stree[leafidx + LEAFIND] < BUDMIN)
+		start += dbFindBits(le32_to_cpu(dp->wmap[leafidx]), l2nb);
+
+	/* allocate the blocks; the start is reported only on success */
+	rc = dbAllocDmap(bmp, dp, start, nblocks);
+	if (rc != 0)
+		return (rc);
+
+	*results = start;
+	return (0);
+}
+
+
+/*
+ * NAME: dbAllocDmap()
+ *
+ * FUNCTION: mark a block range within a dmap as allocated and keep the
+ * dmap control pages in step.
+ *
+ * the bits are allocated by dbAllocBits(). if that changes
+ * the root of the dmap's dmtree (i.e. the maximum string of
+ * free blocks in the dmap), the new root value is propagated
+ * upwards by dbAdjCtl(). should the propagation fail, the
+ * bits are freed again so that the dmap and the control
+ * pages stay consistent.
+ *
+ * PARAMETERS:
+ * bmp - pointer to bmap descriptor
+ * dp - pointer to dmap to allocate the block range from.
+ * blkno - starting block number of the block to be allocated.
+ * nblocks - number of blocks to be allocated.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * EIO - i/o error
+ *
+ * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static int dbAllocDmap(bmap_t * bmp, dmap_t * dp, s64 blkno, int nblocks)
+{
+	/* remember the tree root (maximum free string) as it was
+	 * before the allocation.
+	 */
+	s8 root_before = dp->tree.stree[ROOT];
+	int rc;
+
+	/* allocate the specified (blocks) bits */
+	dbAllocBits(bmp, dp, blkno, nblocks);
+
+	/* only a changed root has to be pushed up to the dmap control
+	 * pages. undo the bit allocation if that fails.
+	 */
+	if (dp->tree.stree[ROOT] != root_before) {
+		rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 1, 0);
+		if (rc)
+			dbFreeBits(bmp, dp, blkno, nblocks);
+		return (rc);
+	}
+
+	return (0);
+}
+
+
+/*
+ * NAME: dbFreeDmap()
+ *
+ * FUNCTION: adjust the disk allocation map to reflect the deallocation
+ * of a specified block range within a dmap.
+ *
+ * this routine frees the specified blocks from the dmap through
+ * a call to dbFreeBits(). if the deallocation of the block range
+ * causes the maximum string of free blocks within the dmap to
+ * change (i.e. the value of the root of the dmap's dmtree), this
+ * routine will cause this change to be reflected up through the
+ * appropriate levels of the dmap control pages by a call to
+ * dbAdjCtl() for the L0 dmap control page that covers this dmap.
+ *
+ * PARAMETERS:
+ * bmp - pointer to bmap descriptor
+ * dp - pointer to dmap to free the block range from.
+ * blkno - starting block number of the block to be freed.
+ * nblocks - number of blocks to be freed.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * EIO - i/o error
+ *
+ * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static int dbFreeDmap(bmap_t * bmp, dmap_t * dp, s64 blkno, int nblocks)
+{
+ s8 oldroot;
+ int rc, word;
+
+ /* save the current value of the root (i.e. maximum free string)
+ * of the dmap tree.
+ */
+ oldroot = dp->tree.stree[ROOT];
+
+ /* free the specified (blocks) bits */
+ dbFreeBits(bmp, dp, blkno, nblocks);
+
+ /* if the root has not changed, done. */
+ if (dp->tree.stree[ROOT] == oldroot)
+ return (0);
+
+ /* root changed. bubble the change up to the dmap control pages.
+ * if the adjustment of the upper level control pages fails,
+ * backout the deallocation by allocating the bits again; the
+ * error from dbAdjCtl() is still returned to the caller.
+ */
+ if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 0, 0))) {
+ word = (blkno & (BPERDMAP - 1)) >> L2DBWORD;
+
+ /* as part of backing out the deallocation, we will have
+ * to back split the dmap tree if the deallocation caused
+ * the freed blocks to become part of a larger binary buddy
+ * system.
+ *
+ * NOTE(review): word is a dmap word number, yet it indexes
+ * stree[] directly here, whereas dbAllocDmapLev() reads a
+ * leaf as stree[leafidx + LEAFIND] -- confirm that this
+ * test looks at the intended tree node.
+ */
+ if (dp->tree.stree[word] == NOFREE)
+ dbBackSplit((dmtree_t *) & dp->tree, word);
+
+ dbAllocBits(bmp, dp, blkno, nblocks);
+ }
+
+ return (rc);
+}
+
+
+/*
+ * NAME: dbAllocBits()
+ *
+ * FUNCTION: allocate a specified block range from a dmap.
+ *
+ * this routine updates the dmap to reflect the working
+ * state allocation of the specified block range. it directly
+ * updates the bits of the working map and causes the adjustment
+ * of the binary buddy system described by the dmap's dmtree
+ * leaves to reflect the bits allocated. it also causes the
+ * dmap's dmtree, as a whole, to reflect the allocated range.
+ *
+ * PARAMETERS:
+ * bmp - pointer to bmap descriptor
+ * dp - pointer to dmap to allocate bits from.
+ * blkno - starting block number of the bits to be allocated.
+ * nblocks - number of bits to be allocated.
+ *
+ * RETURN VALUES: none
+ *
+ * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static void dbAllocBits(bmap_t * bmp, dmap_t * dp, s64 blkno, int nblocks)
+{
+ int dbitno, word, rembits, nb, nwords, wbitno, nw, agno;
+ dmtree_t *tp = (dmtree_t *) & dp->tree;
+ int size;
+ s8 *leaf;
+
+ /* pick up a pointer to the leaves of the dmap tree */
+ leaf = dp->tree.stree + LEAFIND;
+
+ /* determine the bit number and word within the dmap of the
+ * starting block.
+ */
+ dbitno = blkno & (BPERDMAP - 1);
+ word = dbitno >> L2DBWORD;
+
+ /* block range better be within the dmap */
+ assert(dbitno + nblocks <= BPERDMAP);
+
+ /* allocate the bits of the dmap's words corresponding to the block
+ * range. not all bits of the first and last words may be contained
+ * within the block range. if this is the case, we'll work against
+ * those words (i.e. partial first and/or last) on an individual basis
+ * (a single pass), allocating the bits of interest by hand and
+ * updating the leaf corresponding to the dmap word. a single pass
+ * will be used for all dmap words fully contained within the
+ * specified range. within this pass, the bits of all fully contained
+ * dmap words will be marked as allocated in a single shot and the
+ * leaves will be updated. a single leaf may describe the free space
+ * of multiple dmap words, so we may update only a subset of the actual
+ * leaves corresponding to the dmap words of the block range.
+ */
+ for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
+ /* determine the bit number within the word and
+ * the number of bits within the word.
+ */
+ wbitno = dbitno & (DBWORD - 1);
+ nb = min(rembits, DBWORD - wbitno);
+
+ /* check if only part of a word is to be allocated.
+ */
+ if (nb < DBWORD) {
+ /* allocate (set to 1) the appropriate bits within
+ * this dmap word. the mask is built in cpu byte
+ * order and converted before being or-ed into the
+ * little-endian map word.
+ */
+ dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb)
+ >> wbitno);
+
+ /* update the leaf for this dmap word. in addition
+ * to setting the leaf value to the binary buddy max
+ * of the updated dmap word, dbSplit() will split
+ * the binary system of the leaves if need be.
+ */
+ dbSplit(tp, word, BUDMIN,
+ dbMaxBud((u8 *) & dp->wmap[word]));
+
+ word += 1;
+ } else {
+ /* one or more dmap words are fully contained
+ * within the block range. determine how many
+ * words and allocate (set to 1) the bits of these
+ * words.
+ */
+ nwords = rembits >> L2DBWORD;
+ memset(&dp->wmap[word], (int) ONES, nwords * 4);
+
+ /* determine how many bits. nb is recomputed so
+ * that the outer loop update steps over all of
+ * these words at once.
+ */
+ nb = nwords << L2DBWORD;
+
+ /* now update the appropriate leaves to reflect
+ * the allocated words.
+ */
+ for (; nwords > 0; nwords -= nw) {
+ assert(leaf[word] >= BUDMIN);
+
+ /* determine what the leaf value should be
+ * updated to as the minimum of the l2 number
+ * of bits being allocated and the l2 number
+ * of bits currently described by this leaf.
+ */
+ size = min((int)leaf[word], NLSTOL2BSZ(nwords));
+
+ /* update the leaf to reflect the allocation.
+ * in addition to setting the leaf value to
+ * NOFREE, dbSplit() will split the binary
+ * system of the leaves to reflect the current
+ * allocation (size).
+ */
+ dbSplit(tp, word, size, NOFREE);
+
+ /* get the number of dmap words handled */
+ nw = BUDSIZE(size, BUDMIN);
+ word += nw;
+ }
+ }
+ }
+
+ /* update the free count for this dmap */
+ dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks);
+
+ BMAP_LOCK(bmp);
+
+ /* if the allocation group being allocated from has a higher
+ * number than the current maximum allocation group number
+ * (db_maxag), it becomes the new max.
+ */
+ agno = blkno >> bmp->db_agl2size;
+ if (agno > bmp->db_maxag)
+ bmp->db_maxag = agno;
+
+ /* update the free count for the allocation group and map */
+ bmp->db_agfree[agno] -= nblocks;
+ bmp->db_nfree -= nblocks;
+
+ BMAP_UNLOCK(bmp);
+}
+
+
+/*
+ * NAME: dbFreeBits()
+ *
+ * FUNCTION: free a specified block range from a dmap.
+ *
+ * this routine updates the dmap to reflect the working
+ * state deallocation of the specified block range. it directly
+ * updates the bits of the working map and causes the adjustment
+ * of the binary buddy system described by the dmap's dmtree
+ * leaves to reflect the bits freed. it also causes the dmap's
+ * dmtree, as a whole, to reflect the deallocated range.
+ *
+ * besides the dmap itself (wmap, tree and nfree), the bmap
+ * descriptor's free counts (db_nfree, db_agfree[]) are raised
+ * and db_maxag / db_agpref may be lowered; those bmap updates
+ * are made between BMAP_LOCK(bmp) and BMAP_UNLOCK(bmp).
+ *
+ * PARAMETERS:
+ * bmp - pointer to bmap descriptor
+ * dp - pointer to dmap to free bits from.
+ * blkno - starting block number of the bits to be freed.
+ * nblocks - number of bits to be freed. the range must end within
+ * this dmap (asserted below).
+ *
+ * RETURN VALUES: none
+ *
+ * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static void dbFreeBits(bmap_t * bmp, dmap_t * dp, s64 blkno, int nblocks)
+{
+ int dbitno, word, rembits, nb, nwords, wbitno, nw, agno;
+ dmtree_t *tp = (dmtree_t *) & dp->tree;
+ int size;
+
+ /* determine the bit number and word within the dmap of the
+ * starting block.
+ */
+ dbitno = blkno & (BPERDMAP - 1);
+ word = dbitno >> L2DBWORD;
+
+ /* block range better be within the dmap.
+ */
+ assert(dbitno + nblocks <= BPERDMAP);
+
+ /* free the bits of the dmaps words corresponding to the block range.
+ * not all bits of the first and last words may be contained within
+ * the block range. if this is the case, we'll work against those
+ * words (i.e. partial first and/or last) on an individual basis
+ * (a single pass), freeing the bits of interest by hand and updating
+ * the leaf corresponding to the dmap word. a single pass will be used
+ * for all dmap words fully contained within the specified range.
+ * within this pass, the bits of all fully contained dmap words will
+ * be marked as free in a single shot and the leaves will be updated. a
+ * single leaf may describe the free space of multiple dmap words,
+ * so we may update only a subset of the actual leaves corresponding
+ * to the dmap words of the block range.
+ *
+ * dbJoin() is used to update leaf values and will join the binary
+ * buddy system of the leaves if the new leaf values indicate this
+ * should be done.
+ *
+ * each trip through the loop consumes nb bits: either the part of
+ * one word that lies in the range, or a whole run of full words.
+ */
+ for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
+ /* determine the bit number within the word and
+ * the number of bits within the word.
+ */
+ wbitno = dbitno & (DBWORD - 1);
+ nb = min(rembits, DBWORD - wbitno);
+
+ /* check if only part of a word is to be freed.
+ */
+ if (nb < DBWORD) {
+ /* free (zero) the appropriate bits within this
+ * dmap word. the mask built here is nb one-bits
+ * starting wbitno bits in from the high-order end
+ * of the word (assuming ONES is the all-ones 32-bit
+ * constant - confirm against its definition); its
+ * complement clears exactly those bits. the word is
+ * stored little-endian, hence cpu_to_le32().
+ */
+ dp->wmap[word] &=
+ cpu_to_le32(~(ONES << (DBWORD - nb)
+ >> wbitno));
+
+ /* update the leaf for this dmap word with the
+ * largest free buddy size now found in the word.
+ */
+ dbJoin(tp, word,
+ dbMaxBud((u8 *) & dp->wmap[word]));
+
+ word += 1;
+ } else {
+ /* one or more dmap words are fully contained
+ * within the block range. determine how many
+ * words and free (zero) the bits of these words.
+ * (nwords * 4: the byte count handed to memset(),
+ * four bytes per dmap word.)
+ */
+ nwords = rembits >> L2DBWORD;
+ memset(&dp->wmap[word], 0, nwords * 4);
+
+ /* determine how many bits.
+ */
+ nb = nwords << L2DBWORD;
+
+ /* now update the appropriate leaves to reflect
+ * the freed words. each pass handles nw words
+ * with a single dbJoin() call.
+ */
+ for (; nwords > 0; nwords -= nw) {
+ /* determine what the leaf value should be
+ * updated to as the minimum of the l2 number
+ * of bits being freed and the l2 (max) number
+ * of bits that can be described by this leaf.
+ */
+ size =
+ min(LITOL2BSZ
+ (word, L2LPERDMAP, BUDMIN),
+ NLSTOL2BSZ(nwords));
+
+ /* update the leaf.
+ */
+ dbJoin(tp, word, size);
+
+ /* get the number of dmap words handled.
+ */
+ nw = BUDSIZE(size, BUDMIN);
+ word += nw;
+ }
+ }
+ }
+
+ /* update the free count for this dmap.
+ */
+ dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks);
+
+ BMAP_LOCK(bmp);
+
+ /* update the free count for the allocation group and
+ * map.
+ */
+ agno = blkno >> bmp->db_agl2size;
+ bmp->db_nfree += nblocks;
+ bmp->db_agfree[agno] += nblocks;
+
+ /* check if this allocation group is now completely free and
+ * is currently the maximum (rightmost) allocation group, or if
+ * it is the last allocation group of the map and its free count
+ * equals (db_mapsize & (BPERDMAP - 1)).
+ * if so, establish the new maximum allocation group number by
+ * searching left for the first allocation group with allocation
+ * (the search stops at group 0 at the latest).
+ */
+ if ((bmp->db_agfree[agno] == bmp->db_agsize
+ && agno == bmp->db_maxag) || (agno == bmp->db_numag - 1
+ && bmp->db_agfree[agno] ==
+ (bmp-> db_mapsize &
+ (BPERDMAP - 1)))) {
+ while (bmp->db_maxag > 0) {
+ bmp->db_maxag -= 1;
+ if (bmp->db_agfree[bmp->db_maxag] !=
+ bmp->db_agsize)
+ break;
+ }
+
+ /* re-establish the allocation group preference if the
+ * current preference is right of the maximum allocation
+ * group.
+ */
+ if (bmp->db_agpref > bmp->db_maxag)
+ bmp->db_agpref = bmp->db_maxag;
+ }
+
+ BMAP_UNLOCK(bmp);
+}
+
+
+/*
+ * NAME: dbAdjCtl()
+ *
+ * FUNCTION: adjust a dmap control page at a specified level to reflect
+ * the change in a lower level dmap or dmap control page's
+ * maximum string of free blocks (i.e. a change in the root
+ * of the lower level object's dmtree) due to the allocation
+ * or deallocation of a range of blocks with a single dmap.
+ *
+ * on entry, this routine is provided with the new value of
+ * the lower level dmap or dmap control page root and the
+ * starting block number of the block range whose allocation
+ * or deallocation resulted in the root change. this range
+ * is represented by a single leaf of the current dmapctl
+ * and the leaf will be updated with this value, possibly
+ * causing a binary buddy system within the leaves to be
+ * split or joined. the update may also cause the dmapctl's
+ * dmtree to be updated.
+ *
+ * if the adjustment of the dmap control page, itself, causes its
+ * root to change, this change will be bubbled up to the next dmap
+ * control level by a recursive call to this routine, specifying
+ * the new root value and the next dmap control page level to
+ * be adjusted. if that recursive call fails, the change made to
+ * this level's page is backed out before the error is returned.
+ *
+ * at the top level (level == bmp->db_maxlevel) a root change is
+ * recorded in bmp->db_maxfreebud instead.
+ *
+ * PARAMETERS:
+ * bmp - pointer to bmap descriptor
+ * blkno - the first block of a block range within a dmap. it is
+ * the allocation or deallocation of this block range that
+ * requires the dmap control page to be adjusted.
+ * newval - the new value of the lower level dmap or dmap control
+ * page root.
+ * alloc - TRUE if adjustment is due to an allocation.
+ * level - current level of dmap control page (i.e. L0, L1, L2) to
+ * be adjusted.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * EIO - i/o error (control page could not be read)
+ * other - whatever non-zero value the recursive call returned
+ *
+ * note that error values are returned as positive numbers.
+ *
+ * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static int
+dbAdjCtl(bmap_t * bmp, s64 blkno, int newval, int alloc, int level)
+{
+ metapage_t *mp;
+ s8 oldroot;
+ int oldval;
+ s64 lblkno;
+ dmapctl_t *dcp;
+ int rc, leafno, ti;
+
+ /* get the buffer for the dmap control page for the specified
+ * block number and control page level.
+ */
+ lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, level);
+ mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
+ if (mp == NULL)
+ return (EIO);
+ dcp = (dmapctl_t *) mp->data;
+
+ /* determine the leaf number corresponding to the block and
+ * the index within the dmap control tree.
+ */
+ leafno = BLKTOCTLLEAF(blkno, dcp->budmin);
+ ti = leafno + le32_to_cpu(dcp->leafidx);
+
+ /* save the current leaf value and the current root level (i.e.
+ * maximum l2 free string described by this dmapctl). both are
+ * needed later: oldroot to detect a root change, oldval to back
+ * out this level's update if the higher level update fails.
+ */
+ oldval = dcp->stree[ti];
+ oldroot = dcp->stree[ROOT];
+
+ /* check if this is a control page update for an allocation.
+ * if so, update the leaf to reflect the new leaf value using
+ * dbSplit(); otherwise (deallocation), use dbJoin() to update
+ * the leaf with the new value. in addition to updating the
+ * leaf, dbSplit() will also split the binary buddy system of
+ * the leaves, if required, and bubble new values within the
+ * dmapctl tree, if required. similarly, dbJoin() will join
+ * the binary buddy system of leaves and bubble new values up
+ * the dmapctl tree as required by the new leaf value.
+ */
+ if (alloc) {
+ /* check if we are in the middle of a binary buddy
+ * system. this happens when we are performing the
+ * first allocation out of an allocation group that
+ * is part (not the first part) of a larger binary
+ * buddy system. if we are in the middle, back split
+ * the system prior to calling dbSplit() which assumes
+ * that it is at the front of a binary buddy system.
+ */
+ if (oldval == NOFREE) {
+ dbBackSplit((dmtree_t *) dcp, leafno);
+ /* the back split gave this leaf a value of its
+ * own; that is the value to restore on backout.
+ */
+ oldval = dcp->stree[ti];
+ }
+ dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval);
+ } else {
+ dbJoin((dmtree_t *) dcp, leafno, newval);
+ }
+
+ /* check if the root of the current dmap control page changed due
+ * to the update and if the current dmap control page is not at
+ * the current top level (i.e. L0, L1, L2) of the map. if so (i.e.
+ * root changed and this is not the top level), call this routine
+ * again (recursion) for the next higher level of the mapping to
+ * reflect the change in root for the current dmap control page.
+ */
+ if (dcp->stree[ROOT] != oldroot) {
+ /* are we below the top level of the map. if so,
+ * bubble the root up to the next higher level.
+ */
+ if (level < bmp->db_maxlevel) {
+ /* bubble up the new root of this dmap control page to
+ * the next level.
+ */
+ if ((rc =
+ dbAdjCtl(bmp, blkno, dcp->stree[ROOT], alloc,
+ level + 1))) {
+ /* something went wrong in bubbling up the new
+ * root value, so backout the changes to the
+ * current dmap control page.
+ */
+ if (alloc) {
+ dbJoin((dmtree_t *) dcp, leafno,
+ oldval);
+ } else {
+ /* the dbJoin() above might have
+ * caused a larger binary buddy system
+ * to form and we may now be in the
+ * middle of it. if this is the case,
+ * back split the buddies.
+ */
+ if (dcp->stree[ti] == NOFREE)
+ dbBackSplit((dmtree_t *)
+ dcp, leafno);
+ dbSplit((dmtree_t *) dcp, leafno,
+ dcp->budmin, oldval);
+ }
+
+ /* release the buffer and return the error.
+ * the page is released rather than written
+ * on this path.
+ */
+ release_metapage(mp);
+ return (rc);
+ }
+ } else {
+ /* we're at the top level of the map. update
+ * the bmap control page to reflect the size
+ * of the maximum free buddy system.
+ */
+ assert(level == bmp->db_maxlevel);
+ assert(bmp->db_maxfreebud == oldroot);
+ bmp->db_maxfreebud = dcp->stree[ROOT];
+ }
+ }
+
+ /* write the buffer.
+ */
+ write_metapage(mp);
+
+ return (0);
+}
+
+
+/*
+ * NAME:	dbSplit()
+ *
+ * FUNCTION:	store a new value in one leaf of a dmtree, first breaking
+ *		up the binary buddy system that starts at that leaf when
+ *		the leaf currently describes more than a single leaf's
+ *		worth of blocks.
+ *
+ * PARAMETERS:
+ *	tp	- pointer to the tree containing the leaf.
+ *	leafno	- the number of the leaf to be updated.
+ *	splitsz	- the size the binary buddy system starting at the leaf
+ *		  must be split to, specified as the log2 number of blocks.
+ *	newval	- the new value for the leaf.
+ *
+ * RETURN VALUES: none
+ *
+ * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
+{
+	s8 *leaves = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
+	int l2sz;
+	int span;
+
+	/* a leaf no larger than the tree's minimum buddy size stands
+	 * alone, so there is nothing to split.
+	 */
+	if (leaves[leafno] > tp->dmt_budmin) {
+		/* halve the system repeatedly: at each step the buddy
+		 * 'span' leaves away takes over one half (l2 size l2sz),
+		 * starting one size below the current leaf value and
+		 * stopping once the requested split size has been done.
+		 */
+		l2sz = leaves[leafno] - 1;
+		for (span = BUDSIZE(l2sz, tp->dmt_budmin);
+		     l2sz >= splitsz; l2sz--, span >>= 1)
+			dbAdjTree(tp, leafno ^ span, l2sz);
+	}
+
+	/* finally give the specified leaf its new value and let the
+	 * tree above it follow.
+	 */
+	dbAdjTree(tp, leafno, newval);
+}
+
+
+/*
+ * NAME: dbBackSplit()
+ *
+ * FUNCTION: back split the binary buddy system of dmtree leaves
+ * that hold a specified leaf until the specified leaf
+ * starts its own binary buddy system.
+ *
+ * the allocators typically perform allocations at the start
+ * of binary buddy systems and dbSplit() is used to accomplish
+ * any required splits. in some cases, however, allocation
+ * may occur in the middle of a binary system and requires a
+ * back split, with the split proceeding out from the middle of
+ * the system (less efficient) rather than the start of the
+ * system (more efficient). the cases in which a back split
+ * is required are rare and are limited to the first allocation
+ * within an allocation group which is a part (not first part)
+ * of a larger binary buddy system and a few exception cases
+ * in which a previous join operation must be backed out.
+ *
+ * on entry the leaf must hold NOFREE; on exit it holds the
+ * maximum l2 size possible for its position (both asserted).
+ *
+ * PARAMETERS:
+ * tp - pointer to the tree containing the leaf.
+ * leafno - the number of the leaf to be updated.
+ *
+ * RETURN VALUES: none
+ *
+ * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static void dbBackSplit(dmtree_t * tp, int leafno)
+{
+ int budsz, bud, w, bsz, size;
+ int cursz;
+ s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
+
+ /* leaf should be part (not first part) of a binary
+ * buddy system.
+ */
+ assert(leaf[leafno] == NOFREE);
+
+ /* the back split is accomplished by iteratively finding the leaf
+ * that starts the buddy system that contains the specified leaf and
+ * splitting that system in two. this iteration continues until
+ * the specified leaf becomes the start of a buddy system.
+ *
+ * determine maximum possible l2 size for the specified leaf.
+ */
+ size =
+ LITOL2BSZ(leafno, le32_to_cpu(tp->dmt_l2nleafs),
+ tp->dmt_budmin);
+
+ /* determine the number of leaves covered by this size. this
+ * is the buddy size that we will start with as we search for
+ * the buddy system that contains the specified leaf.
+ */
+ budsz = BUDSIZE(size, tp->dmt_budmin);
+
+ /* back split. each pass of the outer loop halves the system
+ * that currently contains leafno; we are done once leafno has
+ * a value of its own.
+ */
+ while (leaf[leafno] == NOFREE) {
+ /* find the leftmost buddy leaf. w is the leftmost leaf
+ * examined so far and bsz the buddy distance being tried;
+ * after an unsuccessful try the distance is doubled and w
+ * moves to the smaller of itself and the buddy just tried.
+ */
+ for (w = leafno, bsz = budsz;; bsz <<= 1,
+ w = (w < bud) ? w : bud) {
+ /* the search must end before the buddy distance
+ * reaches the number of leaves in the tree.
+ */
+ assert(bsz < le32_to_cpu(tp->dmt_nleafs));
+
+ /* determine the buddy.
+ */
+ bud = w ^ bsz;
+
+ /* check if this buddy is the start of the system.
+ */
+ if (leaf[bud] != NOFREE) {
+ /* split the leaf at the start of the
+ * system in two: one l2 size smaller is
+ * both the split size and the value left
+ * in the starting leaf.
+ */
+ cursz = leaf[bud] - 1;
+ dbSplit(tp, bud, cursz, cursz);
+ break;
+ }
+ }
+ }
+
+ assert(leaf[leafno] == size);
+}
+
+
+/*
+ * NAME:	dbJoin()
+ *
+ * FUNCTION:	store a new value in one leaf of a dmtree and, where the
+ *		value allows it, merge the leaf with its buddies into a
+ *		larger multi-leaf binary buddy system.
+ *
+ *		a merge happens whenever the buddy at the current size
+ *		holds exactly the same value. the left member of a merged
+ *		pair carries the combined size (value + 1) and may merge
+ *		again one size up; the right member is set to NOFREE.
+ *		merging stops at the first buddy holding a smaller value
+ *		or when the system would span every leaf of the tree.
+ *
+ * PARAMETERS:
+ *	tp	- pointer to the tree containing the leaf.
+ *	leafno	- the number of the leaf to be updated.
+ *	newval	- the new value for the leaf.
+ *
+ * RETURN VALUES: none
+ */
+static void dbJoin(dmtree_t * tp, int leafno, int newval)
+{
+	s8 *leaves;
+	int span, peer;
+
+	/* values below the tree's minimum buddy size can never take
+	 * part in a join; just record the value.
+	 */
+	if (newval < tp->dmt_budmin) {
+		dbAdjTree(tp, leafno, newval);
+		return;
+	}
+
+	leaves = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
+
+	/* 'span' is the number of leaves covered by a system of the
+	 * current size; the buddy sits that many leaves away.
+	 */
+	for (span = BUDSIZE(newval, tp->dmt_budmin);
+	     span < le32_to_cpu(tp->dmt_nleafs); span <<= 1, newval++) {
+		peer = leafno ^ span;
+
+		/* buddy is smaller than us: no further join possible.
+		 */
+		if (newval > leaves[peer])
+			break;
+
+		/* a buddy is not expected to be larger.
+		 */
+		assert(newval == leaves[peer]);
+
+		/* whichever of the pair lies to the right gives up its
+		 * blocks; the left one continues as the joined system.
+		 */
+		if (leafno < peer) {
+			dbAdjTree(tp, peer, NOFREE);
+		} else {
+			dbAdjTree(tp, leafno, NOFREE);
+			leafno = peer;
+		}
+	}
+
+	/* record the (possibly grown) value in the surviving leaf.
+	 */
+	dbAdjTree(tp, leafno, newval);
+}
+
+
+/*
+ * NAME:	dbAdjTree()
+ *
+ * FUNCTION:	store a new value in one leaf of a dmtree and propagate
+ *		the change towards the root: every ancestor is set to the
+ *		maximum of its four children until an ancestor is found
+ *		that already holds the right value.
+ *		any splitting or joining of buddies must already have
+ *		been done by the caller.
+ *
+ * PARAMETERS:
+ *	tp	- pointer to the tree to be adjusted.
+ *	leafno	- the number of the leaf to be updated.
+ *	newval	- the new value for the leaf.
+ *
+ * RETURN VALUES: none
+ */
+static void dbAdjTree(dmtree_t * tp, int leafno, int newval)
+{
+	int node, first, parent, level;
+	int biggest;
+
+	/* translate the leaf number into an index into the tree array.
+	 */
+	node = leafno + le32_to_cpu(tp->dmt_leafidx);
+
+	/* unchanged leaf: the tree is already correct.
+	 */
+	if (tp->dmt_stree[node] == newval)
+		return;
+
+	tp->dmt_stree[node] = newval;
+
+	/* walk upwards, at most one step per level of the tree.
+	 */
+	for (level = 0; level < le32_to_cpu(tp->dmt_height); level++) {
+		/* index of the first of the four siblings that include
+		 * 'node', and of their common parent.
+		 */
+		first = ((node - 1) & ~0x03) + 1;
+		parent = (first - 1) >> 2;
+
+		/* the parent must hold the largest sibling value.
+		 */
+		biggest = TREEMAX(&tp->dmt_stree[first]);
+
+		/* parent already correct: nothing higher up changes.
+		 */
+		if (tp->dmt_stree[parent] == biggest)
+			break;
+
+		tp->dmt_stree[parent] = biggest;
+
+		/* continue from the parent.
+		 */
+		node = parent;
+	}
+}
+
+
+/*
+ * NAME:	dbFindLeaf()
+ *
+ * FUNCTION:	search a dmtree_t for sufficient free blocks, returning
+ *		the index of a leaf describing the free blocks if
+ *		sufficient free blocks are found.
+ *
+ *		the root is checked first; the search then descends one
+ *		level at a time, always taking the leftmost of the four
+ *		children whose value is large enough.
+ *
+ * PARAMETERS:
+ *	tp	- pointer to the tree to be searched.
+ *	l2nb	- log2 number of free blocks to search for.
+ *	leafidx	- return pointer to be set to the index of the leaf
+ *		  describing at least l2nb free blocks if sufficient
+ *		  free blocks are found. not written on failure.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	ENOSPC	- insufficient free blocks.
+ */
+static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx)
+{
+	int base = 0;		/* index of first node of current group */
+	int slot = 0;		/* position chosen within the group */
+	int next = 1;		/* first node of the group to visit next */
+	int depth;
+
+	/* the root holds the largest free size in the whole tree.
+	 */
+	if (l2nb > tp->dmt_stree[ROOT])
+		return (ENOSPC);
+
+	/* descend one level per iteration, starting with the root's
+	 * four children.
+	 */
+	depth = le32_to_cpu(tp->dmt_height);
+	while (depth > 0) {
+		base = next;
+
+		/* skip children that are too small, left to right.
+		 */
+		slot = 0;
+		while (slot < 4 && l2nb > tp->dmt_stree[base + slot])
+			slot++;
+
+		/* the parent promised that one of the four fits.
+		 */
+		assert(slot < 4);
+
+		/* first child of the node just chosen.
+		 */
+		next = ((base + slot) << 2) + 1;
+		depth--;
+	}
+
+	/* convert the tree index of the chosen node to a leaf number.
+	 */
+	*leafidx = base + slot - le32_to_cpu(tp->dmt_leafidx);
+
+	return (0);
+}
+
+
+/*
+ * NAME:	dbFindBits()
+ *
+ * FUNCTION:	find a specified number of binary buddy free bits within a
+ *		dmap bitmap word value.
+ *
+ *		this routine searches the bitmap value for (1 << l2nb) free
+ *		bits at (1 << l2nb) alignments within the value. the search
+ *		starts with the window produced by ONES << (DBWORD - nb)
+ *		(bit number 0) and moves that window right nb bits at a
+ *		time.
+ *
+ * PARAMETERS:
+ *	word	- dmap bitmap word value (0 bits are free).
+ *	l2nb	- number of free bits specified as a log2 number;
+ *		  (1 << l2nb) must not exceed DBWORD (asserted).
+ *
+ * RETURN VALUES:
+ *	starting bit number of free bits. if no suitably aligned run
+ *	exists, the bit number reaches DBWORD and fails the ASSERT()
+ *	at the end of the routine.
+ */
+static int dbFindBits(u32 word, int l2nb)
+{
+	int bitno, nb;
+	u32 mask, window;
+
+	/* get the number of bits.
+	 */
+	nb = 1 << l2nb;
+	assert(nb <= DBWORD);
+
+	/* complement the word so we can use a mask (i.e. after the
+	 * complement 1s represent free bits) and compute the mask for
+	 * the first window.
+	 */
+	word = ~word;
+	mask = ONES << (DBWORD - nb);
+
+	/* scan the word for nb free bits at nb alignments.
+	 *
+	 * the loop is bounded by the bit number and the window is
+	 * derived from the initial mask with a shift count that is
+	 * always below DBWORD. shifting the mask itself by nb each
+	 * round would shift a 32-bit value by 32 when nb == DBWORD,
+	 * which is undefined in C and need not clear the mask.
+	 */
+	for (bitno = 0; bitno < DBWORD; bitno += nb) {
+		window = mask >> bitno;
+		if ((window & word) == window)
+			break;
+	}
+
+	ASSERT(bitno < 32);
+
+	/* return the bit number.
+	 */
+	return (bitno);
+}
+
+
+/*
+ * NAME:	dbMaxBud(u8 *cp)
+ *
+ * FUNCTION:	determine the largest binary buddy string of free
+ *		bits within 32-bits of the map.
+ *
+ * PARAMETERS:
+ *	cp	- pointer to the 32-bit value, accessed here as one
+ *		  32-bit word, two 16-bit halves and four bytes.
+ *
+ * RETURN VALUES:
+ *	largest binary buddy of free bits within a dmap word:
+ *	BUDMIN if the whole word is zero, BUDMIN - 1 if either 16-bit
+ *	half is zero, otherwise the largest budtab[] entry of the
+ *	four bytes.
+ */
+static int dbMaxBud(u8 * cp)
+{
+	u16 *half = (u16 *) cp;
+	signed char second_pair, first_pair;
+
+	/* a completely free word is one buddy of the maximum size.
+	 */
+	if (*((uint *) cp) == 0)
+		return (BUDMIN);
+
+	/* one free half gives a buddy one size smaller.
+	 */
+	if (half[0] == 0 || half[1] == 0)
+		return (BUDMIN - 1);
+
+	/* neither the word nor a half is free: look each byte up in
+	 * budtab[] and take the best of the four.
+	 */
+	second_pair = max(budtab[cp[2]], budtab[cp[3]]);
+	first_pair = max(budtab[cp[0]], budtab[cp[1]]);
+	return (max(second_pair, first_pair));
+}
+
+
+/*
+ * NAME:	cnttz(u32 word)
+ *
+ * FUNCTION:	determine the number of trailing zeros within a 32-bit
+ *		value.
+ *
+ * PARAMETERS:
+ *	word	- 32-bit value to be examined.
+ *
+ * RETURN VALUES:
+ *	count of trailing zeros (32 when word is 0)
+ */
+int cnttz(u32 word)
+{
+	int n = 0;
+
+	/* drop low-order bits until a 1 bit reaches bit 0 or all
+	 * 32 positions have been examined.
+	 */
+	while (n < 32 && !(word & 0x01)) {
+		word >>= 1;
+		n++;
+	}
+
+	return (n);
+}
+
+
+/*
+ * NAME:	cntlz(u32 value)
+ *
+ * FUNCTION:	determine the number of leading zeros within a 32-bit
+ *		value.
+ *
+ * PARAMETERS:
+ *	value	- 32-bit value to be examined.
+ *
+ * RETURN VALUES:
+ *	count of leading zeros (32 when value is 0)
+ */
+int cntlz(u32 value)
+{
+	int n = 0;
+
+	/* shift left until the HIGHORDER bit is set or all 32
+	 * positions have been examined.
+	 */
+	while (n < 32 && !(value & HIGHORDER)) {
+		value <<= 1;
+		n++;
+	}
+
+	return (n);
+}
+
+
+/*
+ * NAME:	blkstol2(s64 nb)
+ *
+ * FUNCTION:	convert a block count to its log2 value. if the block
+ *		count is not a l2 multiple, it is rounded up to the next
+ *		larger l2 multiple.
+ *
+ *		the count is examined as an unsigned 64-bit pattern, so
+ *		the scan needs neither a left shift into the sign bit nor
+ *		a right shift of a negative value.
+ *
+ * PARAMETERS:
+ *	nb	- number of blocks; must not be 0.
+ *
+ * RETURN VALUES:
+ *	log2 number of blocks. a count of 0 has no log2: assert(0)
+ *	is tripped and 0 is returned.
+ */
+int blkstol2(s64 nb)
+{
+	unsigned long long val = (unsigned long long) nb;
+	unsigned long long bit = 1ULL << (64 - 1);
+	int l2nb;
+
+	/* look for the leading 1 bit, from the top down.
+	 */
+	for (l2nb = 64 - 1; l2nb >= 0; l2nb--, bit >>= 1) {
+		/* leading bit found; its position is the l2 value.
+		 */
+		if (val & bit) {
+			/* any 1 bit below the leading bit means the
+			 * count is not a power of 2: round up.
+			 */
+			if (val & (bit - 1))
+				l2nb++;
+
+			return (l2nb);
+		}
+	}
+	assert(0);
+	return 0;		/* fix compiler warning */
+}
+
+
+/*
+ * NAME: fsDirty()
+ *
+ * FUNCTION: log the message "fsDirty(): bye-bye" with printk() and
+ * then trip assert(0). what a failed assert() does is decided by
+ * the assert() definition in use, which is not part of this routine.
+ *
+ * PARAMETERS:
+ * none
+ *
+ * RETURN VALUES:
+ * none
+ */
+void fsDirty()
+{
+ printk("fsDirty(): bye-bye\n");
+ assert(0);
+}
+
+
+/*
+ * NAME:	dbAllocBottomUp()
+ *
+ * FUNCTION:	alloc the specified block range from the working block
+ *		allocation map.
+ *
+ *		the blocks will be alloc from the working map one dmap
+ *		at a time. each dmap buffer is written as soon as its
+ *		part of the range has been allocated, before the next
+ *		dmap is read.
+ *
+ * PARAMETERS:
+ *	ip	- pointer to in-core inode;
+ *	blkno	- starting block number to be allocated.
+ *	nblocks	- number of blocks to be allocated. a count of 0 (or
+ *		  less) touches no dmap and returns success.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	EIO	- i/o error (a dmap page could not be read)
+ *	other	- non-zero value returned by dbAllocDmapBU()
+ *
+ *	on error, dmaps that were already processed stay allocated.
+ *
+ * serialization: takes and drops IREAD_LOCK(ipbmap) itself.
+ */
+int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks)
+{
+	metapage_t *mp;
+	dmap_t *dp;
+	int nb, rc;
+	s64 lblkno, rem;
+	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
+	bmap_t *bmp = JFS_SBI(ip->i_sb)->bmap;
+
+	IREAD_LOCK(ipbmap);
+
+	/* block to be allocated better be within the mapsize. */
+	ASSERT(nblocks <= bmp->db_mapsize - blkno);
+
+	/*
+	 * allocate the blocks a dmap at a time.
+	 */
+	for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) {
+		/* get the buffer for the current dmap. */
+		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
+		mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
+		if (mp == NULL) {
+			IREAD_UNLOCK(ipbmap);
+			return (EIO);
+		}
+		dp = (dmap_t *) mp->data;
+
+		/* determine the number of blocks to be allocated from
+		 * this dmap: up to the end of the dmap or the end of
+		 * the range, whichever comes first.
+		 */
+		nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1)));
+
+		DBFREECK(bmp->db_DBmap, bmp->db_mapsize, blkno, nb);
+
+		/* allocate the blocks. */
+		if ((rc = dbAllocDmapBU(bmp, dp, blkno, nb))) {
+			release_metapage(mp);
+			IREAD_UNLOCK(ipbmap);
+			return (rc);
+		}
+
+		DBALLOC(bmp->db_DBmap, bmp->db_mapsize, blkno, nb);
+
+		/* done with this dmap: write its buffer. writing here
+		 * rather than after the loop means no buffer is ever
+		 * written when the loop body did not run.
+		 */
+		write_metapage(mp);
+	}
+
+	IREAD_UNLOCK(ipbmap);
+
+	return (0);
+}
+
+
+/*
+ * NAME:	dbAllocDmapBU()
+ *
+ * FUNCTION:	allocate a block range within a single dmap, bottom up:
+ *		the bits of the working map are set first, the dmap's
+ *		summary tree is then rebuilt from scratch with
+ *		dbInitDmapTree(), and a change of the tree's root is
+ *		passed up to the dmap control pages with dbAdjCtl().
+ *
+ * PARAMETERS:
+ *	bmp	- pointer to bmap descriptor
+ *	dp	- pointer to the dmap to allocate from.
+ *	blkno	- starting block number of the range.
+ *	nblocks	- number of blocks; the range must end within this
+ *		  dmap (asserted).
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	other	- non-zero value returned by dbAdjCtl(); the bits are
+ *		  freed again with dbFreeBits() before returning.
+ *
+ * serialization: BMAP_LOCK(bmp) is taken around the bmap counter updates.
+ */
+static int dbAllocDmapBU(bmap_t * bmp, dmap_t * dp, s64 blkno, int nblocks)
+{
+	int rc;
+	int dbitno, word, rembits, nb, nwords, wbitno, agno;
+	s8 oldroot;
+	dmaptree_t *tp = (dmaptree_t *) & dp->tree;
+
+	/* save the current value of the root (i.e. maximum free string)
+	 * of the dmap tree.
+	 */
+	oldroot = tp->stree[ROOT];
+
+	/* determine the bit number and word within the dmap of the
+	 * starting block.
+	 */
+	dbitno = blkno & (BPERDMAP - 1);
+	word = dbitno >> L2DBWORD;
+
+	/* block range better be within the dmap */
+	assert(dbitno + nblocks <= BPERDMAP);
+
+	/* allocate the bits of the dmap's words corresponding to the block
+	 * range. not all bits of the first and last words may be contained
+	 * within the block range. those partial words are handled one at
+	 * a time by or-ing in a mask; all words fully contained within the
+	 * range are marked as allocated in a single shot. the leaves are
+	 * not touched here, the whole tree is rebuilt below.
+	 */
+	for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
+		/* determine the bit number within the word and
+		 * the number of bits within the word.
+		 */
+		wbitno = dbitno & (DBWORD - 1);
+		nb = min(rembits, DBWORD - wbitno);
+
+		/* check if only part of a word is to be allocated.
+		 */
+		if (nb < DBWORD) {
+			/* allocate (set to 1) the appropriate bits within
+			 * this dmap word.
+			 */
+			dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb)
+						      >> wbitno);
+
+			word += 1;
+		} else {
+			/* one or more dmap words are fully contained
+			 * within the block range. determine how many
+			 * words and allocate (set to 1) the bits of these
+			 * words.
+			 */
+			nwords = rembits >> L2DBWORD;
+			memset(&dp->wmap[word], (int) ONES, nwords * 4);
+
+			/* determine how many bits */
+			nb = nwords << L2DBWORD;
+
+			/* step past the words just filled so that a
+			 * trailing partial word lands in the word that
+			 * follows them.
+			 */
+			word += nwords;
+		}
+	}
+
+	/* update the free count for this dmap */
+	dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks);
+
+	/* reconstruct summary tree */
+	dbInitDmapTree(dp);
+
+	BMAP_LOCK(bmp);
+
+	/* raise the highest active allocation group number if the
+	 * group just allocated from lies to the right of it.
+	 */
+	agno = blkno >> bmp->db_agl2size;
+	if (agno > bmp->db_maxag)
+		bmp->db_maxag = agno;
+
+	/* update the free count for the allocation group and map */
+	bmp->db_agfree[agno] -= nblocks;
+	bmp->db_nfree -= nblocks;
+
+	BMAP_UNLOCK(bmp);
+
+	/* if the root has not changed, done. */
+	if (tp->stree[ROOT] == oldroot)
+		return (0);
+
+	/* root changed. bubble the change up to the dmap control pages.
+	 * if the adjustment of the upper level control pages fails,
+	 * backout the bit allocation (thus making everything consistent).
+	 */
+	if ((rc = dbAdjCtl(bmp, blkno, tp->stree[ROOT], 1, 0)))
+		dbFreeBits(bmp, dp, blkno, nblocks);
+
+	return (rc);
+}
+
+
+/*
+ * NAME: dbExtendFS()
+ *
+ * FUNCTION: extend bmap from blkno for nblocks;
+ * dbExtendFS() updates bmap ready for dbAllocBottomUp();
+ *
+ * L2
+ * |
+ * L1---------------------------------L1
+ * | |
+ * L0---------L0---------L0 L0---------L0---------L0
+ * | | | | | |
+ * d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,.,dm;
+ * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm
+ *
+ * <---old---><----------------------------extend----------------------->
+ */
+int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
+{
+ struct jfs_sb_info *sbi = JFS_SBI(ipbmap->i_sb);
+ int nbperpage = sbi->nbperpage;
+ int i, i0 = TRUE, j, j0 = TRUE, k, n;
+ s64 newsize;
+ s64 p;
+ metapage_t *mp, *l2mp, *l1mp, *l0mp;
+ dmapctl_t *l2dcp, *l1dcp, *l0dcp;
+ dmap_t *dp;
+ s8 *l0leaf, *l1leaf, *l2leaf;
+ bmap_t *bmp = sbi->bmap;
+ int agno, l2agsize, oldl2agsize;
+ s64 ag_rem;
+
+ newsize = blkno + nblocks;
+
+ jEVENT(0, ("dbExtendFS: blkno:%Ld nblocks:%Ld newsize:%Ld\n",
+ (long long) blkno, (long long) nblocks,
+ (long long) newsize));
+
+ /*
+ * initialize bmap control page.
+ *
+ * all the data in bmap control page should exclude
+ * the mkfs hidden dmap page.
+ */
+
+ /* update mapsize */
+ bmp->db_mapsize = newsize;
+ bmp->db_maxlevel = BMAPSZTOLEV(bmp->db_mapsize);
+
+ /* compute new AG size */
+ l2agsize = dbGetL2AGSize(newsize);
+ oldl2agsize = bmp->db_agl2size;
+
+ bmp->db_agl2size = l2agsize;
+ bmp->db_agsize = 1 << l2agsize;
+
+ /* compute new number of AG */
+ agno = bmp->db_numag;
+ bmp->db_numag = newsize >> l2agsize;
+ bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0;
+
+ /*
+ * reconfigure db_agfree[]
+ * from old AG configuration to new AG configuration;
+ *
+ * coalesce contiguous k (newAGSize/oldAGSize) AGs;
+ * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn;
+ * note: new AG size = old AG size * (2**x).
+ */
+ if (l2agsize == oldl2agsize)
+ goto extend;
+ k = 1 << (l2agsize - oldl2agsize);
+ ag_rem = bmp->db_agfree[0]; /* save agfree[0] */
+ for (i = 0, n = 0; i < agno; n++) {
+ bmp->db_agfree[n] = 0; /* init collection point */
+
+ /* coalesce cotiguous k AGs; */
+ for (j = 0; j < k && i < agno; j++, i++) {
+ /* merge AGi to AGn */
+ bmp->db_agfree[n] += bmp->db_agfree[i];
+ }
+ }
+ bmp->db_agfree[0] += ag_rem; /* restore agfree[0] */
+
+ for (; n < MAXAG; n++)
+ bmp->db_agfree[n] = 0;
+
+ /*
+ * update highest active ag number
+ */
+
+ bmp->db_maxag = bmp->db_maxag / k;
+
+ /*
+ * extend bmap
+ *
+ * update bit maps and corresponding level control pages;
+ * global control page db_nfree, db_agfree[agno], db_maxfreebud;
+ */
+ extend:
+ /* get L2 page */
+ p = BMAPBLKNO + nbperpage; /* L2 page */
+ l2mp = read_metapage(ipbmap, p, PSIZE, 0);
+ assert(l2mp);
+ l2dcp = (dmapctl_t *) l2mp->data;
+
+ /* compute start L1 */
+ k = blkno >> L2MAXL1SIZE;
+ l2leaf = l2dcp->stree + CTLLEAFIND + k;
+ p = BLKTOL1(blkno, sbi->l2nbperpage); /* L1 page */
+
+ /*
+ * extend each L1 in L2
+ */
+ for (; k < LPERCTL; k++, p += nbperpage) {
+ /* get L1 page */
+ if (j0) {
+ /* read in L1 page: (blkno & (MAXL1SIZE - 1)) */
+ l1mp = read_metapage(ipbmap, p, PSIZE, 0);
+ if (l1mp == NULL)
+ goto errout;
+ l1dcp = (dmapctl_t *) l1mp->data;
+
+ /* compute start L0 */
+ j = (blkno & (MAXL1SIZE - 1)) >> L2MAXL0SIZE;
+ l1leaf = l1dcp->stree + CTLLEAFIND + j;
+ p = BLKTOL0(blkno, sbi->l2nbperpage);
+ j0 = FALSE;
+ } else {
+ /* assign/init L1 page */
+ l1mp = get_metapage(ipbmap, p, PSIZE, 0);
+ if (l1mp == NULL)
+ goto errout;
+
+ l1dcp = (dmapctl_t *) l1mp->data;
+
+ /* compute start L0 */
+ j = 0;
+ l1leaf = l1dcp->stree + CTLLEAFIND;
+ p += nbperpage; /* 1st L0 of L1.k */
+ }
+
+ /*
+ * extend each L0 in L1
+ */
+ for (; j < LPERCTL; j++) {
+ /* get L0 page */
+ if (i0) {
+ /* read in L0 page: (blkno & (MAXL0SIZE - 1)) */
+
+ l0mp = read_metapage(ipbmap, p, PSIZE, 0);
+ if (l0mp == NULL)
+ goto errout;
+ l0dcp = (dmapctl_t *) l0mp->data;
+
+ /* compute start dmap */
+ i = (blkno & (MAXL0SIZE - 1)) >>
+ L2BPERDMAP;
+ l0leaf = l0dcp->stree + CTLLEAFIND + i;
+ p = BLKTODMAP(blkno,
+ sbi->l2nbperpage);
+ i0 = FALSE;
+ } else {
+ /* assign/init L0 page */
+ l0mp = get_metapage(ipbmap, p, PSIZE, 0);
+ if (l0mp == NULL)
+ goto errout;
+
+ l0dcp = (dmapctl_t *) l0mp->data;
+
+ /* compute start dmap */
+ i = 0;
+ l0leaf = l0dcp->stree + CTLLEAFIND;
+ p += nbperpage; /* 1st dmap of L0.j */
+ }
+
+ /*
+ * extend each dmap in L0
+ */
+ for (; i < LPERCTL; i++) {
+ /*
+ * reconstruct the dmap page, and
+ * initialize corresponding parent L0 leaf
+ */
+ if ((n = blkno & (BPERDMAP - 1))) {
+ /* read in dmap page: */
+ mp = read_metapage(ipbmap, p,
+ PSIZE, 0);
+ if (mp == NULL)
+ goto errout;
+ n = min(nblocks, (s64)BPERDMAP - n);
+ } else {
+ /* assign/init dmap page */
+ mp = read_metapage(ipbmap, p,
+ PSIZE, 0);
+ if (mp == NULL)
+ goto errout;
+
+ n = min(nblocks, (s64)BPERDMAP);
+ }
+
+ dp = (dmap_t *) mp->data;
+ *l0leaf = dbInitDmap(dp, blkno, n);
+
+ bmp->db_nfree += n;
+ agno = le64_to_cpu(dp->start) >> l2agsize;
+ bmp->db_agfree[agno] += n;
+
+ write_metapage(mp);
+
+ l0leaf++;
+ p += nbperpage;
+
+ blkno += n;
+ nblocks -= n;
+ if (nblocks == 0)
+ break;
+ } /* for each dmap in a L0 */
+
+ /*
+ * build current L0 page from its leaves, and
+ * initialize corresponding parent L1 leaf
+ */
+ *l1leaf = dbInitDmapCtl(l0dcp, 0, ++i);
+ write_metapage(l0mp);
+
+ if (nblocks)
+ l1leaf++; /* continue for next L0 */
+ else {
+ /* more than 1 L0 ? */
+ if (j > 0)
+ break; /* build L1 page */
+ else {
+ /* summarize in global bmap page */
+ bmp->db_maxfreebud = *l1leaf;
+ release_metapage(l1mp);
+ release_metapage(l2mp);
+ goto finalize;
+ }
+ }
+ } /* for each L0 in a L1 */
+
+ /*
+ * build current L1 page from its leaves, and
+ * initialize corresponding parent L2 leaf
+ */
+ *l2leaf = dbInitDmapCtl(l1dcp, 1, ++j);
+ write_metapage(l1mp);
+
+ if (nblocks)
+ l2leaf++; /* continue for next L1 */
+ else {
+ /* more than 1 L1 ? */
+ if (k > 0)
+ break; /* build L2 page */
+ else {
+ /* summarize in global bmap page */
+ bmp->db_maxfreebud = *l2leaf;
+ release_metapage(l2mp);
+ goto finalize;
+ }
+ }
+ } /* for each L1 in a L2 */
+
+ assert(0);
+
+ /*
+ * finalize bmap control page
+ */
+ finalize:
+
+ return 0;
+
+ errout:
+ return EIO;
+}
+
+
+/*
+ * dbFinalizeBmap()
+ *
+ * Recompute the allocation-group hints kept in the in-memory bmap
+ * control page: db_agpref, db_aglevel, db_agheigth, db_agwidth and
+ * db_agstart.
+ *
+ * PARAMETERS:
+ * ipbmap - block allocation map inode; the bmap descriptor is taken
+ * from JFS_SBI(ipbmap->i_sb)->bmap
+ *
+ * RETURNS: nothing
+ */
+void dbFinalizeBmap(struct inode *ipbmap)
+{
+ bmap_t *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
+ int actags, inactags, l2nl;
+ s64 ag_rem, actfree, inactfree, avgfree;
+ int i, n;
+
+ /*
+ * finalize bmap control page
+ */
+//finalize:
+ /*
+ * compute db_agpref: preferred ag to allocate from
+ * (the leftmost ag with average free space in it);
+ */
+//agpref:
+ /* get the number of active ags and inactive ags */
+ actags = bmp->db_maxag + 1;
+ inactags = bmp->db_numag - actags;
+ /* map size modulo the ag size, i.e. blocks in a trailing partial ag */
+ ag_rem = bmp->db_mapsize & (bmp->db_agsize - 1); /* ??? */
+
+ /* determine how many blocks are in the inactive allocation
+ * groups. in doing this, we must account for the fact that
+ * the rightmost group might be a partial group (i.e. file
+ * system size is not a multiple of the group size).
+ */
+ inactfree = (inactags && ag_rem) ?
+ ((inactags - 1) << bmp->db_agl2size) + ag_rem
+ : inactags << bmp->db_agl2size;
+
+ /* determine how many free blocks are in the active
+ * allocation groups plus the average number of free blocks
+ * within the active ags.
+ *
+ * NOTE(review): actfree is an s64 but is truncated to u32 before
+ * the division; confirm the value always fits in 32 bits.
+ */
+ actfree = bmp->db_nfree - inactfree;
+ avgfree = (u32) actfree / (u32) actags;
+
+ /* if the preferred allocation group does not have average free
+ * space, re-establish the preferred group as the leftmost
+ * group with average free space.
+ *
+ * NOTE(review): if no ag in [0, actags) qualifies, the loop leaves
+ * db_agpref == actags, while the assert only compares against
+ * db_numag; confirm that bound is the intended one.
+ */
+ if (bmp->db_agfree[bmp->db_agpref] < avgfree) {
+ for (bmp->db_agpref = 0; bmp->db_agpref < actags;
+ bmp->db_agpref++) {
+ if (bmp->db_agfree[bmp->db_agpref] >= avgfree)
+ break;
+ }
+ assert(bmp->db_agpref < bmp->db_numag);
+ }
+
+ /*
+ * compute db_aglevel, db_agheigth, db_agwidth, db_agstart:
+ * an ag is covered in aglevel dmapctl summary tree,
+ * at agheight level height (from leaf) with agwidth number of nodes
+ * each, which starts at agstart index node of the summary tree node
+ * array;
+ */
+ bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize);
+ /* l2 number of leaves of the aglevel dmapctl spanned by one ag */
+ l2nl =
+ bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL);
+ bmp->db_agheigth = l2nl >> 1;
+ bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheigth << 1));
+ /* agstart = 1 + 4 + 16 + ... summed over (5 - agheigth) terms */
+ for (i = 5 - bmp->db_agheigth, bmp->db_agstart = 0, n = 1; i > 0;
+ i--) {
+ bmp->db_agstart += n;
+ n <<= 2;
+ }
+
+/*
+printk("bmap: agpref:%d aglevel:%d agheigth:%d agwidth:%d\n",
+ bmp->db_agpref, bmp->db_aglevel, bmp->db_agheigth, bmp->db_agwidth);
+*/
+}
+
+
+/*
+ * NAME:	dbInitDmap()/ujfs_idmap_page()
+ *
+ * FUNCTION:	initialize working/persistent bitmap of the dmap page
+ *		for the specified number of blocks, then rebuild the
+ *		summary tree of the dmap:
+ *
+ *	at entry, the bitmaps had been initialized as free (ZEROS);
+ *	The number of blocks will only account for the actually
+ *	existing blocks. Blocks which don't actually exist in
+ *	the aggregate will be marked as allocated (ONES);
+ *
+ * PARAMETERS:
+ *	dp	- pointer to page of map
+ *	Blkno	- first block of the range; its offset within the dmap is
+ *		  Blkno & (BPERDMAP - 1).  When that offset is zero the
+ *		  page header (nblocks, nfree, start) is initialized,
+ *		  otherwise nblocks is added to the existing counters.
+ *	nblocks	- number of blocks of the range within this page
+ *
+ * RETURNS:	the value returned by dbInitDmapTree() for this page
+ */
+static int dbInitDmap(dmap_t * dp, s64 Blkno, int nblocks)
+{
+	int blkno, w, b, r, nw, nb, i;
+
+	/* starting block number within the dmap */
+	blkno = Blkno & (BPERDMAP - 1);
+
+	if (blkno == 0) {
+		/* range starts the dmap: set up the page header */
+		dp->nblocks = dp->nfree = cpu_to_le32(nblocks);
+		dp->start = cpu_to_le64(Blkno);
+
+		if (nblocks == BPERDMAP) {
+			/* the whole dmap is covered: every word is free */
+			memset(&dp->wmap[0], 0, LPERDMAP * 4);
+			memset(&dp->pmap[0], 0, LPERDMAP * 4);
+			goto initTree;
+		}
+	} else {
+		/* range continues a dmap already in use: grow its counters */
+		dp->nblocks =
+		    cpu_to_le32(le32_to_cpu(dp->nblocks) + nblocks);
+		dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks);
+	}
+
+	/* word number containing start block number */
+	w = blkno >> L2DBWORD;
+
+	/*
+	 * free the bits corresponding to the block range (ZEROS):
+	 * note: not all bits of the first and last words may be contained
+	 * within the block range.
+	 */
+	for (r = nblocks; r > 0; r -= nb, blkno += nb) {
+		/* number of bits preceding range to be freed in the word */
+		b = blkno & (DBWORD - 1);
+		/* number of bits to free in the word */
+		nb = min(r, DBWORD - b);
+
+		/* is partial word to be freed ? */
+		if (nb < DBWORD) {
+			/* free (set to 0) from the bitmap word */
+			dp->wmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb)
+						     >> b));
+			dp->pmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb)
+						     >> b));
+
+			/* skip the word freed */
+			w++;
+		} else {
+			/* free (set to 0) contiguous bitmap words */
+			nw = r >> L2DBWORD;
+			memset(&dp->wmap[w], 0, nw * 4);
+			memset(&dp->pmap[w], 0, nw * 4);
+
+			/* skip the words freed */
+			nb = nw << L2DBWORD;
+			w += nw;
+		}
+	}
+
+	/*
+	 * mark bits following the range to be freed (non-existing
+	 * blocks) as allocated (ONES)
+	 */
+
+	/* range ran to the end of the dmap: nothing left to mark */
+	if (blkno == BPERDMAP)
+		goto initTree;
+
+	/* the first word beyond the end of existing blocks */
+	w = blkno >> L2DBWORD;
+
+	/* does nblocks fall on a 32-bit boundary ? */
+	b = blkno & (DBWORD - 1);
+	if (b) {
+		/* mark a partial word allocated */
+		dp->wmap[w] = dp->pmap[w] = cpu_to_le32(ONES >> b);
+		w++;
+	}
+
+	/*
+	 * set the rest of the words in the page to allocated (ONES);
+	 * stored through cpu_to_le32() like every other map word here.
+	 */
+	for (i = w; i < LPERDMAP; i++)
+		dp->pmap[i] = dp->wmap[i] = cpu_to_le32(ONES);
+
+	/*
+	 * init tree
+	 */
+      initTree:
+	return (dbInitDmapTree(dp));
+}
+
+
+/*
+ * NAME:	dbInitDmapTree()/ujfs_complete_dmap()
+ *
+ * FUNCTION:	fill in the summary tree of a dmap whose bitmap has
+ *		already been initialized: set the fixed tree header,
+ *		derive each leaf from the matching wmap word with
+ *		dbMaxBud(), then let dbInitTree() build the tree.
+ *
+ * PARAMETERS:
+ *	dp	- dmap to complete
+ *
+ * RETURNS:	the value returned by dbInitTree() (max free string at
+ *		the root of the tree)
+ */
+static int dbInitDmapTree(dmap_t * dp)
+{
+	dmaptree_t *tree = &dp->tree;
+	s8 *leaves;
+	int word;
+
+	/* fixed tree geometry of a dmap */
+	tree->budmin = BUDMIN;
+	tree->height = cpu_to_le32(4);
+	tree->leafidx = cpu_to_le32(LEAFIND);
+	tree->l2nleafs = cpu_to_le32(L2LPERDMAP);
+	tree->nleafs = cpu_to_le32(LPERDMAP);
+
+	/* one leaf per working-map word, value supplied by dbMaxBud() */
+	leaves = &tree->stree[LEAFIND];
+	for (word = 0; word < LPERDMAP; word++)
+		leaves[word] = dbMaxBud((u8 *) &dp->wmap[word]);
+
+	/* combine buddies and propagate the maxima up to the root */
+	return dbInitTree(tree);
+}
+
+
+/*
+ * NAME: dbInitTree()/ujfs_adjtree()
+ *
+ * FUNCTION: initialize binary buddy summary tree of a dmap or dmapctl.
+ *
+ * at entry, the leaves of the tree have been initialized
+ * from corresponding bitmap word or root of summary tree
+ * of the child control page;
+ * configure binary buddy system at the leaf level, then
+ * bubble up the values of the leaf nodes up the tree.
+ *
+ * PARAMETERS:
+ * dtp - tree to initialize; its nleafs, l2nleafs, leafidx and
+ * budmin header fields must already be set, and the leaf
+ * entries of stree[] must already hold their values.
+ * (dbInitDmapCtl() passes a dmapctl_t cast to this type.)
+ *
+ * RETURNS: max free string at the root of the tree
+ */
+static int dbInitTree(dmaptree_t * dtp)
+{
+ int l2max, l2free, bsize, nextb, i;
+ int child, parent, nparent;
+ s8 *tp, *cp, *cp1;
+
+ tp = dtp->stree;
+
+ /* Determine the maximum free string possible for the leaves */
+ l2max = le32_to_cpu(dtp->l2nleafs) + dtp->budmin;
+
+ /*
+ * configure the leaf level into binary buddy system
+ *
+ * Try to combine buddies starting with a buddy size of 1
+ * (i.e. two leaves). At a buddy size of 1 two buddy leaves
+ * can be combined if both buddies have a maximum free of l2min;
+ * the combination will result in the left-most buddy leaf having
+ * a maximum free of l2min+1.
+ * After processing all buddies for a given size, process buddies
+ * at the next higher buddy size (i.e. current size * 2) and
+ * the next maximum free (current free + 1).
+ * This continues until the maximum possible buddy combination
+ * yields maximum free.
+ * (l2min above is dtp->budmin.)
+ */
+ for (l2free = dtp->budmin, bsize = 1; l2free < l2max;
+ l2free++, bsize = nextb) {
+ /* get next buddy size == current buddy pair size */
+ nextb = bsize << 1;
+
+ /* scan each adjacent buddy pair at current buddy size */
+ for (i = 0, cp = tp + le32_to_cpu(dtp->leafidx);
+ i < le32_to_cpu(dtp->nleafs);
+ i += nextb, cp += nextb) {
+ /* coalesce if both adjacent buddies are max free */
+ if (*cp == l2free && *(cp + bsize) == l2free) {
+ *cp = l2free + 1; /* left take right */
+ *(cp + bsize) = -1; /* right give left */
+ }
+ }
+ }
+
+ /*
+ * bubble summary information of leaves up the tree.
+ *
+ * Starting at the leaf node level, the four nodes described by
+ * the higher level parent node are compared for a maximum free and
+ * this maximum becomes the value of the parent node.
+ * when all lower level nodes are processed in this fashion then
+ * move up to the next level (parent becomes a lower level node) and
+ * continue the process for that level.
+ */
+ for (child = le32_to_cpu(dtp->leafidx),
+ nparent = le32_to_cpu(dtp->nleafs) >> 2;
+ nparent > 0; nparent >>= 2, child = parent) {
+ /* get index of 1st node of parent level */
+ parent = (child - 1) >> 2;
+
+ /* set the value of the parent node as the maximum
+ * of the four nodes of the current level.
+ */
+ for (i = 0, cp = tp + child, cp1 = tp + parent;
+ i < nparent; i++, cp += 4, cp1++)
+ *cp1 = TREEMAX(cp);
+ }
+
+ /* stree[0] is the root of the tree */
+ return (*tp);
+}
+
+
+/*
+ * dbInitDmapCtl()
+ *
+ * function: initialize dmapctl page
+ *
+ * PARAMETERS:
+ *	dcp	- dmapctl page to initialize
+ *	level	- level of this control page; budmin is set to
+ *		  L2BPERDMAP + L2LPERCTL * level
+ *	i	- index of the first leaf not covered by the block range;
+ *		  leaves i .. LPERCTL - 1 are set to NOFREE
+ *
+ * RETURNS:	the value returned by dbInitTree() for the page
+ */
+static int dbInitDmapCtl(dmapctl_t * dcp, int level, int i)
+{
+	int leaf;
+
+	/* fixed tree geometry of a dmapctl page */
+	dcp->budmin = L2BPERDMAP + L2LPERCTL * level;
+	dcp->height = cpu_to_le32(5);
+	dcp->leafidx = cpu_to_le32(CTLLEAFIND);
+	dcp->l2nleafs = cpu_to_le32(L2LPERCTL);
+	dcp->nleafs = cpu_to_le32(LPERCTL);
+
+	/*
+	 * leaves beyond the input block range have no lower level
+	 * dmapctl or dmap behind them: mark them as having nothing free.
+	 */
+	for (leaf = i; leaf < LPERCTL; leaf++)
+		dcp->stree[CTLLEAFIND + leaf] = NOFREE;
+
+	/* build the binary buddy summary tree over the leaves */
+	return dbInitTree((dmaptree_t *) dcp);
+}
+
+
+/*
+ * NAME: dbGetL2AGSize()/ujfs_getagl2size()
+ *
+ * FUNCTION: Determine log2(allocation group size) from aggregate size
+ *
+ * PARAMETERS:
+ * nblocks - Number of blocks in aggregate
+ *
+ * RETURNS: log2(allocation group size) in aggregate blocks;
+ * L2BPERDMAP when nblocks < BPERDMAP * MAXAG.
+ *
+ * NOTE(review): the result determines the allocation group geometry and
+ * presumably has to agree with the ujfs_getagl2size() routine named
+ * above; confirm against it before changing any of the arithmetic.
+ */
+static int dbGetL2AGSize(s64 nblocks)
+{
+ s64 sz;
+ s64 m;
+ int l2sz;
+
+ /* small aggregates: one dmap per allocation group */
+ if (nblocks < BPERDMAP * MAXAG)
+ return (L2BPERDMAP);
+
+ /* round up aggregate size to power of 2 */
+ /*
+ * scan down from bit 63 for the highest set bit of nblocks.
+ * l2sz starts at 64 while the mask starts at bit 63, so on exit
+ * l2sz is one more than the index of that bit.
+ * NOTE(review): m is signed, so the right shift of the initial
+ * (negative) mask is implementation-defined; confirm it behaves
+ * as intended on all supported compilers.
+ */
+ m = ((u64) 1 << (64 - 1));
+ for (l2sz = 64; l2sz >= 0; l2sz--, m >>= 1) {
+ if (m & nblocks)
+ break;
+ }
+
+ /*
+ * NOTE(review): with l2sz already one past the highest set bit,
+ * sz is greater than nblocks, so this adjustment looks
+ * unreachable; an exact power of two is also rounded up to the
+ * next power. Confirm whether that is intentional.
+ */
+ sz = (s64) 1 << l2sz;
+ if (sz < nblocks)
+ l2sz += 1;
+
+ /* agsize = roundupSize/max_number_of_ag */
+ return (l2sz - L2MAXAG);
+}
+
+
+/*
+ * NAME: dbMapFileSizeToMapSize()
+ *
+ * FUNCTION: compute number of blocks the block allocation map file
+ * can cover from the map file size;
+ *
+ * RETURNS: Number of blocks which can be covered by this block map file;
+ */
+
+/*
+ * maximum number of map pages at each level including control pages:
+ * one control page plus LPERCTL children of the level below
+ * (dmap pages in the case of L0).
+ */
+#define MAXL0PAGES (1 + LPERCTL)
+#define MAXL1PAGES (1 + LPERCTL * MAXL0PAGES)
+#define MAXL2PAGES (1 + LPERCTL * MAXL1PAGES)
+
+/*
+ * convert number of map pages to the zero origin top dmapctl level
+ *
+ * the constants 3 and 2 presumably account for the global control page
+ * and the higher level control pages that precede the subtree
+ * (global + L2 + L1, and global + L2 respectively) - TODO confirm.
+ */
+#define BMAPPGTOLEV(npages) \
+ (((npages) <= 3 + MAXL0PAGES) ? 0 \
+ : ((npages) <= 2 + MAXL1PAGES) ? 1 : 2)
+
+s64 dbMapFileSizeToMapSize(struct inode * ipbmap)
+{
+	struct super_block *sb = ipbmap->i_sb;
+	s64 nblocks, npages;
+	s64 ndmaps = 0;
+	int level, lev;
+	int child_pages, child_dmaps, full;
+
+	/* map file size in blocks, then in pages */
+	nblocks = ipbmap->i_size >> JFS_SBI(sb)->l2bsize;
+	npages = nblocks >> JFS_SBI(sb)->l2nbperpage;
+	level = BMAPPGTOLEV(npages);
+
+	/*
+	 * drop the pages that are not part of the top level's subtree:
+	 * the global control page, the (2 - level) control pages above
+	 * the top level, and the top level's own control page.
+	 */
+	npages -= 1 + (2 - level) + 1;
+
+	/*
+	 * walk down from the top level: count the dmaps below every
+	 * complete child, then descend into the last, incomplete child.
+	 */
+	for (lev = level; lev >= 0; lev--) {
+		switch (lev) {
+		case 2:
+			child_pages = MAXL1PAGES;
+			child_dmaps = LPERCTL * LPERCTL;
+			break;
+		case 1:
+			child_pages = MAXL0PAGES;
+			child_dmaps = LPERCTL;
+			break;
+		default:
+			child_pages = 1;
+			child_dmaps = 1;
+			break;
+		}
+
+		full = (u32) npages / child_pages;
+		ndmaps += full * child_dmaps;
+
+		/* pages left in the incomplete child ... */
+		npages = (u32) npages % child_pages;
+		/* ... minus that child's own control page */
+		npages--;
+	}
+
+	/* each dmap covers 2^L2BPERDMAP blocks */
+	return ndmaps << L2BPERDMAP;
+}
+
+
+#ifdef _JFS_DEBUG_DMAP
+/*
+ * DBinitmap()
+ *
+ * Debug helper: build an in-memory copy of the working map (wmap) of
+ * every dmap covering blocks [0, size), converted to CPU byte order.
+ *
+ * PARAMETERS:
+ * size - number of blocks to cover
+ * ipbmap - block allocation map inode the dmap pages are read from
+ * results - out: receives the pointer to the allocated copy
+ *
+ * Allocation or read failure ends in assert(0).
+ */
+static void DBinitmap(s64 size, struct inode *ipbmap, u32 ** results)
+{
+ int npages;
+ u32 *dbmap, *d;
+ int n;
+ s64 lblkno, cur_block;
+ dmap_t *dp;
+ metapage_t *mp;
+
+ /* one 4096-byte page holds 32768 bits, i.e. 32768 blocks; round up */
+ npages = size / 32768;
+ npages += (size % 32768) ? 1 : 0;
+
+ dbmap = (u32 *) xmalloc(npages * 4096, L2PSIZE, kernel_heap);
+ if (dbmap == NULL)
+ assert(0);
+
+ /* clear the copy one page (1024 u32 words) at a time */
+ for (n = 0, d = dbmap; n < npages; n++, d += 1024)
+ bzero(d, 4096);
+
+ /* Need to initialize from disk map pages
+ */
+ for (d = dbmap, cur_block = 0; cur_block < size;
+ cur_block += BPERDMAP, d += LPERDMAP) {
+ /* logical block number of the dmap page describing cur_block */
+ lblkno = BLKTODMAP(cur_block,
+ JFS_SBI(ipbmap->i_sb)->bmap->
+ db_l2nbperpage);
+ mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
+ if (mp == NULL) {
+ assert(0);
+ }
+ dp = (dmap_t *) mp->data;
+
+ /* copy this dmap's working map words */
+ for (n = 0; n < LPERDMAP; n++)
+ d[n] = le32_to_cpu(dp->wmap[n]);
+
+ release_metapage(mp);
+ }
+
+ *results = dbmap;
+}
+
+
+/*
+ * DBAlloc()
+ *
+ * Debug helper: mark blocks [blkno, blkno + nblocks) as allocated in the
+ * debug bitmap, asserting that none of them is already marked.
+ *
+ * PARAMETERS:
+ *	dbmap	- debug bitmap, 32 blocks per word; the first block of a
+ *		  word is its most significant bit
+ *	mapsize	- number of blocks covered by the map
+ *	blkno	- first block of the range (must be > 0 and < mapsize)
+ *	nblocks	- number of blocks in the range
+ */
+void DBAlloc(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks)
+{
+	int nb, bitno;
+	u32 mask;
+
+	assert(blkno > 0 && blkno < mapsize);
+	assert(nblocks > 0 && nblocks <= mapsize);
+
+	assert(blkno + nblocks <= mapsize);
+
+	/* word holding the first block of the range */
+	dbmap += (blkno / 32);
+	while (nblocks > 0) {
+		/* part of the range that falls into the current word */
+		bitno = blkno & (32 - 1);
+		nb = min(nblocks, 32 - bitno);
+
+		/* nb one-bits starting bitno bits below the top bit */
+		mask = (0xffffffff << (32 - nb) >> bitno);
+		assert((mask & *dbmap) == 0);
+		*dbmap |= mask;
+
+		dbmap++;
+		blkno += nb;
+		nblocks -= nb;
+	}
+}
+
+
+/*
+ * DBFree()
+ *
+ * Debug helper: mark blocks [blkno, blkno + nblocks) as free in the
+ * debug bitmap, asserting that all of them are currently marked
+ * allocated.
+ *
+ * PARAMETERS:
+ *	dbmap	- debug bitmap, 32 blocks per word; the first block of a
+ *		  word is its most significant bit
+ *	mapsize	- number of blocks covered by the map
+ *	blkno	- first block of the range (must be > 0 and < mapsize)
+ *	nblocks	- number of blocks in the range
+ */
+static void DBFree(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks)
+{
+	int nb, bitno;
+	u32 mask;
+
+	assert(blkno > 0 && blkno < mapsize);
+	assert(nblocks > 0 && nblocks <= mapsize);
+
+	assert(blkno + nblocks <= mapsize);
+
+	/* word holding the first block of the range */
+	dbmap += (blkno / 32);
+	while (nblocks > 0) {
+		/* part of the range that falls into the current word */
+		bitno = blkno & (32 - 1);
+		nb = min(nblocks, 32 - bitno);
+
+		/* nb one-bits starting bitno bits below the top bit */
+		mask = (0xffffffff << (32 - nb) >> bitno);
+		assert((mask & *dbmap) == mask);
+		*dbmap &= ~mask;
+
+		dbmap++;
+		blkno += nb;
+		nblocks -= nb;
+	}
+}
+
+
+/*
+ * DBAllocCK()
+ *
+ * Debug helper: assert that every block of [blkno, blkno + nblocks) is
+ * marked allocated in the debug bitmap.  The bitmap is not modified.
+ *
+ * PARAMETERS:
+ *	dbmap	- debug bitmap, 32 blocks per word; the first block of a
+ *		  word is its most significant bit
+ *	mapsize	- number of blocks covered by the map
+ *	blkno	- first block of the range (must be > 0 and < mapsize)
+ *	nblocks	- number of blocks in the range
+ */
+static void DBAllocCK(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks)
+{
+	int nb, bitno;
+	u32 mask;
+
+	assert(blkno > 0 && blkno < mapsize);
+	assert(nblocks > 0 && nblocks <= mapsize);
+
+	assert(blkno + nblocks <= mapsize);
+
+	/* word holding the first block of the range */
+	dbmap += (blkno / 32);
+	while (nblocks > 0) {
+		/* part of the range that falls into the current word */
+		bitno = blkno & (32 - 1);
+		nb = min(nblocks, 32 - bitno);
+
+		/* nb one-bits starting bitno bits below the top bit */
+		mask = (0xffffffff << (32 - nb) >> bitno);
+		assert((mask & *dbmap) == mask);
+
+		dbmap++;
+		blkno += nb;
+		nblocks -= nb;
+	}
+}
+
+
+/*
+ * DBFreeCK()
+ *
+ * Debug helper: assert that every block of [blkno, blkno + nblocks) is
+ * marked free in the debug bitmap.  The bitmap is not modified.
+ *
+ * PARAMETERS:
+ *	dbmap	- debug bitmap, 32 blocks per word; the first block of a
+ *		  word is its most significant bit
+ *	mapsize	- number of blocks covered by the map
+ *	blkno	- first block of the range (must be > 0 and < mapsize)
+ *	nblocks	- number of blocks in the range
+ */
+static void DBFreeCK(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks)
+{
+	int nb, bitno;
+	u32 mask;
+
+	assert(blkno > 0 && blkno < mapsize);
+	assert(nblocks > 0 && nblocks <= mapsize);
+
+	assert(blkno + nblocks <= mapsize);
+
+	/* word holding the first block of the range */
+	dbmap += (blkno / 32);
+	while (nblocks > 0) {
+		/* part of the range that falls into the current word */
+		bitno = blkno & (32 - 1);
+		nb = min(nblocks, 32 - bitno);
+
+		/* nb one-bits starting bitno bits below the top bit */
+		mask = (0xffffffff << (32 - nb) >> bitno);
+		assert((mask & *dbmap) == 0);
+
+		dbmap++;
+		blkno += nb;
+		nblocks -= nb;
+	}
+}
+
+
+/*
+ * dbPrtMap()
+ *
+ * Debug helper: print the fields of the aggregate map descriptor.
+ *
+ * db_mapsize, db_nfree and db_agsize are s64 fields; they are cast to
+ * long long and printed with %lld so that each conversion consumes
+ * exactly one argument.
+ *
+ * PARAMETERS:
+ *	bmp	- in-memory aggregate map descriptor to print
+ */
+static void dbPrtMap(bmap_t * bmp)
+{
+	printk(" mapsize: %lld\n", (long long) bmp->db_mapsize);
+	printk(" nfree: %lld\n", (long long) bmp->db_nfree);
+	printk(" numag: %d\n", bmp->db_numag);
+	printk(" agsize: %lld\n", (long long) bmp->db_agsize);
+	printk(" agl2size: %d\n", bmp->db_agl2size);
+	printk(" agwidth: %d\n", bmp->db_agwidth);
+	printk(" agstart: %d\n", bmp->db_agstart);
+	printk(" agheigth: %d\n", bmp->db_agheigth);
+	printk(" aglevel: %d\n", bmp->db_aglevel);
+	printk(" maxlevel: %d\n", bmp->db_maxlevel);
+	printk(" maxag: %d\n", bmp->db_maxag);
+	printk(" agpref: %d\n", bmp->db_agpref);
+	printk(" l2nbppg: %d\n", bmp->db_l2nbperpage);
+}
+
+
+/*
+ * dbPrtCtl()
+ *
+ * Debug helper: print the header fields of a dmapctl page, then its
+ * internal tree nodes and its leaves, eight entries per line.
+ *
+ * All output goes through printk().  Tree entries are s8 values; they
+ * are cast to u8 so that %02x prints the two hex digits of the byte
+ * instead of a sign-extended int.
+ *
+ * PARAMETERS:
+ *	dcp	- dmapctl page to print
+ */
+static void dbPrtCtl(dmapctl_t * dcp)
+{
+	int i, j, n;
+
+	printk(" height: %08x\n", le32_to_cpu(dcp->height));
+	printk(" leafidx: %08x\n", le32_to_cpu(dcp->leafidx));
+	printk(" budmin: %08x\n", dcp->budmin);
+	printk(" nleafs: %08x\n", le32_to_cpu(dcp->nleafs));
+	printk(" l2nleafs: %08x\n", le32_to_cpu(dcp->l2nleafs));
+
+	/* internal nodes: stree[0 .. CTLLEAFIND - 1] */
+	printk("\n Tree:\n");
+	for (i = 0; i < CTLLEAFIND; i += 8) {
+		n = min(8, CTLLEAFIND - i);
+
+		for (j = 0; j < n; j++)
+			printk(" [%03x]: %02x", i + j,
+			       (u8) dcp->stree[i + j]);
+		printk("\n");
+	}
+
+	/* leaves: stree[CTLLEAFIND .. CTLLEAFIND + LPERCTL - 1] */
+	printk("\n Tree Leaves:\n");
+	for (i = 0; i < LPERCTL; i += 8) {
+		n = min(8, LPERCTL - i);
+
+		for (j = 0; j < n; j++)
+			printk(" [%03x]: %02x",
+			       i + j,
+			       (u8) dcp->stree[i + j + CTLLEAFIND]);
+		printk("\n");
+	}
+}
+#endif /* _JFS_DEBUG_DMAP */
diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h
new file mode 100644
index 000000000000..75b2808c1259
--- /dev/null
+++ b/fs/jfs/jfs_dmap.h
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * jfs_dmap.h: block allocation map manager
+ */
+
+#ifndef _H_JFS_DMAP
+#define _H_JFS_DMAP
+
+#include "jfs_txnmgr.h"
+
+#define BMAPVERSION 1 /* version number */
+#define TREESIZE (256+64+16+4+1) /* size of a dmap tree */
+#define LEAFIND (64+16+4+1) /* index of 1st leaf of a dmap tree */
+#define LPERDMAP 256 /* num leaves per dmap tree */
+#define L2LPERDMAP 8 /* l2 number of leaves per dmap tree */
+#define DBWORD 32 /* # of blks covered by a map word */
+#define L2DBWORD 5 /* l2 # of blks covered by a mword */
+#define BUDMIN L2DBWORD /* max free string in a map word */
+#define BPERDMAP (LPERDMAP * DBWORD) /* num of blks per dmap */
+#define L2BPERDMAP 13 /* l2 num of blks per dmap */
+#define CTLTREESIZE (1024+256+64+16+4+1) /* size of a dmapctl tree */
+#define CTLLEAFIND (256+64+16+4+1) /* idx of 1st leaf of a dmapctl tree */
+#define LPERCTL 1024 /* num of leaves per dmapctl tree */
+#define L2LPERCTL 10 /* l2 num of leaves per dmapctl tree */
+#define ROOT 0 /* index of the root of a tree */
+#define NOFREE ((s8) -1) /* no blocks free */
+#define MAXAG 128 /* max number of allocation groups */
+#define L2MAXAG 7 /* l2 max num of AG */
+#define L2MINAGSZ 25 /* l2 of minimum AG size in bytes */
+#define BMAPBLKNO 0 /* lblkno of bmap within the map */
+
+/*
+ * maximum l2 number of disk blocks at the various dmapctl levels.
+ */
+#define L2MAXL0SIZE (L2BPERDMAP + 1 * L2LPERCTL)
+#define L2MAXL1SIZE (L2BPERDMAP + 2 * L2LPERCTL)
+#define L2MAXL2SIZE (L2BPERDMAP + 3 * L2LPERCTL)
+
+/*
+ * maximum number of disk blocks at the various dmapctl levels.
+ */
+#define MAXL0SIZE ((s64)1 << L2MAXL0SIZE)
+#define MAXL1SIZE ((s64)1 << L2MAXL1SIZE)
+#define MAXL2SIZE ((s64)1 << L2MAXL2SIZE)
+
+#define MAXMAPSIZE MAXL2SIZE /* maximum aggregate map size */
+
+/*
+ * return the largest of the four consecutive tree nodes cp[0..3],
+ * i.e. the maximum free string of four (lower level) nodes of the tree.
+ */
+static __inline signed char TREEMAX(signed char *cp)
+{
+	signed char left_pair = max(cp[0], cp[1]);
+	signed char right_pair = max(cp[2], cp[3]);
+
+	return max(right_pair, left_pair);
+}
+
+/*
+ * convert disk block number to the logical block number of the dmap
+ * describing the disk block. s is the log2(number of logical blocks per page)
+ *
+ * The calculation figures out how many logical pages are in front of the dmap.
+ * - the number of dmaps preceding it
+ * - the number of L0 pages preceding its L0 page
+ * - the number of L1 pages preceding its L1 page
+ * - 3 is added to account for the L2, L1, and L0 page for this dmap
+ * - 1 is added to account for the control page of the map.
+ */
+#define BLKTODMAP(b,s) \
+ ((((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) << (s))
+
+/*
+ * convert disk block number to the logical block number of the LEVEL 0
+ * dmapctl describing the disk block. s is the log2(number of logical blocks
+ * per page)
+ *
+ * The calculation figures out how many logical pages are in front of the L0.
+ * - the number of dmap pages preceding it
+ * - the number of L0 pages preceding it
+ * - the number of L1 pages preceding its L1 page
+ * - 2 is added to account for the L2, and L1 page for this L0
+ * - 1 is added to account for the control page of the map.
+ */
+#define BLKTOL0(b,s) \
+ (((((b) >> 23) << 10) + ((b) >> 23) + ((b) >> 33) + 2 + 1) << (s))
+
+/*
+ * convert disk block number to the logical block number of the LEVEL 1
+ * dmapctl describing the disk block. s is the log2(number of logical blocks
+ * per page)
+ *
+ * The calculation figures out how many logical pages are in front of the L1.
+ * - the number of dmap pages preceding it
+ * - the number of L0 pages preceding it
+ * - the number of L1 pages preceding it
+ * - 1 is added to account for the L2 page
+ * - 1 is added to account for the control page of the map.
+ */
+#define BLKTOL1(b,s) \
+ (((((b) >> 33) << 20) + (((b) >> 33) << 10) + ((b) >> 33) + 1 + 1) << (s))
+
+/*
+ * convert disk block number to the logical block number of the dmapctl
+ * at the specified level which describes the disk block.
+ */
+#define BLKTOCTL(b,s,l) \
+ (((l) == 2) ? 1 : ((l) == 1) ? BLKTOL1((b),(s)) : BLKTOL0((b),(s)))
+
+/*
+ * convert aggregate map size to the zero origin dmapctl level of the
+ * top dmapctl.
+ */
+#define BMAPSZTOLEV(size) \
+ (((size) <= MAXL0SIZE) ? 0 : ((size) <= MAXL1SIZE) ? 1 : 2)
+
+/* convert disk block number to allocation group number.
+ */
+#define BLKTOAG(b,sbi) ((b) >> ((sbi)->bmap->db_agl2size))
+
+/* convert allocation group number to starting disk block
+ * number.
+ */
+#define AGTOBLK(a,ip) \
+ ((s64)(a) << (JFS_SBI((ip)->i_sb)->bmap->db_agl2size))
+
+/*
+ * dmap summary tree
+ *
+ * dmaptree_t must be consistent with dmapctl_t.
+ */
+typedef struct {
+ s32 nleafs; /* 4: number of tree leafs */
+ s32 l2nleafs; /* 4: l2 number of tree leafs */
+ s32 leafidx; /* 4: index of first tree leaf */
+ s32 height; /* 4: height of the tree */
+ s8 budmin; /* 1: min l2 tree leaf value to combine */
+ s8 stree[TREESIZE]; /* TREESIZE: tree */
+ u8 pad[2]; /* 2: pad to word boundary */
+} dmaptree_t; /* - 360 - */
+
+/*
+ * dmap page per 8K blocks bitmap
+ */
+typedef struct {
+ s32 nblocks; /* 4: num blks covered by this dmap */
+ s32 nfree; /* 4: num of free blks in this dmap */
+ s64 start; /* 8: starting blkno for this dmap */
+ dmaptree_t tree; /* 360: dmap tree */
+ u8 pad[1672]; /* 1672: pad to 2048 bytes */
+ u32 wmap[LPERDMAP]; /* 1024: bits of the working map */
+ u32 pmap[LPERDMAP]; /* 1024: bits of the persistent map */
+} dmap_t; /* - 4096 - */
+
+/*
+ * disk map control page per level.
+ *
+ * dmapctl_t must be consistent with dmaptree_t.
+ */
+typedef struct {
+ s32 nleafs; /* 4: number of tree leafs */
+ s32 l2nleafs; /* 4: l2 number of tree leafs */
+ s32 leafidx; /* 4: index of the first tree leaf */
+ s32 height; /* 4: height of tree */
+ s8 budmin; /* 1: minimum l2 tree leaf value */
+ s8 stree[CTLTREESIZE]; /* CTLTREESIZE: dmapctl tree */
+ u8 pad[2714]; /* 2714: pad to 4096 */
+} dmapctl_t; /* - 4096 - */
+
+/*
+ * common definition for dmaptree_t within dmap and dmapctl
+ */
+typedef union {
+ dmaptree_t t1;
+ dmapctl_t t2;
+} dmtree_t;
+
+/* macros for accessing fields within dmtree_t */
+#define dmt_nleafs t1.nleafs
+#define dmt_l2nleafs t1.l2nleafs
+#define dmt_leafidx t1.leafidx
+#define dmt_height t1.height
+#define dmt_budmin t1.budmin
+#define dmt_stree t1.stree
+
+/*
+ * on-disk aggregate disk allocation map descriptor.
+ */
+typedef struct {
+ s64 dn_mapsize; /* 8: number of blocks in aggregate */
+ s64 dn_nfree; /* 8: num free blks in aggregate map */
+ s32 dn_l2nbperpage; /* 4: l2 number of blks per page */
+ s32 dn_numag; /* 4: total number of ags */
+ s32 dn_maxlevel; /* 4: zero origin level of the top dmapctl */
+ s32 dn_maxag; /* 4: max active alloc group number */
+ s32 dn_agpref; /* 4: preferred alloc group (hint) */
+ s32 dn_aglevel; /* 4: dmapctl level holding the AG */
+ s32 dn_agheigth; /* 4: height in dmapctl of the AG */
+ s32 dn_agwidth; /* 4: width in dmapctl of the AG */
+ s32 dn_agstart; /* 4: start tree index at AG height */
+ s32 dn_agl2size; /* 4: l2 num of blks per alloc group */
+ s64 dn_agfree[MAXAG]; /* 8*MAXAG: per AG free count */
+ s64 dn_agsize; /* 8: num of blks per alloc group */
+ s8 dn_maxfreebud; /* 1: max free buddy system */
+ u8 pad[3007]; /* 3007: pad to 4096 */
+} dbmap_t; /* - 4096 - */
+
+/*
+ * in-memory aggregate disk allocation map descriptor.
+ *
+ * the db_* macros that follow this structure give direct access to the
+ * fields of the embedded on-disk descriptor db_bmap.
+ */
+typedef struct bmap {
+ dbmap_t db_bmap; /* on-disk aggregate map descriptor */
+ struct inode *db_ipbmap; /* ptr to aggregate map incore inode */
+ struct semaphore db_bmaplock; /* aggregate map lock */
+ /* presumably the debug copy of the map used by the _JFS_DEBUG_DMAP
+ * helpers - TODO confirm
+ */
+ u32 *db_DBmap;
+} bmap_t;
+
+/* macros for accessing fields within in-memory aggregate map descriptor */
+#define db_mapsize db_bmap.dn_mapsize
+#define db_nfree db_bmap.dn_nfree
+#define db_agfree db_bmap.dn_agfree
+#define db_agsize db_bmap.dn_agsize
+#define db_agl2size db_bmap.dn_agl2size
+#define db_agwidth db_bmap.dn_agwidth
+#define db_agheigth db_bmap.dn_agheigth
+#define db_agstart db_bmap.dn_agstart
+#define db_numag db_bmap.dn_numag
+#define db_maxlevel db_bmap.dn_maxlevel
+#define db_aglevel db_bmap.dn_aglevel
+#define db_agpref db_bmap.dn_agpref
+#define db_maxag db_bmap.dn_maxag
+#define db_maxfreebud db_bmap.dn_maxfreebud
+#define db_l2nbperpage db_bmap.dn_l2nbperpage
+
+/*
+ * macros for various conversions needed by the allocators.
+ * blkstol2(), cntlz(), and cnttz() are operating system dependent functions.
+ */
+/* convert number of blocks to log2 number of blocks, rounding up to
+ * the next log2 value if blocks is not a l2 multiple.
+ */
+#define BLKSTOL2(d) (blkstol2(d))
+
+/* convert number of leafs to log2 leaf value */
+#define NLSTOL2BSZ(n) (31 - cntlz((n)) + BUDMIN)
+
+/* convert leaf index to log2 leaf value */
+#define LITOL2BSZ(n,m,b) ((((n) == 0) ? (m) : cnttz((n))) + (b))
+
+/* convert a block number to a dmap control leaf index */
+#define BLKTOCTLLEAF(b,m) \
+ (((b) & (((s64)1 << ((m) + L2LPERCTL)) - 1)) >> (m))
+
+/* convert log2 leaf value to buddy size */
+#define BUDSIZE(s,m) (1 << ((s) - (m)))
+
+/*
+ * external references.
+ */
+extern int dbMount(struct inode *ipbmap);
+
+extern int dbUnmount(struct inode *ipbmap, int mounterror);
+
+extern int dbFree(struct inode *ipbmap, s64 blkno, s64 nblocks);
+
+extern int dbUpdatePMap(struct inode *ipbmap,
+ int free, s64 blkno, s64 nblocks, tblock_t * tblk);
+
+extern int dbNextAG(struct inode *ipbmap);
+
+extern int dbAlloc(struct inode *ipbmap, s64 hint, s64 nblocks, s64 * results);
+
+extern int dbAllocExact(struct inode *ip, s64 blkno, int nblocks);
+
+extern int dbReAlloc(struct inode *ipbmap,
+ s64 blkno, s64 nblocks, s64 addnblocks, s64 * results);
+
+extern int dbSync(struct inode *ipbmap);
+extern int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks);
+extern int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks);
+extern void dbFinalizeBmap(struct inode *ipbmap);
+extern s64 dbMapFileSizeToMapSize(struct inode *ipbmap);
+#endif /* _H_JFS_DMAP */
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
new file mode 100644
index 000000000000..42e1b280ca12
--- /dev/null
+++ b/fs/jfs/jfs_dtree.c
@@ -0,0 +1,4539 @@
+/*
+ *
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ *
+*/
+
+/*
+ * jfs_dtree.c: directory B+-tree manager
+ *
+ * B+-tree with variable length key directory:
+ *
+ * each directory page is structured as an array of 32-byte
+ * directory entry slots initialized as a freelist
+ * to avoid search/compaction of free space at insertion.
+ * when an entry is inserted, a number of slots are allocated
+ * from the freelist as required to store variable length data
+ * of the entry; when the entry is deleted, slots of the entry
+ * are returned to freelist.
+ *
+ * leaf entry stores full name as key and file serial number
+ * (aka inode number) as data.
+ * internal/router entry stores suffix compressed name
+ * as key and simple extent descriptor as data.
+ *
+ * each directory page maintains a sorted entry index table
+ * which stores the start slot index of sorted entries
+ * to allow binary search on the table.
+ *
+ * directory starts as a root/leaf page in on-disk inode
+ * inline data area.
+ * when it becomes full, it starts a leaf of a external extent
+ * of length of 1 block. each time the first leaf becomes full,
+ * it is extended rather than split (its size is doubled),
+ * until its length becomes 4 KBytes, from then the extent is split
+ * with new 4 Kbyte extent when it becomes full
+ * to reduce external fragmentation of small directories.
+ *
+ * blah, blah, blah, for linear scan of directory in pieces by
+ * readdir().
+ *
+ *
+ * case-insensitive directory file system
+ *
+ * names are stored in case-sensitive way in leaf entry.
+ * but stored, searched and compared in case-insensitive (uppercase) order
+ * (i.e., both search key and entry key are folded for search/compare):
+ * (note that case-sensitive order is BROKEN in storage, e.g.,
+ * sensitive: Ad, aB, aC, aD -> insensitive: aB, aC, aD, Ad
+ *
+ * entries which folds to the same key makes up a equivalent class
+ * whose members are stored as contiguous cluster (may cross page boundary)
+ * but whose order is arbitrary and acts as duplicate, e.g.,
+ * abc, Abc, aBc, abC)
+ *
+ * once match is found at leaf, requires scan forward/backward
+ * either for, in case-insensitive search, duplicate
+ * or for, in case-sensitive search, for exact match
+ *
+ * router entry must be created/stored in case-insensitive way
+ * in internal entry:
+ * (right most key of left page and left most key of right page
+ * are folded, and its suffix compression is propagated as router
+ * key in parent)
+ * (e.g., if split occurs <abc> and <aBd>, <ABD> rather than <aB>
+ * should be made the router key for the split)
+ *
+ * case-insensitive search:
+ *
+ * fold search key;
+ *
+ * case-insensitive search of B-tree:
+ * for internal entry, router key is already folded;
+ * for leaf entry, fold the entry key before comparison.
+ *
+ * if (leaf entry case-insensitive match found)
+ * if (next entry satisfies case-insensitive match)
+ * return EDUPLICATE;
+ * if (prev entry satisfies case-insensitive match)
+ * return EDUPLICATE;
+ * return match;
+ * else
+ * return no match;
+ *
+ * serialization:
+ * target directory inode lock is being held on entry/exit
+ * of all main directory service routines.
+ *
+ * log based recovery:
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/locks.h>
+#include "jfs_incore.h"
+#include "jfs_superblock.h"
+#include "jfs_filsys.h"
+#include "jfs_metapage.h"
+#include "jfs_dmap.h"
+#include "jfs_unicode.h"
+#include "jfs_debug.h"
+
+/* dtree split parameter:
+ * describes the page to be split/extended and the entry whose insertion
+ * triggered the split; filled in by the caller of the split routines.
+ */
+typedef struct {
+ metapage_t *mp; /* buffer of the page to split */
+ s16 index; /* index at which the new entry is inserted */
+ s16 nslot; /* number of slots the new entry needs */
+ component_t *key; /* key (name) of the new entry */
+ ddata_t *data; /* data of the new entry */
+ pxdlist_t *pxdlist; /* extents pre-allocated for the split */
+} dtsplit_t;
+
+#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot)
+
+/* get page buffer for specified block address
+ *
+ * wraps BT_GETPAGE() and, when that succeeds (RC == 0), sanity checks
+ * the page header:
+ * - nextindex must not exceed DTROOTMAXSLOT for the root (BN == 0),
+ *   or the page's own maxslot otherwise;
+ * - a non-root page must not claim more than DTPAGEMAXSLOT slots.
+ * on a failed check the page is released, the superblock is marked
+ * FM_DIRTY, MP is set to NULL and RC is set to EIO.
+ *
+ * n.b. expands to a brace block, not a single expression; MP and RC
+ * are assigned to and BN/P are evaluated more than once.
+ */
+#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC)\
+{\
+ BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot)\
+ if (!(RC))\
+ {\
+ if (((P)->header.nextindex > (((BN)==0)?DTROOTMAXSLOT:(P)->header.maxslot)) ||\
+ ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT)))\
+ {\
+ jERROR(1,("DT_GETPAGE: dtree page corrupt\n"));\
+ BT_PUTPAGE(MP);\
+ updateSuper((IP)->i_sb, FM_DIRTY);\
+ MP = NULL;\
+ RC = EIO;\
+ }\
+ }\
+}
+
+/* for consistency: dtree spelling of BT_PUTPAGE() */
+#define DT_PUTPAGE(MP) BT_PUTPAGE(MP)
+
+#define DT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \
+ BT_GETSEARCH(IP, LEAF, BN, MP, dtpage_t, P, INDEX, i_dtroot)
+
+/*
+ * forward references
+ *
+ * prototypes of the file-local (static) helpers, declared up front so
+ * that the routines below can call each other regardless of the order
+ * in which they are defined.
+ */
+static int dtSplitUp(tid_t tid, struct inode *ip,
+ dtsplit_t * split, btstack_t * btstack);
+
+static int dtSplitPage(tid_t tid, struct inode *ip, dtsplit_t * split,
+ metapage_t ** rmpp, dtpage_t ** rpp, pxd_t * rxdp);
+
+static int dtExtendPage(tid_t tid, struct inode *ip,
+ dtsplit_t * split, btstack_t * btstack);
+
+static int dtSplitRoot(tid_t tid, struct inode *ip,
+ dtsplit_t * split, metapage_t ** rmpp);
+
+static int dtDeleteUp(tid_t tid, struct inode *ip, metapage_t * fmp,
+ dtpage_t * fp, btstack_t * btstack);
+
+static int dtSearchNode(struct inode *ip,
+ s64 lmxaddr, pxd_t * kpxd, btstack_t * btstack);
+
+static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p);
+
+static int dtReadFirst(struct inode *ip, btstack_t * btstack);
+
+static int dtReadNext(struct inode *ip,
+ loff_t * offset, btstack_t * btstack);
+
+static int dtCompare(component_t * key, dtpage_t * p, int si);
+
+static int ciCompare(component_t * key, dtpage_t * p, int si, int flag);
+
+static void dtGetKey(dtpage_t * p, int i, component_t * key, int flag);
+
+static void ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp,
+ int ri, component_t * key, int flag);
+
+static void dtInsertEntry(dtpage_t * p, int index, component_t * key,
+ ddata_t * data, dtlock_t ** dtlock);
+
+static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp,
+ dtlock_t ** sdtlock, dtlock_t ** ddtlock,
+ int do_index);
+
+static void dtDeleteEntry(dtpage_t * p, int fi, dtlock_t ** dtlock);
+
+static void dtTruncateEntry(dtpage_t * p, int ti, dtlock_t ** dtlock);
+
+static void dtLinelockFreelist(dtpage_t * p, int m, dtlock_t ** dtlock);
+
+#define ciToUpper(c) UniStrupr((c)->name) /* fold a component_t name in place via UniStrupr() */
+
+/*
+ *	find_index()
+ *
+ * Returns pointer to the directory table entry for the specified index.
+ *
+ * parameter:
+ *	ip	- directory inode
+ *	index	- directory table index; valid values are
+ *		  2 <= index < next_index of the inode
+ *	mp	- in/out: metapage holding the entry.
+ *		  on entry, *mp may hold a page left over from a previous
+ *		  call; it is reused if it covers the wanted block and
+ *		  released otherwise.
+ *		  on return, *mp is the page containing the entry, or 0
+ *		  when the table is still inline in the inode.
+ *
+ * return: pointer to the slot, or NULL if index is out of range or the
+ *	   table page cannot be read.
+ *
+ * mp must be released by caller.
+ */
+static dir_table_slot_t *find_index(struct inode *ip, u32 index,
+				    metapage_t ** mp)
+{
+	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+	s64 blkno;
+	s64 offset;
+	int page_offset;
+	static int maxWarnings = 10;	/* rate limit for the message below */
+
+	if (index < 2) {
+		if (maxWarnings) {
+			jERROR(1, ("find_index called with index = %u\n",
+				   index));
+			maxWarnings--;
+		}
+		return NULL;
+	}
+
+	if (index >= jfs_ip->next_index) {
+		jFYI(1, ("find_index called with index >= next_index\n"));
+		return NULL;
+	}
+
+	if (jfs_ip->next_index <= (MAX_INLINE_DIRTABLE_ENTRY + 1)) {
+		/*
+		 * Inline directory table
+		 */
+		*mp = 0;
+		return &jfs_ip->i_dirtable[index - 2];
+	}
+
+	/*
+	 * External directory table: locate the page and the byte
+	 * offset of the slot within it.
+	 */
+	offset = (index - 2) * sizeof(dir_table_slot_t);
+	page_offset = offset & (PSIZE - 1);
+	blkno = ((offset + 1) >> L2PSIZE) <<
+	    JFS_SBI(ip->i_sb)->l2nbperpage;
+
+	/* drop a cached page that does not cover the wanted block */
+	if (*mp && ((*mp)->index != blkno)) {
+		release_metapage(*mp);
+		*mp = 0;
+	}
+	if (*mp == 0)
+		*mp = read_metapage(ip, blkno, PSIZE, 0);
+	if (*mp == 0) {
+		jERROR(1,
+		       ("find_index: error reading directory table\n"));
+		return NULL;
+	}
+
+	return (dir_table_slot_t *) ((char *) (*mp)->data + page_offset);
+}
+
+/*
+ *	lock_index()
+ *
+ * Takes a tlckDATA transaction lock on the directory table page <mp>
+ * and records one line lock vector covering the table entry <index>.
+ */
+static inline void lock_index(tid_t tid, struct inode *ip, metapage_t * mp,
+			      u32 index)
+{
+	tlock_t *tlock = txLock(tid, ip, mp, tlckDATA);
+	linelock_t *linelock = (linelock_t *) tlock->lock;
+	lv_t *vec;
+
+	/* no vector left in this linelock: get another via txLinelock() */
+	if (linelock->index >= linelock->maxcnt)
+		linelock = txLinelock(linelock);
+
+	/* claim the next vector */
+	vec = &linelock->lv[linelock->index++];
+
+	/*
+	 * 512 table entries per page; a linelock slot is twice the
+	 * size of a table entry, hence entry-in-page / 2.
+	 */
+	vec->offset = ((index - 2) & 511) >> 1;
+	vec->length = 1;
+}
+
+/*
+ *	add_index()
+ *
+ * Adds an entry to the directory index table.  This is used to provide
+ * each directory entry with a persistent index in which to resume
+ * directory traversals
+ *
+ * parameter:
+ *	tid	- transaction id
+ *	ip	- directory inode (must satisfy DO_INDEX())
+ *	bn	- block address stored in the new table entry
+ *	slot	- slot number stored in the new table entry
+ *
+ * return: the index assigned to the new entry (>= 2), or
+ *	   (u32) -1 on failure.
+ *
+ * On failure the index reserved from next_index is handed back, so a
+ * failed call does not consume an index.  The one exception is a
+ * failure after the table has already been moved out of the inode in
+ * this same call: find_index() decides between the inline and the
+ * external table from next_index, so it must not drop back below the
+ * migration threshold once the inline area holds the xtree root.
+ */
+static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
+{
+	struct super_block *sb = ip->i_sb;
+	struct jfs_sb_info *sbi = JFS_SBI(sb);
+	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+	u64 blkno;
+	dir_table_slot_t *dirtab_slot;
+	u32 index;
+	linelock_t *llck;
+	lv_t *lv;
+	metapage_t *mp;
+	s64 offset;
+	uint page_offset;
+	int migrated = 0;	/* table moved to external page in this call */
+	tlock_t *tlck;
+	s64 xaddr;
+
+	ASSERT(DO_INDEX(ip));
+
+	if (jfs_ip->next_index < 2) {
+		jERROR(1, ("next_index = %d.  Please fix this!\n",
+			   jfs_ip->next_index));
+		jfs_ip->next_index = 2;
+	}
+
+	/* reserve the index; given back at clean_up on failure */
+	index = jfs_ip->next_index++;
+
+	if (index <= MAX_INLINE_DIRTABLE_ENTRY) {
+		/*
+		 * i_size reflects size of index table, or 8 bytes per entry.
+		 */
+		ip->i_size = (loff_t) (index - 1) << 3;
+
+		/*
+		 * dir table fits inline within inode
+		 */
+		dirtab_slot = &jfs_ip->i_dirtable[index - 2];
+		dirtab_slot->flag = DIR_INDEX_VALID;
+		dirtab_slot->slot = slot;
+		DTSaddress(dirtab_slot, bn);
+
+		set_cflag(COMMIT_Dirtable, ip);
+
+		return index;
+	}
+	if (index == (MAX_INLINE_DIRTABLE_ENTRY + 1)) {
+		/*
+		 * It's time to move the inline table to an external
+		 * page and begin to build the xtree
+		 */
+		dir_table_slot_t temp_table[12];
+
+		/*
+		 * Save the table, we're going to overwrite it with the
+		 * xtree root
+		 */
+		memcpy(temp_table, &jfs_ip->i_dirtable, sizeof(temp_table));
+
+		/*
+		 * Initialize empty x-tree
+		 */
+		xtInitRoot(tid, ip);
+
+		/*
+		 * Allocate the first block & add it to the xtree
+		 */
+		xaddr = 0;
+		if (xtInsert(tid, ip, 0, 0, sbi->nbperpage, &xaddr, 0)) {
+			jFYI(1, ("add_index: xtInsert failed!\n"));
+			/* put the inline table back over the xtree root */
+			memcpy(&jfs_ip->i_dirtable, temp_table,
+			       sizeof(temp_table));
+			goto clean_up;
+		}
+		ip->i_size = PSIZE;
+		ip->i_blocks += LBLK2PBLK(sb, sbi->nbperpage);
+
+		if ((mp = get_metapage(ip, 0, ip->i_blksize, 0)) == 0) {
+			jERROR(1, ("add_index: get_metapage failed!\n"));
+			xtTruncate(tid, ip, 0, COMMIT_PWMAP);
+			/* put the inline table back over the xtree root */
+			memcpy(&jfs_ip->i_dirtable, temp_table,
+			       sizeof(temp_table));
+			goto clean_up;
+		}
+		tlck = txLock(tid, ip, mp, tlckDATA);
+		llck = (linelock_t *) & tlck->lock;
+		ASSERT(llck->index == 0);
+		lv = &llck->lv[0];
+
+		lv->offset = 0;
+		lv->length = 6;	/* tlckDATA slot size is 16 bytes */
+		llck->index++;
+
+		memcpy(mp->data, temp_table, sizeof(temp_table));
+
+		mark_metapage_dirty(mp);
+		release_metapage(mp);
+
+		/*
+		 * Logging is now directed by xtree tlocks
+		 */
+		clear_cflag(COMMIT_Dirtable, ip);
+
+		migrated = 1;
+	}
+
+	offset = (index - 2) * sizeof(dir_table_slot_t);
+	page_offset = offset & (PSIZE - 1);
+	blkno = ((offset + 1) >> L2PSIZE) << sbi->l2nbperpage;
+	if (page_offset == 0) {
+		/*
+		 * This will be the beginning of a new page
+		 */
+		xaddr = 0;
+		if (xtInsert(tid, ip, 0, blkno, sbi->nbperpage, &xaddr, 0)) {
+			jFYI(1, ("add_index: xtInsert failed!\n"));
+			goto clean_up;
+		}
+		ip->i_size += PSIZE;
+		ip->i_blocks += LBLK2PBLK(sb, sbi->nbperpage);
+
+		if ((mp = get_metapage(ip, blkno, PSIZE, 0)))
+			memset(mp->data, 0, PSIZE);	/* Just looks better */
+		else
+			xtTruncate(tid, ip, offset, COMMIT_PWMAP);
+	} else
+		mp = read_metapage(ip, blkno, PSIZE, 0);
+
+	if (mp == 0) {
+		jERROR(1, ("add_index: get/read_metapage failed!\n"));
+		/*
+		 * once the table lives in the xtree, next_index must stay
+		 * above the inline threshold (see header comment).
+		 */
+		if (migrated)
+			return -1;
+		goto clean_up;
+	}
+
+	lock_index(tid, ip, mp, index);
+
+	dirtab_slot =
+	    (dir_table_slot_t *) ((char *) mp->data + page_offset);
+	dirtab_slot->flag = DIR_INDEX_VALID;
+	dirtab_slot->slot = slot;
+	DTSaddress(dirtab_slot, bn);
+
+	mark_metapage_dirty(mp);
+	release_metapage(mp);
+
+	return index;
+
+      clean_up:
+	/* hand the reserved index back */
+	jfs_ip->next_index--;
+
+	return -1;
+}
+
+/*
+ *	free_index()
+ *
+ * Marks the directory index table entry <index> as free and stores
+ * <next> (little-endian) in its addr2 field.  Does nothing if the
+ * entry cannot be located.
+ */
+static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next)
+{
+	metapage_t *page = 0;
+	dir_table_slot_t *entry = find_index(ip, index, &page);
+
+	if (entry == 0)
+		return;
+
+	entry->flag = DIR_INDEX_FREE;
+	entry->addr1 = 0;
+	entry->slot = 0;
+	entry->addr2 = cpu_to_le32(next);
+
+	if (page == 0) {
+		/* inline table */
+		set_cflag(COMMIT_Dirtable, ip);
+		return;
+	}
+
+	/* external table page: linelock the entry and write it back */
+	lock_index(tid, ip, page, index);
+	mark_metapage_dirty(page);
+	release_metapage(page);
+}
+
+/*
+ *	modify_index()
+ *
+ * Rewrites the address (bn) and slot of directory index table entry
+ * <index>.  *mp is handed to find_index() and is left pinned for the
+ * caller; it is not released here.  Does nothing if the entry cannot
+ * be located.
+ */
+static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn,
+			 int slot, metapage_t ** mp)
+{
+	dir_table_slot_t *entry = find_index(ip, index, mp);
+
+	if (!entry)
+		return;
+
+	DTSaddress(entry, bn);
+	entry->slot = slot;
+
+	if (*mp == 0) {
+		/* inline table */
+		set_cflag(COMMIT_Dirtable, ip);
+		return;
+	}
+
+	/* external table page */
+	lock_index(tid, ip, *mp, index);
+	mark_metapage_dirty(*mp);
+}
+
+/*
+ *	get_index()
+ *
+ * Copies directory index table entry <index> into *dirtab_slot.
+ *
+ * return: 0 - success;
+ *	   -EIO - entry could not be located;
+ */
+static int get_index(struct inode *ip, u32 index,
+		     dir_table_slot_t * dirtab_slot)
+{
+	metapage_t *page = 0;
+	dir_table_slot_t *entry = find_index(ip, index, &page);
+
+	if (!entry)
+		return -EIO;
+
+	memcpy(dirtab_slot, entry, sizeof(dir_table_slot_t));
+
+	/* the copy is taken; the table page (if any) is no longer needed */
+	if (page != 0)
+		release_metapage(page);
+
+	return 0;
+}
+
+/*
+ * dtSearch()
+ *
+ * function:
+ * Search for the entry with specified key
+ *
+ * parameter:
+ * ip - directory inode
+ * key - name to search for (copied, and folded to uppercase
+ * when the file system is mounted with JFS_OS2)
+ * data - in/out inode number:
+ * in: for JFS_REMOVE/JFS_RENAME, the inode number the
+ * matching entry is expected to carry;
+ * out: inode number of the entry found, or 0 on a
+ * search miss for JFS_CREATE/JFS_FINDDIR
+ * btstack - traversal stack; reset here, receives the parent
+ * (bn, index) frames and the search result on top
+ * flag - JFS_LOOKUP, JFS_CREATE, JFS_REMOVE, JFS_RENAME or
+ * JFS_FINDDIR
+ *
+ * return: 0 - search result on stack, leaf page pinned;
+ * (for a JFS_LOOKUP hit the leaf page is unpinned
+ * and only *data is returned)
+ * EEXIST - JFS_CREATE and the name already exists
+ * ENOENT - JFS_LOOKUP/JFS_REMOVE/JFS_RENAME and no match
+ * ESTALE - JFS_REMOVE/JFS_RENAME and the entry found carries
+ * a different inode number than *data
+ * ENOMEM - no memory for the folded copy of the key
+ * errno - I/O error
+ * on any non-zero return no page is left pinned.
+ */
+int dtSearch(struct inode *ip,
+ component_t * key, ino_t * data, btstack_t * btstack, int flag)
+{
+ int rc = 0;
+ int cmp = 1; /* init for empty page */
+ s64 bn;
+ metapage_t *mp;
+ dtpage_t *p;
+ s8 *stbl;
+ int base, index, lim;
+ btframe_t *btsp;
+ pxd_t *pxd;
+ int psize = 288; /* initial in-line directory */
+ ino_t inumber;
+ component_t ciKey;
+ struct super_block *sb = ip->i_sb;
+
+ ciKey.name =
+ (wchar_t *) kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t),
+ GFP_NOFS);
+ if (ciKey.name == 0) {
+ rc = ENOMEM;
+ goto dtSearch_Exit2;
+ }
+
+
+ /* uppercase search key for c-i directory */
+ UniStrcpy(ciKey.name, key->name);
+ ciKey.namlen = key->namlen;
+
+ /* only uppercase if case-insensitive support is on */
+ if ((JFS_SBI(sb)->mntflag & JFS_OS2) == JFS_OS2) {
+ ciToUpper(&ciKey);
+ }
+ BT_CLR(btstack); /* reset stack */
+
+ /* init level count for max pages to split */
+ btstack->nsplit = 1;
+
+ /*
+ * search down tree from root:
+ *
+ * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
+ * internal page, child page Pi contains entry with k, Ki <= K < Kj.
+ *
+ * if entry with search key K is not found
+ * internal page search find the entry with largest key Ki
+ * less than K which point to the child page to search;
+ * leaf page search find the entry with smallest key Kj
+ * greater than K so that the returned index is the position of
+ * the entry to be shifted right for insertion of new entry.
+ * for empty tree, search key is greater than any key of the tree.
+ *
+ * by convention, root bn = 0.
+ */
+ for (bn = 0;;) {
+ /* get/pin the page to search */
+ DT_GETPAGE(ip, bn, mp, psize, p, rc);
+ if (rc)
+ goto dtSearch_Exit1;
+
+ /* get sorted entry table of the page */
+ stbl = DT_GETSTBL(p);
+
+ /*
+ * binary search with search key K on the current page.
+ */
+ for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) {
+ index = base + (lim >> 1);
+
+ if (p->header.flag & BT_LEAF) {
+ /* uppercase leaf name to compare */
+ cmp =
+ ciCompare(&ciKey, p, stbl[index],
+ JFS_SBI(sb)->mntflag);
+ } else {
+ /* router key is in uppercase */
+
+ cmp = dtCompare(&ciKey, p, stbl[index]);
+
+
+ }
+ if (cmp == 0) {
+ /*
+ * search hit
+ */
+ /* search hit - leaf page:
+ * return the entry found
+ */
+ if (p->header.flag & BT_LEAF) {
+ inumber = le32_to_cpu(
+ ((ldtentry_t *) & p->slot[stbl[index]])->inumber);
+
+ /*
+ * search for JFS_LOOKUP
+ */
+ if (flag == JFS_LOOKUP) {
+ *data = inumber;
+ rc = 0;
+ goto out;
+ }
+
+ /*
+ * search for JFS_CREATE
+ */
+ if (flag == JFS_CREATE) {
+ *data = inumber;
+ rc = EEXIST;
+ goto out;
+ }
+
+ /*
+ * search for JFS_REMOVE or JFS_RENAME
+ */
+ if ((flag == JFS_REMOVE ||
+ flag == JFS_RENAME) &&
+ *data != inumber) {
+ rc = ESTALE;
+ goto out;
+ }
+
+ /*
+ * JFS_REMOVE|JFS_FINDDIR|JFS_RENAME
+ */
+ /* save search result */
+ *data = inumber;
+ btsp = btstack->top;
+ btsp->bn = bn;
+ btsp->index = index;
+ btsp->mp = mp;
+
+ rc = 0;
+ goto dtSearch_Exit1;
+ }
+
+ /* search hit - internal page:
+ * descend/search its child page
+ */
+ goto getChild;
+ }
+
+ if (cmp > 0) {
+ base = index + 1;
+ --lim;
+ }
+ }
+
+ /*
+ * search miss
+ *
+ * base is the smallest index with key (Kj) greater than
+ * search key (K) and may be zero or (maxindex + 1) index.
+ */
+ /*
+ * search miss - leaf page
+ *
+ * return location of entry (base) where new entry with
+ * search key K is to be inserted.
+ */
+ if (p->header.flag & BT_LEAF) {
+ /*
+ * search for JFS_LOOKUP, JFS_REMOVE, or JFS_RENAME
+ */
+ if (flag == JFS_LOOKUP || flag == JFS_REMOVE ||
+ flag == JFS_RENAME) {
+ rc = ENOENT;
+ goto out;
+ }
+
+ /*
+ * search for JFS_CREATE|JFS_FINDDIR:
+ *
+ * save search result
+ */
+ *data = 0;
+ btsp = btstack->top;
+ btsp->bn = bn;
+ btsp->index = base;
+ btsp->mp = mp;
+
+ rc = 0;
+ goto dtSearch_Exit1;
+ }
+
+ /*
+ * search miss - internal page
+ *
+ * if base is non-zero, decrement base by one to get the parent
+ * entry of the child page to search.
+ */
+ index = base ? base - 1 : base;
+
+ /*
+ * go down to child page
+ */
+ getChild:
+ /* update max. number of pages to split */
+ if (btstack->nsplit >= 8) {
+ /* Something's corrupted, mark filesystem dirty so
+ * chkdsk will fix it.
+ */
+ jERROR(1, ("stack overrun in dtSearch!\n"));
+ updateSuper(sb, FM_DIRTY);
+ rc = EIO;
+ goto out;
+ }
+ btstack->nsplit++;
+
+ /* push (bn, index) of the parent page/entry */
+ BT_PUSH(btstack, bn, index);
+
+ /* get the child page block number */
+ pxd = (pxd_t *) & p->slot[stbl[index]];
+ bn = addressPXD(pxd);
+ psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize;
+
+ /* unpin the parent page */
+ DT_PUTPAGE(mp);
+ }
+
+ out:
+ DT_PUTPAGE(mp);
+
+ dtSearch_Exit1:
+
+ kfree(ciKey.name);
+
+ dtSearch_Exit2:
+
+ return rc;
+}
+
+
+/*
+ *	dtInsert()
+ *
+ * function: insert an entry to directory tree
+ *
+ * parameter:
+ *	tid	- transaction id
+ *	ip	- directory inode
+ *	name	- name of the new entry
+ *	fsn	- inode number of the new entry
+ *	btstack	- search result left by dtSearch(): leaf page pinned,
+ *		  index at which to insert on top of the stack
+ *
+ * return: 0 - success;
+ *	   EMLINK - directory index numbers exhausted (next_index == -1);
+ *	   errno - failure reported by dtSplitUp();
+ *	   the leaf page is unpinned on every return path.
+ */
+int dtInsert(tid_t tid, struct inode *ip,
+	     component_t * name, ino_t * fsn, btstack_t * btstack)
+{
+	metapage_t *leaf_mp;	/* meta-page buffer of the leaf */
+	dtpage_t *leaf;		/* leaf page to insert into */
+	s64 leaf_bn;
+	int pos;		/* insertion index within the leaf */
+	int nslots;		/* slots needed by the new entry */
+	int first;		/* first stbl slot touched by the insert */
+	dtsplit_t split;	/* split information */
+	ddata_t data;
+	dtlock_t *dtlck;
+	tlock_t *tlck;
+	lv_t *lv;
+
+	/*
+	 *      retrieve search result
+	 *
+	 * dtSearch() returns (leaf page pinned, index at which to insert).
+	 * n.b. dtSearch() may return index of (maxindex + 1) of
+	 * the full page.
+	 */
+	DT_GETSEARCH(ip, btstack->top, leaf_bn, leaf_mp, leaf, pos);
+
+	/*
+	 *      build the data of the new entry and size it
+	 */
+	if (!DO_INDEX(ip)) {
+		nslots = NDTLEAF_LEGACY(name->namlen);
+		data.leaf.ip = 0;	/* signifies legacy directory format */
+	} else {
+		if (JFS_IP(ip)->next_index == -1) {
+			DT_PUTPAGE(leaf_mp);
+			return EMLINK;
+		}
+		nslots = NDTLEAF(name->namlen);
+		data.leaf.tid = tid;
+		data.leaf.ip = ip;
+	}
+	data.leaf.ino = cpu_to_le32(*fsn);
+
+	/*
+	 *      not enough room in the leaf: extend/split it.
+	 *
+	 * dtSplitUp() will insert the entry and unpin the leaf page.
+	 */
+	if (nslots > leaf->header.freecnt) {
+		split.mp = leaf_mp;
+		split.index = pos;
+		split.nslot = nslots;
+		split.key = name;
+		split.data = &data;
+		return dtSplitUp(tid, ip, &split, btstack);
+	}
+
+	/*
+	 *      enough room: insert the new entry in place
+	 */
+	BT_MARK_DIRTY(leaf_mp, ip);
+
+	/* transaction lock on the leaf page */
+	tlck = txLock(tid, ip, leaf_mp, tlckDTREE | tlckENTRY);
+	dtlck = (dtlock_t *) & tlck->lock;
+	ASSERT(dtlck->index == 0);
+
+	/* linelock header */
+	lv = (lv_t *) & dtlck->lv[0];
+	lv->offset = 0;
+	lv->length = 1;
+	dtlck->index++;
+
+	dtInsertEntry(leaf, pos, name, &data, &dtlck);
+
+	/* linelock stbl of non-root leaf page */
+	if (!(leaf->header.flag & BT_ROOT)) {
+		if (dtlck->index >= dtlck->maxcnt)
+			dtlck = (dtlock_t *) txLinelock(dtlck);
+		lv = (lv_t *) & dtlck->lv[dtlck->index];
+		first = pos >> L2DTSLOTSIZE;
+		lv->offset = leaf->header.stblindex + first;
+		lv->length =
+		    ((leaf->header.nextindex - 1) >> L2DTSLOTSIZE) - first + 1;
+		dtlck->index++;
+	}
+
+	/* unpin the leaf page */
+	DT_PUTPAGE(leaf_mp);
+
+	return 0;
+}
+
+
+/*
+ *	dtSplitUp()
+ *
+ * function: propagate insertion bottom up;
+ *
+ * Makes room for the entry described by <split> by splitting the root,
+ * extending the first leaf extent, or splitting the leaf page, and then
+ * inserts router entries into the parents recorded on <btstack>,
+ * splitting those in turn as needed.
+ *
+ * parameter:
+ *	tid	- transaction id
+ *	ip	- directory inode
+ *	split	- page to split and entry to insert (split->mp pinned);
+ *		  reused here to describe parent splits
+ *	btstack	- traversal stack left by the search
+ *
+ * return: 0 - success;
+ *	   errno - failure;
+ *	leaf page unpinned;
+ */
+static int dtSplitUp(tid_t tid,
+	  struct inode *ip, dtsplit_t * split, btstack_t * btstack)
+{
+	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+	int rc = 0;
+	metapage_t *smp;
+	dtpage_t *sp;		/* split page */
+	metapage_t *rmp;
+	dtpage_t *rp;		/* new right page split from sp */
+	pxd_t rpxd;		/* new right page extent descriptor */
+	metapage_t *lmp;
+	dtpage_t *lp;		/* left child page */
+	int skip;		/* index of entry of insertion */
+	btframe_t *parent;	/* parent page entry on traverse stack */
+	s64 xaddr, nxaddr;
+	int xlen, xsize;
+	pxdlist_t pxdlist;
+	pxd_t *pxd;
+	component_t key = { 0, 0 };
+	ddata_t *data = split->data;
+	int n;
+	dtlock_t *dtlck;
+	tlock_t *tlck;
+	lv_t *lv;
+
+	/* get split page */
+	smp = split->mp;
+	sp = DT_PAGE(ip, smp);
+
+	key.name =
+	    (wchar_t *) kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t),
+				GFP_NOFS);
+	if (key.name == 0) {
+		DT_PUTPAGE(smp);
+		rc = ENOMEM;
+		goto dtSplitUp_Exit;
+	}
+
+	/*
+	 *      split leaf page
+	 *
+	 * The split routines insert the new entry, and
+	 * acquire txLock as appropriate.
+	 */
+	/*
+	 *      split root leaf page:
+	 */
+	if (sp->header.flag & BT_ROOT) {
+		/*
+		 * allocate a single extent child page
+		 */
+		xlen = 1;
+		n = sbi->bsize >> L2DTSLOTSIZE;
+		n -= (n + 31) >> L2DTSLOTSIZE;	/* stbl size */
+		n -= DTROOTMAXSLOT - sp->header.freecnt; /* header + entries */
+		if (n <= split->nslot)
+			xlen++;
+		if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr))) {
+			/* honour the "leaf page unpinned" contract */
+			DT_PUTPAGE(smp);
+			goto freeKeyName;
+		}
+
+		pxdlist.maxnpxd = 1;
+		pxdlist.npxd = 0;
+		pxd = &pxdlist.pxd[0];
+		PXDaddress(pxd, xaddr);
+		PXDlength(pxd, xlen);
+		split->pxdlist = &pxdlist;
+		rc = dtSplitRoot(tid, ip, split, &rmp);
+
+		if (rc) {
+			/*
+			 * rmp is not known to be valid on failure, so it
+			 * must not be released; give the extent back.
+			 * NOTE(review): assumes dtSplitRoot() does not
+			 * free the extent itself on failure - confirm.
+			 */
+			dbFree(ip, xaddr, (s64) xlen);
+		} else {
+			DT_PUTPAGE(rmp);
+		}
+
+		DT_PUTPAGE(smp);
+
+		goto freeKeyName;
+	}
+
+	/*
+	 *      extend first leaf page
+	 *
+	 * extend the 1st extent if less than buffer page size
+	 * (dtExtendPage() returns leaf page unpinned)
+	 */
+	pxd = &sp->header.self;
+	xlen = lengthPXD(pxd);
+	xsize = xlen << sbi->l2bsize;
+	if (xsize < PSIZE) {
+		xaddr = addressPXD(pxd);
+		n = xsize >> L2DTSLOTSIZE;
+		n -= (n + 31) >> L2DTSLOTSIZE;	/* stbl size */
+		if ((n + sp->header.freecnt) <= split->nslot)
+			n = xlen + (xlen << 1);
+		else
+			n = xlen;
+		if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen,
+				    (s64) n, &nxaddr)))
+			goto extendOut;
+
+		pxdlist.maxnpxd = 1;
+		pxdlist.npxd = 0;
+		pxd = &pxdlist.pxd[0];
+		PXDaddress(pxd, nxaddr);
+		PXDlength(pxd, xlen + n);
+		split->pxdlist = &pxdlist;
+		if ((rc = dtExtendPage(tid, ip, split, btstack))) {
+			nxaddr = addressPXD(pxd);
+			if (xaddr != nxaddr) {
+				/* free relocated extent */
+				xlen = lengthPXD(pxd);
+				dbFree(ip, nxaddr, (s64) xlen);
+			} else {
+				/* free extended delta */
+				xlen = lengthPXD(pxd) - n;
+				xaddr = addressPXD(pxd) + xlen;
+				dbFree(ip, xaddr, (s64) n);
+			}
+		}
+
+	      extendOut:
+		DT_PUTPAGE(smp);
+		goto freeKeyName;
+	}
+
+	/*
+	 *      split leaf page <sp> into <sp> and a new right page <rp>.
+	 *
+	 * return <rp> pinned and its extent descriptor <rpxd>
+	 */
+	/*
+	 * allocate new directory page extent and
+	 * new index page(s) to cover page split(s)
+	 *
+	 * allocation hint: ?
+	 */
+	n = btstack->nsplit;
+	pxdlist.maxnpxd = pxdlist.npxd = 0;
+	xlen = sbi->nbperpage;
+	for (pxd = pxdlist.pxd; n > 0; n--, pxd++) {
+		if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr)) == 0) {
+			PXDaddress(pxd, xaddr);
+			PXDlength(pxd, xlen);
+			pxdlist.maxnpxd++;
+			continue;
+		}
+
+		DT_PUTPAGE(smp);
+
+		/* undo allocation */
+		goto splitOut;
+	}
+
+	split->pxdlist = &pxdlist;
+	if ((rc = dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd))) {
+		DT_PUTPAGE(smp);
+
+		/* undo allocation */
+		goto splitOut;
+	}
+
+	/*
+	 * propagate up the router entry for the leaf page just split
+	 *
+	 * insert a router entry for the new page into the parent page,
+	 * propagate the insert/split up the tree by walking back the stack
+	 * of (bn of parent page, index of child page entry in parent page)
+	 * that were traversed during the search for the page that split.
+	 *
+	 * the propagation of insert/split up the tree stops if the root
+	 * splits or the page inserted into doesn't have to split to hold
+	 * the new entry.
+	 *
+	 * the parent entry for the split page remains the same, and
+	 * a new entry is inserted at its right with the first key and
+	 * block number of the new right page.
+	 *
+	 * There are a maximum of 4 pages pinned at any time:
+	 * two children, left parent and right parent (when the parent splits).
+	 * keep the child pages pinned while working on the parent.
+	 * make sure that all pins are released at exit.
+	 */
+	while ((parent = BT_POP(btstack)) != NULL) {
+		/* parent page specified by stack frame <parent> */
+
+		/* keep current child pages (<lp>, <rp>) pinned */
+		lmp = smp;
+		lp = sp;
+
+		/*
+		 * insert router entry in parent for new right child page <rp>
+		 */
+		/* get the parent page <sp> */
+		DT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc);
+		if (rc) {
+			DT_PUTPAGE(lmp);
+			DT_PUTPAGE(rmp);
+			goto splitOut;
+		}
+
+		/*
+		 * The new key entry goes ONE AFTER the index of parent entry,
+		 * because the split was to the right.
+		 */
+		skip = parent->index + 1;
+
+		/*
+		 * compute the key for the router entry
+		 *
+		 * key suffix compression:
+		 * for internal pages that have leaf pages as children,
+		 * retain only what's needed to distinguish between
+		 * the new entry and the entry on the page to its left.
+		 * If the keys compare equal, retain the entire key.
+		 *
+		 * note that compression is performed only at computing
+		 * router key at the lowest internal level.
+		 * further compression of the key between pairs of higher
+		 * level internal pages loses too much information and
+		 * the search may fail.
+		 * (e.g., two adjacent leaf pages of {a, ..., x} {xx, ...,}
+		 * results in two adjacent parent entries (a)(xx).
+		 * if split occurs between these two entries, and
+		 * if compression is applied, the router key of parent entry
+		 * of right page (x) will divert search for x into right
+		 * subtree and miss x in the left subtree.)
+		 *
+		 * the entire key must be retained for the next-to-leftmost
+		 * internal key at any level of the tree, or search may fail
+		 * (e.g., ?)
+		 */
+		switch (rp->header.flag & BT_TYPE) {
+		case BT_LEAF:
+			/*
+			 * compute the length of prefix for suffix compression
+			 * between last entry of left page and first entry
+			 * of right page
+			 */
+			if ((sp->header.flag & BT_ROOT && skip > 1) ||
+			    sp->header.prev != 0 || skip > 1) {
+				/* compute uppercase router prefix key */
+				ciGetLeafPrefixKey(lp,
+						   lp->header.nextindex - 1,
+						   rp, 0, &key, sbi->mntflag);
+			} else {
+				/* next to leftmost entry of
+				   lowest internal level */
+
+				/* compute uppercase router key */
+				dtGetKey(rp, 0, &key, sbi->mntflag);
+				key.name[key.namlen] = 0;
+
+				if ((sbi->mntflag & JFS_OS2) == JFS_OS2)
+					ciToUpper(&key);
+			}
+
+			n = NDTINTERNAL(key.namlen);
+			break;
+
+		case BT_INTERNAL:
+			dtGetKey(rp, 0, &key, sbi->mntflag);
+			n = NDTINTERNAL(key.namlen);
+			break;
+
+		default:
+			jERROR(2, ("dtSplitUp(): UFO!\n"));
+			break;
+		}
+
+		/* unpin left child page */
+		DT_PUTPAGE(lmp);
+
+		/*
+		 * compute the data for the router entry
+		 */
+		data->xd = rpxd;	/* child page xd */
+
+		/*
+		 * parent page is full - split the parent page
+		 */
+		if (n > sp->header.freecnt) {
+			/* init for parent page split */
+			split->mp = smp;
+			split->index = skip;	/* index at insert */
+			split->nslot = n;
+			split->key = &key;
+			/* split->data = data; */
+
+			/* unpin right child page */
+			DT_PUTPAGE(rmp);
+
+			/* The split routines insert the new entry,
+			 * acquire txLock as appropriate.
+			 * return <rp> pinned and its block number <rbn>.
+			 */
+			rc = (sp->header.flag & BT_ROOT) ?
+			    dtSplitRoot(tid, ip, split, &rmp) :
+			    dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd);
+			if (rc) {
+				DT_PUTPAGE(smp);
+				goto splitOut;
+			}
+
+			/* smp and rmp are pinned */
+		}
+		/*
+		 * parent page is not full - insert router entry in parent page
+		 */
+		else {
+			BT_MARK_DIRTY(smp, ip);
+			/*
+			 * acquire a transaction lock on the parent page
+			 */
+			tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY);
+			dtlck = (dtlock_t *) & tlck->lock;
+			ASSERT(dtlck->index == 0);
+			lv = (lv_t *) & dtlck->lv[0];
+
+			/* linelock header */
+			lv->offset = 0;
+			lv->length = 1;
+			dtlck->index++;
+
+			/* linelock stbl of non-root parent page */
+			if (!(sp->header.flag & BT_ROOT)) {
+				lv++;
+				n = skip >> L2DTSLOTSIZE;
+				lv->offset = sp->header.stblindex + n;
+				lv->length =
+				    ((sp->header.nextindex -
+				      1) >> L2DTSLOTSIZE) - n + 1;
+				dtlck->index++;
+			}
+
+			dtInsertEntry(sp, skip, &key, data, &dtlck);
+
+			/* exit propagate up */
+			break;
+		}
+	}
+
+	/* unpin current split and its right page */
+	DT_PUTPAGE(smp);
+	DT_PUTPAGE(rmp);
+
+	/*
+	 * free remaining extents allocated for split
+	 */
+      splitOut:
+	n = pxdlist.npxd;
+	pxd = &pxdlist.pxd[n];
+	for (; n < pxdlist.maxnpxd; n++, pxd++)
+		dbFree(ip, addressPXD(pxd), (s64) lengthPXD(pxd));
+
+      freeKeyName:
+	kfree(key.name);
+
+      dtSplitUp_Exit:
+
+	return rc;
+}
+
+
+/*
+ * dtSplitPage()
+ *
+ * function: Split a non-root page of a btree.
+ *
+ * parameter:
+ *
+ * return: 0 - success;
+ * errno - failure;
+ * return split and new page pinned;
+ */
+static int dtSplitPage(tid_t tid, struct inode *ip, dtsplit_t * split,
+	    metapage_t ** rmpp, dtpage_t ** rpp, pxd_t * rpxdp)
+{
+	/*
+	 * tid   - transaction id handed to txLock()/modify_index()
+	 * ip    - directory inode
+	 * split - split descriptor: page to split (split->mp), extent list
+	 *         supplying the new page (split->pxdlist), and the entry
+	 *         to insert (split->index/key/data/nslot)
+	 * rmpp  - out: metapage of the new right page
+	 * rpp   - out: new right page
+	 * rpxdp - out: extent descriptor of the new right page
+	 *
+	 * returns 0 on success, EIO if the new page cannot be obtained, or
+	 * the error from reading the old right sibling.  On success the
+	 * length of the new extent is added to ip->i_blocks.
+	 */
+	struct super_block *sb = ip->i_sb;
+	int rc = 0;
+	metapage_t *smp;
+	dtpage_t *sp;
+	metapage_t *rmp;
+	dtpage_t *rp;		/* new right page allocated */
+	s64 rbn;		/* new right page block number */
+	metapage_t *mp;
+	dtpage_t *p;
+	s64 nextbn;
+	pxdlist_t *pxdlist;
+	pxd_t *pxd;
+	int skip, nextindex, half, left, nxt, off, si;
+	ldtentry_t *ldtentry;
+	idtentry_t *idtentry;
+	u8 *stbl;
+	dtslot_t *f;
+	int fsi, stblsize;
+	int n;
+	dtlock_t *sdtlck, *rdtlck;
+	tlock_t *tlck;
+	dtlock_t *dtlck;
+	lv_t *slv, *rlv, *lv;
+
+	/* get split page */
+	smp = split->mp;
+	sp = DT_PAGE(ip, smp);
+
+	/*
+	 * allocate the new right page for the split
+	 * (the extent is the next unused entry of the caller's pxdlist)
+	 */
+	pxdlist = split->pxdlist;
+	pxd = &pxdlist->pxd[pxdlist->npxd];
+	pxdlist->npxd++;
+	rbn = addressPXD(pxd);
+	rmp = get_metapage(ip, rbn, PSIZE, 1);
+	if (rmp == NULL)
+		return EIO;
+
+	jEVENT(0,
+	       ("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p\n", ip, smp, rmp));
+
+	BT_MARK_DIRTY(rmp, ip);
+	/*
+	 * acquire a transaction lock on the new right page
+	 */
+	tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW);
+	rdtlck = (dtlock_t *) & tlck->lock;
+
+	rp = (dtpage_t *) rmp->data;
+	*rpp = rp;
+	rp->header.self = *pxd;
+
+	BT_MARK_DIRTY(smp, ip);
+	/*
+	 * acquire a transaction lock on the split page
+	 *
+	 * action:
+	 */
+	tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY);
+	sdtlck = (dtlock_t *) & tlck->lock;
+
+	/* linelock header of split page */
+	ASSERT(sdtlck->index == 0);
+	slv = (lv_t *) & sdtlck->lv[0];
+	slv->offset = 0;
+	slv->length = 1;
+	sdtlck->index++;
+
+	/*
+	 * initialize/update sibling pointers between sp and rp
+	 */
+	nextbn = le64_to_cpu(sp->header.next);
+	rp->header.next = cpu_to_le64(nextbn);
+	rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self));
+	sp->header.next = cpu_to_le64(rbn);
+
+	/*
+	 * initialize new right page
+	 */
+	rp->header.flag = sp->header.flag;
+
+	/* compute sorted entry table at start of extent data area */
+	rp->header.nextindex = 0;
+	rp->header.stblindex = 1;
+
+	n = PSIZE >> L2DTSLOTSIZE;
+	rp->header.maxslot = n;
+	stblsize = (n + 31) >> L2DTSLOTSIZE;	/* in unit of slot */
+
+	/* init freelist */
+	fsi = rp->header.stblindex + stblsize;
+	rp->header.freelist = fsi;
+	rp->header.freecnt = rp->header.maxslot - fsi;
+
+	/*
+	 * sequential append at tail: append without split
+	 *
+	 * If splitting the last page on a level because of appending
+	 * a entry to it (skip is maxentry), it's likely that the access is
+	 * sequential. Adding an empty page on the side of the level is less
+	 * work and can push the fill factor much higher than normal.
+	 * If we're wrong it's no big deal, we'll just do the split the right
+	 * way next time.
+	 * (It may look like it's equally easy to do a similar hack for
+	 * reverse sorted data, that is, split the tree left,
+	 * but it's not. Be my guest.)
+	 */
+	if (nextbn == 0 && split->index == sp->header.nextindex) {
+		/* linelock header + stbl (first slot) of new page */
+		rlv = (lv_t *) & rdtlck->lv[rdtlck->index];
+		rlv->offset = 0;
+		rlv->length = 2;
+		rdtlck->index++;
+
+		/*
+		 * initialize freelist of new right page
+		 */
+		f = &rp->slot[fsi];
+		for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
+			f->next = fsi;
+		f->next = -1;
+
+		/* insert entry at the first entry of the new right page */
+		dtInsertEntry(rp, 0, split->key, split->data, &rdtlck);
+
+		goto out;
+	}
+
+	/*
+	 * non-sequential insert (at possibly middle page)
+	 */
+
+	/*
+	 * update prev pointer of previous right sibling page;
+	 */
+	if (nextbn != 0) {
+		DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
+		if (rc) {
+			/*
+			 * The new right page obtained from get_metapage()
+			 * above is not handed back to the caller on this
+			 * path, so drop it here instead of leaking it.
+			 */
+			discard_metapage(rmp);
+			return rc;
+		}
+
+		BT_MARK_DIRTY(mp, ip);
+		/*
+		 * acquire a transaction lock on the next page
+		 */
+		tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
+		jEVENT(0,
+		       ("dtSplitPage: tlck = 0x%p, ip = 0x%p, mp=0x%p\n",
+			tlck, ip, mp));
+		dtlck = (dtlock_t *) & tlck->lock;
+
+		/* linelock header of previous right sibling page */
+		lv = (lv_t *) & dtlck->lv[dtlck->index];
+		lv->offset = 0;
+		lv->length = 1;
+		dtlck->index++;
+
+		p->header.prev = cpu_to_le64(rbn);
+
+		DT_PUTPAGE(mp);
+	}
+
+	/*
+	 * split the data between the split and right pages.
+	 */
+	skip = split->index;
+	half = (PSIZE >> L2DTSLOTSIZE) >> 1;	/* swag */
+	left = 0;
+
+	/*
+	 * compute fill factor for split pages
+	 *
+	 * <nxt> traces the next entry to move to rp
+	 * <off> traces the next entry to stay in sp
+	 *
+	 * The loop stops once the slots kept on the left (including the
+	 * new entry when it falls at position <skip>) reach <half>.
+	 */
+	stbl = (u8 *) & sp->slot[sp->header.stblindex];
+	nextindex = sp->header.nextindex;
+	for (nxt = off = 0; nxt < nextindex; ++off) {
+		if (off == skip)
+			/* check for fill factor with new entry size */
+			n = split->nslot;
+		else {
+			si = stbl[nxt];
+			switch (sp->header.flag & BT_TYPE) {
+			case BT_LEAF:
+				ldtentry = (ldtentry_t *) & sp->slot[si];
+				if (DO_INDEX(ip))
+					n = NDTLEAF(ldtentry->namlen);
+				else
+					n = NDTLEAF_LEGACY(ldtentry->
+							   namlen);
+				break;
+
+			case BT_INTERNAL:
+				idtentry = (idtentry_t *) & sp->slot[si];
+				n = NDTINTERNAL(idtentry->namlen);
+				break;
+
+			default:
+				break;
+			}
+
+			++nxt;	/* advance to next entry to move in sp */
+		}
+
+		left += n;
+		if (left >= half)
+			break;
+	}
+
+	/* <nxt> points to the 1st entry to move */
+
+	/*
+	 * move entries to right page
+	 *
+	 * dtMoveEntry() initializes rp and reserves entry for insertion
+	 *
+	 * split page moved out entries are linelocked;
+	 * new/right page moved in entries are linelocked;
+	 */
+	/* linelock header + stbl of new right page */
+	rlv = (lv_t *) & rdtlck->lv[rdtlck->index];
+	rlv->offset = 0;
+	rlv->length = 5;
+	rdtlck->index++;
+
+	dtMoveEntry(sp, nxt, rp, &sdtlck, &rdtlck, DO_INDEX(ip));
+
+	sp->header.nextindex = nxt;
+
+	/*
+	 * finalize freelist of new right page
+	 */
+	fsi = rp->header.freelist;
+	f = &rp->slot[fsi];
+	for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
+		f->next = fsi;
+	f->next = -1;
+
+	/*
+	 * Update directory index table for entries now in right page
+	 */
+	if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) {
+		mp = 0;
+		stbl = DT_GETSTBL(rp);
+		for (n = 0; n < rp->header.nextindex; n++) {
+			ldtentry = (ldtentry_t *) & rp->slot[stbl[n]];
+			modify_index(tid, ip, le32_to_cpu(ldtentry->index),
+				     rbn, n, &mp);
+		}
+		if (mp)
+			release_metapage(mp);
+	}
+
+	/*
+	 * the skipped index was on the left page,
+	 */
+	if (skip <= off) {
+		/* insert the new entry in the split page */
+		dtInsertEntry(sp, skip, split->key, split->data, &sdtlck);
+
+		/* linelock stbl of split page */
+		if (sdtlck->index >= sdtlck->maxcnt)
+			sdtlck = (dtlock_t *) txLinelock(sdtlck);
+		slv = (lv_t *) & sdtlck->lv[sdtlck->index];
+		n = skip >> L2DTSLOTSIZE;
+		slv->offset = sp->header.stblindex + n;
+		slv->length =
+		    ((sp->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1;
+		sdtlck->index++;
+	}
+	/*
+	 * the skipped index was on the right page,
+	 */
+	else {
+		/* adjust the skip index to reflect the new position */
+		skip -= nxt;
+
+		/* insert the new entry in the right page */
+		dtInsertEntry(rp, skip, split->key, split->data, &rdtlck);
+	}
+
+ out:
+	/* hand the new right page and its extent back to the caller */
+	*rmpp = rmp;
+	*rpxdp = *pxd;
+
+	ip->i_blocks += LBLK2PBLK(sb, lengthPXD(pxd));
+
+	jEVENT(0, ("dtSplitPage: ip:0x%p sp:0x%p rp:0x%p\n", ip, sp, rp));
+	return 0;
+}
+
+
+/*
+ * dtExtendPage()
+ *
+ * function: extend 1st/only directory leaf page
+ *
+ * The page is given a new (larger) extent taken from split->pxdlist.
+ * If the new extent starts at the same address as the current one the
+ * page is extended in place; otherwise it is treated as a relocation
+ * and the old extent is recorded in a maplock for later free.
+ * The sorted entry table is copied to the start of the added area, the
+ * old table slots and the remaining new slots are put on the freelist,
+ * the new entry is inserted, and the parent/root entry is updated with
+ * the new extent descriptor.
+ *
+ * parameter:
+ * tid - transaction id passed to txLock()/txMaplock()/modify_index()
+ * ip - directory inode
+ * split - page to extend (split->mp), extent list (split->pxdlist)
+ * and the entry to insert (split->index/key/data)
+ * btstack - traversal stack; one frame is popped to find the parent
+ *
+ * return: 0 - success;
+ * errno - failure (parent page could not be read);
+ * return extended page pinned;
+ */
+static int dtExtendPage(tid_t tid,
+ struct inode *ip, dtsplit_t * split, btstack_t * btstack)
+{
+ struct super_block *sb = ip->i_sb;
+ int rc;
+ metapage_t *smp, *pmp, *mp;
+ dtpage_t *sp, *pp;
+ pxdlist_t *pxdlist;
+ pxd_t *pxd, *tpxd;
+ int xlen, xsize;
+ int newstblindex, newstblsize;
+ int oldstblindex, oldstblsize;
+ int fsi, last;
+ dtslot_t *f;
+ btframe_t *parent;
+ int n;
+ dtlock_t *dtlck;
+ s64 xaddr, txaddr;
+ tlock_t *tlck;
+ pxdlock_t *pxdlock;
+ lv_t *lv;
+ uint type;
+ ldtentry_t *ldtentry;
+ u8 *stbl;
+
+ /* get page to extend */
+ smp = split->mp;
+ sp = DT_PAGE(ip, smp);
+
+ /* get parent/root page */
+ parent = BT_POP(btstack);
+ DT_GETPAGE(ip, parent->bn, pmp, PSIZE, pp, rc);
+ if (rc)
+ return (rc);
+
+ /*
+ * extend the extent
+ * (the new extent is the next unused entry of the caller's pxdlist)
+ */
+ pxdlist = split->pxdlist;
+ pxd = &pxdlist->pxd[pxdlist->npxd];
+ pxdlist->npxd++;
+
+ xaddr = addressPXD(pxd);
+ tpxd = &sp->header.self;
+ txaddr = addressPXD(tpxd);
+ /* in-place extension: new extent starts at the current address */
+ if (xaddr == txaddr) {
+ type = tlckEXTEND;
+ }
+ /* relocation */
+ else {
+ type = tlckNEW;
+
+ /* save moved extent descriptor for later free */
+ tlck = txMaplock(tid, ip, tlckDTREE | tlckRELOCATE);
+ pxdlock = (pxdlock_t *) & tlck->lock;
+ pxdlock->flag = mlckFREEPXD;
+ pxdlock->pxd = sp->header.self;
+ pxdlock->index = 1;
+
+ /*
+ * Update directory index table to reflect new page address
+ */
+ if (DO_INDEX(ip)) {
+ mp = 0;
+ stbl = DT_GETSTBL(sp);
+ for (n = 0; n < sp->header.nextindex; n++) {
+ ldtentry =
+ (ldtentry_t *) & sp->slot[stbl[n]];
+ modify_index(tid, ip,
+ le32_to_cpu(ldtentry->index),
+ xaddr, n, &mp);
+ }
+ if (mp)
+ release_metapage(mp);
+ }
+ }
+
+ /*
+ * extend the page
+ */
+ sp->header.self = *pxd;
+
+ jEVENT(0,
+ ("dtExtendPage: ip:0x%p smp:0x%p sp:0x%p\n", ip, smp, sp));
+
+ BT_MARK_DIRTY(smp, ip);
+ /*
+ * acquire a transaction lock on the extended/leaf page
+ */
+ tlck = txLock(tid, ip, smp, tlckDTREE | type);
+ dtlck = (dtlock_t *) & tlck->lock;
+ lv = (lv_t *) & dtlck->lv[0];
+
+ /* update buffer extent descriptor of extended page */
+ xlen = lengthPXD(pxd);
+ xsize = xlen << JFS_SBI(sb)->l2bsize;
+#ifdef _STILL_TO_PORT
+ bmSetXD(smp, xaddr, xsize);
+#endif /* _STILL_TO_PORT */
+
+ /*
+ * copy old stbl to new stbl at start of extended area
+ * (the new stbl begins at the old maxslot, i.e. the first new slot;
+ * one byte is copied per existing entry)
+ */
+ oldstblindex = sp->header.stblindex;
+ oldstblsize = (sp->header.maxslot + 31) >> L2DTSLOTSIZE;
+ newstblindex = sp->header.maxslot;
+ n = xsize >> L2DTSLOTSIZE;
+ newstblsize = (n + 31) >> L2DTSLOTSIZE;
+ memcpy(&sp->slot[newstblindex], &sp->slot[oldstblindex],
+ sp->header.nextindex);
+
+ /*
+ * in-line extension: linelock old area of extended page
+ */
+ if (type == tlckEXTEND) {
+ /* linelock header */
+ lv->offset = 0;
+ lv->length = 1;
+ dtlck->index++;
+ lv++;
+
+ /* linelock new stbl of extended page */
+ lv->offset = newstblindex;
+ lv->length = newstblsize;
+ }
+ /*
+ * relocation: linelock whole relocated area
+ */
+ else {
+ lv->offset = 0;
+ lv->length = sp->header.maxslot + newstblsize;
+ }
+
+ /* account for the linelock filled in by either branch above */
+ dtlck->index++;
+
+ sp->header.maxslot = n;
+ sp->header.stblindex = newstblindex;
+ /* sp->header.nextindex remains the same */
+
+ /*
+ * add old stbl region at head of freelist
+ * (each old stbl slot is pushed in front of the current head)
+ */
+ fsi = oldstblindex;
+ f = &sp->slot[fsi];
+ last = sp->header.freelist;
+ for (n = 0; n < oldstblsize; n++, fsi++, f++) {
+ f->next = last;
+ last = fsi;
+ }
+ sp->header.freelist = last;
+ sp->header.freecnt += oldstblsize;
+
+ /*
+ * append free region of newly extended area at tail of freelist
+ */
+ /* init free region of newly extended area; n = its first slot */
+ fsi = n = newstblindex + newstblsize;
+ f = &sp->slot[fsi];
+ for (fsi++; fsi < sp->header.maxslot; f++, fsi++)
+ f->next = fsi;
+ f->next = -1;
+
+ /* append new free region at tail of old freelist */
+ fsi = sp->header.freelist;
+ if (fsi == -1)
+ sp->header.freelist = n;
+ else {
+ /* walk to the last slot of the existing freelist */
+ do {
+ f = &sp->slot[fsi];
+ fsi = f->next;
+ } while (fsi != -1);
+
+ f->next = n;
+ }
+
+ sp->header.freecnt += sp->header.maxslot - n;
+
+ /*
+ * insert the new entry
+ */
+ dtInsertEntry(sp, split->index, split->key, split->data, &dtlck);
+
+ BT_MARK_DIRTY(pmp, ip);
+ /*
+ * linelock any freeslots residing in old extent
+ */
+ if (type == tlckEXTEND) {
+ n = sp->header.maxslot >> 2;
+ if (sp->header.freelist < n)
+ dtLinelockFreelist(sp, n, &dtlck);
+ }
+
+ /*
+ * update parent entry on the parent/root page
+ */
+ /*
+ * acquire a transaction lock on the parent/root page
+ */
+ tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY);
+ dtlck = (dtlock_t *) & tlck->lock;
+ lv = (lv_t *) & dtlck->lv[dtlck->index];
+
+ /* linelock parent entry - 1st slot */
+ lv->offset = 1;
+ lv->length = 1;
+ dtlck->index++;
+
+ /* update the parent pxd for page extension */
+ tpxd = (pxd_t *) & pp->slot[1];
+ *tpxd = *pxd;
+
+ /* Since the directory might have an EA and/or ACL associated with it
+ * we need to make sure we take that into account when setting
+ * i_blocks: it is recomputed from the new extent length plus the
+ * EA/ACL extent lengths (when those are stored as extents).
+ */
+ ip->i_blocks = LBLK2PBLK(ip->i_sb, xlen +
+ ((JFS_IP(ip)->ea.flag & DXD_EXTENT) ?
+ lengthDXD(&JFS_IP(ip)->ea) : 0) +
+ ((JFS_IP(ip)->acl.flag & DXD_EXTENT) ?
+ lengthDXD(&JFS_IP(ip)->acl) : 0));
+
+ jEVENT(0,
+ ("dtExtendPage: ip:0x%p smp:0x%p sp:0x%p\n", ip, smp, sp));
+
+
+ /* unpin the parent; the extended page stays pinned for the caller */
+ DT_PUTPAGE(pmp);
+ return 0;
+}
+
+
+/*
+ * dtSplitRoot()
+ *
+ * function:
+ *	split the full root page into
+ *	original/root/split page and new right page
+ *	i.e., root remains fixed in tree anchor (inode) and
+ *	the root is copied to a single new right child page
+ *	since root page << non-root page, and
+ *	the split root page contains a single entry for the
+ *	new right child page.
+ *
+ * parameter:
+ *	tid	- transaction id passed to txLock()/modify_index()
+ *	ip	- directory inode (the root lives in JFS_IP(ip)->i_dtroot)
+ *	split	- split descriptor: root metapage (split->mp), extent list
+ *		  supplying the child page (split->pxdlist) and the entry
+ *		  to insert (split->index/key/data)
+ *	rmpp	- out: metapage of the new child page
+ *
+ * return:	0 - success;
+ *		EIO - the new child page could not be obtained;
+ *		return new page pinned;
+ */
+static int dtSplitRoot(tid_t tid,
+		       struct inode *ip, dtsplit_t * split, metapage_t ** rmpp)
+{
+	struct super_block *sb = ip->i_sb;
+	metapage_t *smp;
+	dtroot_t *sp;
+	metapage_t *rmp;
+	dtpage_t *rp;
+	s64 rbn;
+	int xlen;
+	int xsize;
+	dtslot_t *f;
+	s8 *stbl;
+	int fsi, stblsize, n;
+	idtentry_t *s;
+	pxd_t *ppxd;
+	pxdlist_t *pxdlist;
+	pxd_t *pxd;
+	dtlock_t *dtlck;
+	tlock_t *tlck;
+	lv_t *lv;
+
+	/* get split root page */
+	smp = split->mp;
+	sp = &JFS_IP(ip)->i_dtroot;
+
+	/*
+	 * allocate/initialize a single (right) child page
+	 *
+	 * N.B. at first split, a one (or two) block to fit new entry
+	 * is allocated; at subsequent split, a full page is allocated;
+	 */
+	pxdlist = split->pxdlist;
+	pxd = &pxdlist->pxd[pxdlist->npxd];
+	pxdlist->npxd++;
+	rbn = addressPXD(pxd);
+	xlen = lengthPXD(pxd);
+	xsize = xlen << JFS_SBI(sb)->l2bsize;
+	rmp = get_metapage(ip, rbn, xsize, 1);
+	/* same failure handling as dtSplitPage(): never dereference NULL */
+	if (rmp == NULL)
+		return EIO;
+	rp = rmp->data;
+
+	BT_MARK_DIRTY(rmp, ip);
+	/*
+	 * acquire a transaction lock on the new right page
+	 */
+	tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW);
+	dtlck = (dtlock_t *) & tlck->lock;
+
+	rp->header.flag =
+	    (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL;
+	rp->header.self = *pxd;
+
+	/* initialize sibling pointers */
+	rp->header.next = 0;
+	rp->header.prev = 0;
+
+	/*
+	 * move in-line root page into new right page extent
+	 */
+	/* linelock header + copied entries + new stbl (1st slot) in new page */
+	ASSERT(dtlck->index == 0);
+	lv = (lv_t *) & dtlck->lv[0];
+	lv->offset = 0;
+	lv->length = 10;	/* 1 + 8 + 1 */
+	dtlck->index++;
+
+	n = xsize >> L2DTSLOTSIZE;
+	rp->header.maxslot = n;
+	stblsize = (n + 31) >> L2DTSLOTSIZE;
+
+	/* copy old stbl to new stbl at start of extended area */
+	rp->header.stblindex = DTROOTMAXSLOT;
+	stbl = (s8 *) & rp->slot[DTROOTMAXSLOT];
+	memcpy(stbl, sp->header.stbl, sp->header.nextindex);
+	rp->header.nextindex = sp->header.nextindex;
+
+	/* copy old data area to start of new data area */
+	memcpy(&rp->slot[1], &sp->slot[1], IDATASIZE);
+
+	/*
+	 * append free region of newly extended area at tail of freelist
+	 */
+	/* init free region of newly extended area; n = its first slot */
+	fsi = n = DTROOTMAXSLOT + stblsize;
+	f = &rp->slot[fsi];
+	for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
+		f->next = fsi;
+	f->next = -1;
+
+	/* append new free region at tail of old freelist */
+	fsi = sp->header.freelist;
+	if (fsi == -1)
+		rp->header.freelist = n;
+	else {
+		rp->header.freelist = fsi;
+
+		/* walk the freelist copied from the root to its last slot */
+		do {
+			f = &rp->slot[fsi];
+			fsi = f->next;
+		} while (fsi != -1);
+
+		f->next = n;
+	}
+
+	rp->header.freecnt = sp->header.freecnt + rp->header.maxslot - n;
+
+	/*
+	 * Update directory index table for entries now in right page
+	 *
+	 * (this pass was previously present twice, back to back; one pass
+	 * over the entries is sufficient since the second one rewrote the
+	 * same <rbn, n> values)
+	 */
+	if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) {
+		metapage_t *mp = 0;
+		ldtentry_t *ldtentry;
+
+		stbl = DT_GETSTBL(rp);
+		for (n = 0; n < rp->header.nextindex; n++) {
+			ldtentry = (ldtentry_t *) & rp->slot[stbl[n]];
+			modify_index(tid, ip, le32_to_cpu(ldtentry->index),
+				     rbn, n, &mp);
+		}
+		if (mp)
+			release_metapage(mp);
+	}
+	/*
+	 * insert the new entry into the new right/child page
+	 * (skip index in the new right page will not change)
+	 */
+	dtInsertEntry(rp, split->index, split->key, split->data, &dtlck);
+
+	/*
+	 * reset parent/root page
+	 *
+	 * set the 1st entry offset to 0, which force the left-most key
+	 * at any level of the tree to be less than any search key.
+	 *
+	 * The btree comparison code guarantees that the left-most key on any
+	 * level of the tree is never used, so it doesn't need to be filled in.
+	 */
+	BT_MARK_DIRTY(smp, ip);
+	/*
+	 * acquire a transaction lock on the root page (in-memory inode)
+	 */
+	tlck = txLock(tid, ip, smp, tlckDTREE | tlckNEW | tlckBTROOT);
+	dtlck = (dtlock_t *) & tlck->lock;
+
+	/* linelock root */
+	ASSERT(dtlck->index == 0);
+	lv = (lv_t *) & dtlck->lv[0];
+	lv->offset = 0;
+	lv->length = DTROOTMAXSLOT;
+	dtlck->index++;
+
+	/* update page header of root */
+	if (sp->header.flag & BT_LEAF) {
+		sp->header.flag &= ~BT_LEAF;
+		sp->header.flag |= BT_INTERNAL;
+	}
+
+	/* init the first entry: it points at the new child page */
+	s = (idtentry_t *) & sp->slot[DTENTRYSTART];
+	ppxd = (pxd_t *) s;
+	*ppxd = *pxd;
+	s->next = -1;
+	s->namlen = 0;
+
+	stbl = sp->header.stbl;
+	stbl[0] = DTENTRYSTART;
+	sp->header.nextindex = 1;
+
+	/* init freelist */
+	fsi = DTENTRYSTART + 1;
+	f = &sp->slot[fsi];
+
+	/* init free region of remaining area */
+	for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++)
+		f->next = fsi;
+	f->next = -1;
+
+	sp->header.freelist = DTENTRYSTART + 1;
+	sp->header.freecnt = DTROOTMAXSLOT - (DTENTRYSTART + 1);
+
+	*rmpp = rmp;
+
+	ip->i_blocks += LBLK2PBLK(ip->i_sb, lengthPXD(pxd));
+	return 0;
+}
+
+
+/*
+ * dtDelete()
+ *
+ * function: delete the entry(s) referenced by a key.
+ *
+ * The entry is located with dtSearch().  When the directory index is in
+ * use (DO_INDEX), the entry's index table slot is released via
+ * free_index() together with the index of the entry that follows it.
+ * If the entry is the only one on its page the page is removed via
+ * dtDeleteUp(); otherwise the entry is deleted in place and the index
+ * table is updated for the entries that shifted in the stbl.
+ *
+ * parameter:
+ * tid - transaction id passed to txLock()/free_index()/modify_index()
+ * ip - directory inode
+ * key - name of the entry to delete (passed to dtSearch())
+ * ino - passed through to dtSearch()
+ * flag - passed through to dtSearch()
+ *
+ * return: 0 - success;
+ * errno - failure from dtSearch() or dtDeleteUp();
+ */
+int dtDelete(tid_t tid,
+ struct inode *ip, component_t * key, ino_t * ino, int flag)
+{
+ int rc = 0;
+ s64 bn;
+ metapage_t *mp, *imp;
+ dtpage_t *p;
+ int index;
+ btstack_t btstack;
+ dtlock_t *dtlck;
+ tlock_t *tlck;
+ lv_t *lv;
+ int i;
+ ldtentry_t *ldtentry;
+ u8 *stbl;
+ u32 table_index, next_index;
+ metapage_t *nmp;
+ dtpage_t *np;
+
+ /*
+ * search for the entry to delete:
+ *
+ * dtSearch() returns (leaf page pinned, index at which to delete).
+ */
+ if ((rc = dtSearch(ip, key, ino, &btstack, flag)))
+ return rc;
+
+ /* retrieve search result */
+ DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+
+ /*
+ * We need to find the index of the next entry and put it into the
+ * directory index table in order to resume a readdir from this
+ * entry.
+ */
+ if (DO_INDEX(ip)) {
+ stbl = DT_GETSTBL(p);
+ ldtentry = (ldtentry_t *) & p->slot[stbl[index]];
+ table_index = le32_to_cpu(ldtentry->index);
+ if (index == (p->header.nextindex - 1)) {
+ /*
+ * Last entry in this leaf page
+ */
+ if ((p->header.flag & BT_ROOT)
+ || (p->header.next == 0))
+ next_index = -1;
+ else {
+ /* Read next leaf page */
+ DT_GETPAGE(ip, le64_to_cpu(p->header.next),
+ nmp, PSIZE, np, rc);
+ /*
+ * NOTE(review): on failure rc stays non-zero; in the
+ * in-place delete path below it is not reset, so it
+ * looks like this read error is what gets returned
+ * even though the entry is deleted - confirm intent.
+ */
+ if (rc)
+ next_index = -1;
+ else {
+ /* next entry is the first one of the next leaf */
+ stbl = DT_GETSTBL(np);
+ ldtentry =
+ (ldtentry_t *) & np->
+ slot[stbl[0]];
+ next_index =
+ le32_to_cpu(ldtentry->index);
+ DT_PUTPAGE(nmp);
+ }
+ }
+ } else {
+ /* next entry is on the same page */
+ ldtentry =
+ (ldtentry_t *) & p->slot[stbl[index + 1]];
+ next_index = le32_to_cpu(ldtentry->index);
+ }
+ free_index(tid, ip, table_index, next_index);
+ }
+ /*
+ * the leaf page becomes empty, delete the page
+ */
+ if (p->header.nextindex == 1) {
+ /* delete empty page */
+ rc = dtDeleteUp(tid, ip, mp, p, &btstack);
+ }
+ /*
+ * the leaf page has other entries remaining:
+ *
+ * delete the entry from the leaf page.
+ */
+ else {
+ BT_MARK_DIRTY(mp, ip);
+ /*
+ * acquire a transaction lock on the leaf page
+ */
+ tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
+ dtlck = (dtlock_t *) & tlck->lock;
+
+ /*
+ * Do not assume that dtlck->index will be zero. During a
+ * rename within a directory, this transaction may have
+ * modified this page already when adding the new entry.
+ */
+
+ /* linelock header */
+ if (dtlck->index >= dtlck->maxcnt)
+ dtlck = (dtlock_t *) txLinelock(dtlck);
+ lv = (lv_t *) & dtlck->lv[dtlck->index];
+ lv->offset = 0;
+ lv->length = 1;
+ dtlck->index++;
+
+ /* linelock stbl of non-root leaf page */
+ if (!(p->header.flag & BT_ROOT)) {
+ if (dtlck->index >= dtlck->maxcnt)
+ dtlck = (dtlock_t *) txLinelock(dtlck);
+ lv = (lv_t *) & dtlck->lv[dtlck->index];
+ /* cover the stbl slots from the deleted index to the end */
+ i = index >> L2DTSLOTSIZE;
+ lv->offset = p->header.stblindex + i;
+ lv->length =
+ ((p->header.nextindex - 1) >> L2DTSLOTSIZE) -
+ i + 1;
+ dtlck->index++;
+ }
+
+ /* free the leaf entry */
+ dtDeleteEntry(p, index, &dtlck);
+
+ /*
+ * Update directory index table for entries moved in stbl
+ */
+ if (DO_INDEX(ip) && index < p->header.nextindex) {
+ imp = 0;
+ stbl = DT_GETSTBL(p);
+ for (i = index; i < p->header.nextindex; i++) {
+ ldtentry =
+ (ldtentry_t *) & p->slot[stbl[i]];
+ modify_index(tid, ip,
+ le32_to_cpu(ldtentry->index),
+ bn, i, &imp);
+ }
+ if (imp)
+ release_metapage(imp);
+ }
+
+ DT_PUTPAGE(mp);
+ }
+
+ return rc;
+}
+
+
+/*
+ * dtDeleteUp()
+ *
+ * function:
+ *	free empty pages as propagating deletion up the tree
+ *
+ * parameter:
+ *	tid	- transaction id passed to txLock()/txMaplock()/dtRelink()
+ *	ip	- directory inode
+ *	fmp	- metapage of the leaf page that has become empty
+ *	fp	- the empty leaf page itself
+ *	btstack	- traversal stack holding the ancestors of fp
+ *
+ * return:	0 - success;
+ *		errno - failure from dtRelink() or from reading a parent;
+ *		no page is left pinned by this function on any path.
+ */
+static int dtDeleteUp(tid_t tid, struct inode *ip,
+		      metapage_t * fmp, dtpage_t * fp, btstack_t * btstack)
+{
+	int rc = 0;
+	metapage_t *mp;
+	dtpage_t *p;
+	int index, nextindex;
+	int xlen;
+	btframe_t *parent;
+	dtlock_t *dtlck;
+	tlock_t *tlck;
+	lv_t *lv;
+	pxdlock_t *pxdlock;
+	int i;
+
+	/*
+	 *      keep the root leaf page which has become empty
+	 */
+	if (BT_IS_ROOT(fmp)) {
+		/*
+		 * reset the root
+		 *
+		 * dtInitRoot() acquires txlock on the root
+		 */
+		dtInitRoot(tid, ip, PARENT(ip));
+
+		DT_PUTPAGE(fmp);
+
+		return 0;
+	}
+
+	/*
+	 *      free the non-root leaf page
+	 */
+	/*
+	 * acquire a transaction lock on the page
+	 *
+	 * write FREEXTENT|NOREDOPAGE log record
+	 * N.B. linelock is overlaid as freed extent descriptor, and
+	 * the buffer page is freed;
+	 */
+	tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE);
+	pxdlock = (pxdlock_t *) & tlck->lock;
+	pxdlock->flag = mlckFREEPXD;
+	pxdlock->pxd = fp->header.self;
+	pxdlock->index = 1;
+
+	/* update sibling pointers */
+	if ((rc = dtRelink(tid, ip, fp))) {
+		/* the leaf is still pinned here: unpin it before bailing */
+		DT_PUTPAGE(fmp);
+		return rc;
+	}
+
+	xlen = lengthPXD(&fp->header.self);
+	ip->i_blocks -= LBLK2PBLK(ip->i_sb, xlen);
+
+	/* free/invalidate its buffer page */
+	discard_metapage(fmp);
+
+	/*
+	 *      propagate page deletion up the directory tree
+	 *
+	 * If the delete from the parent page makes it empty,
+	 * continue all the way up the tree.
+	 * stop if the root page is reached (which is never deleted) or
+	 * if the entry deletion does not empty the page.
+	 */
+	while ((parent = BT_POP(btstack)) != NULL) {
+		/* pin the parent page <sp> */
+		DT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc);
+		if (rc)
+			return rc;
+
+		/*
+		 * free the extent of the child page deleted
+		 */
+		index = parent->index;
+
+		/*
+		 * delete the entry for the child page from parent
+		 */
+		nextindex = p->header.nextindex;
+
+		/*
+		 * the parent has the single entry being deleted:
+		 *
+		 * free the parent page which has become empty.
+		 */
+		if (nextindex == 1) {
+			/*
+			 * keep the root internal page which has become empty
+			 */
+			if (p->header.flag & BT_ROOT) {
+				/*
+				 * reset the root
+				 *
+				 * dtInitRoot() acquires txlock on the root
+				 */
+				dtInitRoot(tid, ip, PARENT(ip));
+
+				DT_PUTPAGE(mp);
+
+				return 0;
+			}
+			/*
+			 * free the parent page
+			 */
+			else {
+				/*
+				 * acquire a transaction lock on the page
+				 *
+				 * write FREEXTENT|NOREDOPAGE log record
+				 */
+				tlck =
+				    txMaplock(tid, ip,
+					      tlckDTREE | tlckFREE);
+				pxdlock = (pxdlock_t *) & tlck->lock;
+				pxdlock->flag = mlckFREEPXD;
+				pxdlock->pxd = p->header.self;
+				pxdlock->index = 1;
+
+				/* update sibling pointers */
+				if ((rc = dtRelink(tid, ip, p))) {
+					/* unpin the parent before bailing */
+					DT_PUTPAGE(mp);
+					return rc;
+				}
+
+				xlen = lengthPXD(&p->header.self);
+				ip->i_blocks -= LBLK2PBLK(ip->i_sb, xlen);
+
+				/* free/invalidate its buffer page */
+				discard_metapage(mp);
+
+				/* propagate up */
+				continue;
+			}
+		}
+
+		/*
+		 * the parent has other entries remaining:
+		 *
+		 * delete the router entry from the parent page.
+		 */
+		BT_MARK_DIRTY(mp, ip);
+		/*
+		 * acquire a transaction lock on the page
+		 *
+		 * action: router entry deletion
+		 */
+		tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
+		dtlck = (dtlock_t *) & tlck->lock;
+
+		/* linelock header */
+		if (dtlck->index >= dtlck->maxcnt)
+			dtlck = (dtlock_t *) txLinelock(dtlck);
+		lv = (lv_t *) & dtlck->lv[dtlck->index];
+		lv->offset = 0;
+		lv->length = 1;
+		dtlck->index++;
+
+		/* linelock stbl of non-root internal page */
+		if (!(p->header.flag & BT_ROOT)) {
+			/* next linelock slot, or a fresh linelock if full */
+			if (dtlck->index < dtlck->maxcnt)
+				lv++;
+			else {
+				dtlck = (dtlock_t *) txLinelock(dtlck);
+				lv = (lv_t *) & dtlck->lv[0];
+			}
+			i = index >> L2DTSLOTSIZE;
+			lv->offset = p->header.stblindex + i;
+			lv->length =
+			    ((p->header.nextindex - 1) >> L2DTSLOTSIZE) -
+			    i + 1;
+			dtlck->index++;
+		}
+
+		/* free the router entry */
+		dtDeleteEntry(p, index, &dtlck);
+
+		/* reset key of new leftmost entry of level (for consistency) */
+		if (index == 0 &&
+		    ((p->header.flag & BT_ROOT) || p->header.prev == 0))
+			dtTruncateEntry(p, 0, &dtlck);
+
+		/* unpin the parent page */
+		DT_PUTPAGE(mp);
+
+		/* exit propagation up */
+		break;
+	}
+
+	return 0;
+}
+
+
+/*
+ * NAME: dtRelocate()
+ *
+ * FUNCTION: relocate dtpage (internal or leaf) of directory;
+ * This function is mainly used by defragfs utility.
+ *
+ * PARAMETERS:
+ * tid - transaction id passed to txLock()/txMaplock()
+ * ip - directory inode
+ * lmxaddr - address of the leftmost page of the level searched by
+ * dtSearchNode() for the parent router entry
+ * opxd - extent descriptor (address and length) of the page to move
+ * nxaddr - new address for the page
+ *
+ * RETURN: 0 on success; otherwise the error from dtSearchNode() or
+ * from reading the target page or one of its siblings.
+ */
+int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
+ s64 nxaddr)
+{
+ int rc = 0;
+ metapage_t *mp, *pmp, *lmp, *rmp;
+ dtpage_t *p, *pp, *rp = 0, *lp= 0;
+ s64 bn;
+ int index;
+ btstack_t btstack;
+ pxd_t *pxd;
+ s64 oxaddr, nextbn, prevbn;
+ int xlen, xsize;
+ tlock_t *tlck;
+ dtlock_t *dtlck;
+ pxdlock_t *pxdlock;
+ s8 *stbl;
+ lv_t *lv;
+
+ oxaddr = addressPXD(opxd);
+ xlen = lengthPXD(opxd);
+
+ jEVENT(0, ("dtRelocate: lmxaddr:%Ld xaddr:%Ld:%Ld xlen:%d\n",
+ lmxaddr, oxaddr, nxaddr, xlen));
+
+ /*
+ * 1. get the internal parent dtpage covering
+ * router entry for the target page to be relocated;
+ */
+ rc = dtSearchNode(ip, lmxaddr, opxd, &btstack);
+ if (rc)
+ return rc;
+
+ /* retrieve search result */
+ DT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
+ jEVENT(0, ("dtRelocate: parent router entry validated.\n"));
+
+ /*
+ * 2. relocate the target dtpage
+ */
+ /* read in the target page from src extent */
+ DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc);
+ if (rc) {
+ /* release the pinned parent page */
+ DT_PUTPAGE(pmp);
+ return rc;
+ }
+
+ /*
+ * read in sibling pages if any to update sibling pointers;
+ * on failure every page pinned so far is released
+ */
+ rmp = NULL;
+ if (p->header.next) {
+ nextbn = le64_to_cpu(p->header.next);
+ DT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc);
+ if (rc) {
+ DT_PUTPAGE(mp);
+ DT_PUTPAGE(pmp);
+ return (rc);
+ }
+ }
+
+ lmp = NULL;
+ if (p->header.prev) {
+ prevbn = le64_to_cpu(p->header.prev);
+ DT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc);
+ if (rc) {
+ DT_PUTPAGE(mp);
+ DT_PUTPAGE(pmp);
+ if (rmp)
+ DT_PUTPAGE(rmp);
+ return (rc);
+ }
+ }
+
+ /* at this point, all dtpages to be updated are in memory */
+
+ /*
+ * update sibling pointers of sibling dtpages if any;
+ */
+ if (lmp) {
+ tlck = txLock(tid, ip, lmp, tlckDTREE | tlckRELINK);
+ dtlck = (dtlock_t *) & tlck->lock;
+ /* linelock header */
+ ASSERT(dtlck->index == 0);
+ lv = (lv_t *) & dtlck->lv[0];
+ lv->offset = 0;
+ lv->length = 1;
+ dtlck->index++;
+
+ /* left sibling now points forward to the new address */
+ lp->header.next = cpu_to_le64(nxaddr);
+ DT_PUTPAGE(lmp);
+ }
+
+ if (rmp) {
+ tlck = txLock(tid, ip, rmp, tlckDTREE | tlckRELINK);
+ dtlck = (dtlock_t *) & tlck->lock;
+ /* linelock header */
+ ASSERT(dtlck->index == 0);
+ lv = (lv_t *) & dtlck->lv[0];
+ lv->offset = 0;
+ lv->length = 1;
+ dtlck->index++;
+
+ /* right sibling now points back to the new address */
+ rp->header.prev = cpu_to_le64(nxaddr);
+ DT_PUTPAGE(rmp);
+ }
+
+ /*
+ * update the target dtpage to be relocated
+ *
+ * write LOG_REDOPAGE of LOG_NEW type for dst page
+ * for the whole target page (logredo() will apply
+ * after image and update bmap for allocation of the
+ * dst extent), and update bmap for allocation of
+ * the dst extent;
+ */
+ tlck = txLock(tid, ip, mp, tlckDTREE | tlckNEW);
+ dtlck = (dtlock_t *) & tlck->lock;
+ /* linelock header */
+ ASSERT(dtlck->index == 0);
+ lv = (lv_t *) & dtlck->lv[0];
+
+ /* update the self address in the dtpage header */
+ pxd = &p->header.self;
+ PXDaddress(pxd, nxaddr);
+
+ /* the dst page is the same as the src page, i.e.,
+ * linelock for afterimage of the whole page;
+ */
+ lv->offset = 0;
+ lv->length = p->header.maxslot;
+ dtlck->index++;
+
+ /* update the buffer extent descriptor of the dtpage */
+ xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize;
+#ifdef _STILL_TO_PORT
+ bmSetXD(mp, nxaddr, xsize);
+#endif /* _STILL_TO_PORT */
+ /* unpin the relocated page */
+ DT_PUTPAGE(mp);
+ jEVENT(0, ("dtRelocate: target dtpage relocated.\n"));
+
+ /* the moved extent is dtpage, then a LOG_NOREDOPAGE log rec
+ * needs to be written (in logredo(), the LOG_NOREDOPAGE log rec
+ * will also force a bmap update ).
+ */
+
+ /*
+ * 3. acquire maplock for the source extent to be freed;
+ */
+ /* for dtpage relocation, write a LOG_NOREDOPAGE record
+ * for the source dtpage (logredo() will init NoRedoPage
+ * filter and will also update bmap for free of the source
+ * dtpage), and update bmap for free of the source dtpage;
+ */
+ tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE);
+ pxdlock = (pxdlock_t *) & tlck->lock;
+ pxdlock->flag = mlckFREEPXD;
+ PXDaddress(&pxdlock->pxd, oxaddr);
+ PXDlength(&pxdlock->pxd, xlen);
+ pxdlock->index = 1;
+
+ /*
+ * 4. update the parent router entry for relocation;
+ *
+ * acquire tlck for the parent entry covering the target dtpage;
+ * write LOG_REDOPAGE to apply after image only;
+ */
+ jEVENT(0, ("dtRelocate: update parent router entry.\n"));
+ tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY);
+ dtlck = (dtlock_t *) & tlck->lock;
+ lv = (lv_t *) & dtlck->lv[dtlck->index];
+
+ /* update the PXD with the new address */
+ stbl = DT_GETSTBL(pp);
+ pxd = (pxd_t *) & pp->slot[stbl[index]];
+ PXDaddress(pxd, nxaddr);
+ /* linelock only the slot holding the router entry */
+ lv->offset = stbl[index];
+ lv->length = 1;
+ dtlck->index++;
+
+ /* unpin the parent dtpage */
+ DT_PUTPAGE(pmp);
+
+ return rc;
+}
+
+
+/*
+ * NAME:	dtSearchNode()
+ *
+ * FUNCTION:	Search for an dtpage containing a specified address
+ *		This function is mainly used by defragfs utility.
+ *
+ * PARAMETERS:
+ *	ip	- directory inode
+ *	lmxaddr	- address of the leftmost page of the level to search
+ *		  (0 selects the root)
+ *	kpxd	- extent descriptor (address and length) to look for
+ *	btstack	- out: top frame receives <bn, index, mp> of the match
+ *
+ * RETURN:	0 - router entry found;
+ *		ESTALE - level or entry not found;
+ *		other errno - a page could not be read.
+ *
+ * NOTE:	Search result on stack, the found page is pinned at exit.
+ *		The result page must be an internal dtpage.
+ *		lmxaddr give the address of the left most page of the
+ *		dtree level, in which the required dtpage resides.
+ *		On every error return no page is left pinned.
+ */
+static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd,
+			btstack_t * btstack)
+{
+	int rc = 0;
+	s64 bn;
+	metapage_t *mp;
+	dtpage_t *p;
+	int psize = 288;	/* initial in-line directory */
+	s8 *stbl;
+	int i;
+	pxd_t *pxd;
+	btframe_t *btsp;
+
+	BT_CLR(btstack);	/* reset stack */
+
+	/*
+	 *      descend tree to the level with specified leftmost page
+	 *
+	 *  by convention, root bn = 0.
+	 */
+	for (bn = 0;;) {
+		/* get/pin the page to search */
+		DT_GETPAGE(ip, bn, mp, psize, p, rc);
+		if (rc)
+			return rc;
+
+		/* does the xaddr of leftmost page of the level
+		 * match the level search key ?
+		 */
+		if (p->header.flag & BT_ROOT) {
+			if (lmxaddr == 0)
+				break;
+		} else if (addressPXD(&p->header.self) == lmxaddr)
+			break;
+
+		/*
+		 * descend down to leftmost child page
+		 */
+		if (p->header.flag & BT_LEAF) {
+			/*
+			 * reached a leaf without finding the level:
+			 * unpin the page before giving up, as the other
+			 * ESTALE exit below does.
+			 */
+			DT_PUTPAGE(mp);
+			return ESTALE;
+		}
+
+		/* get the leftmost entry */
+		stbl = DT_GETSTBL(p);
+		pxd = (pxd_t *) & p->slot[stbl[0]];
+
+		/* get the child page block address */
+		bn = addressPXD(pxd);
+		psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize;
+		/* unpin the parent page */
+		DT_PUTPAGE(mp);
+	}
+
+	/*
+	 *      search each page at the current level
+	 */
+      loop:
+	stbl = DT_GETSTBL(p);
+	for (i = 0; i < p->header.nextindex; i++) {
+		pxd = (pxd_t *) & p->slot[stbl[i]];
+
+		/* found the specified router entry */
+		if (addressPXD(pxd) == addressPXD(kpxd) &&
+		    lengthPXD(pxd) == lengthPXD(kpxd)) {
+			/* report the match; the page stays pinned */
+			btsp = btstack->top;
+			btsp->bn = bn;
+			btsp->index = i;
+			btsp->mp = mp;
+
+			return 0;
+		}
+	}
+
+	/* get the right sibling page if any */
+	if (p->header.next)
+		bn = le64_to_cpu(p->header.next);
+	else {
+		DT_PUTPAGE(mp);
+		return ESTALE;
+	}
+
+	/* unpin current page */
+	DT_PUTPAGE(mp);
+
+	/* get the right sibling page */
+	DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+	if (rc)
+		return rc;
+
+	goto loop;
+}
+
+
+/*
+ * dtRelink()
+ *
+ * function:
+ *	unlink a page that is about to be freed from its level's sibling
+ *	chain: the right neighbour's prev pointer and the left
+ *	neighbour's next pointer are made to skip over it.
+ *
+ * parameter:
+ *	tid	- transaction id passed to txLock()
+ *	ip	- directory inode
+ *	p	- page to be freed (only its next/prev pointers are read)
+ *
+ * return:	0 - success;
+ *		errno - a neighbouring page could not be read;
+ */
+static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p)
+{
+	s64 nextbn = le64_to_cpu(p->header.next);
+	s64 prevbn = le64_to_cpu(p->header.prev);
+	metapage_t *sib_mp;
+	dtpage_t *sib;
+	tlock_t *tlck;
+	dtlock_t *dtlck;
+	lv_t *hdr_lv;
+	int rc;
+
+	/* right neighbour: its prev pointer must skip the freed page */
+	if (nextbn) {
+		DT_GETPAGE(ip, nextbn, sib_mp, PSIZE, sib, rc);
+		if (rc)
+			return rc;
+
+		BT_MARK_DIRTY(sib_mp, ip);
+
+		/* transaction lock on the next page (prev pointer update) */
+		tlck = txLock(tid, ip, sib_mp, tlckDTREE | tlckRELINK);
+		jEVENT(0,
+		       ("dtRelink nextbn: tlck = 0x%p, ip = 0x%p, mp=0x%p\n",
+			tlck, ip, sib_mp));
+		dtlck = (dtlock_t *) & tlck->lock;
+
+		/* linelock the page header, chaining a new linelock if full */
+		if (dtlck->index >= dtlck->maxcnt)
+			dtlck = (dtlock_t *) txLinelock(dtlck);
+		hdr_lv = (lv_t *) & dtlck->lv[dtlck->index];
+		dtlck->index++;
+		hdr_lv->length = 1;
+		hdr_lv->offset = 0;
+
+		sib->header.prev = cpu_to_le64(prevbn);
+		DT_PUTPAGE(sib_mp);
+	}
+
+	/* left neighbour: its next pointer must skip the freed page */
+	if (prevbn) {
+		DT_GETPAGE(ip, prevbn, sib_mp, PSIZE, sib, rc);
+		if (rc)
+			return rc;
+
+		BT_MARK_DIRTY(sib_mp, ip);
+
+		/* transaction lock on the prev page (next pointer update) */
+		tlck = txLock(tid, ip, sib_mp, tlckDTREE | tlckRELINK);
+		jEVENT(0,
+		       ("dtRelink prevbn: tlck = 0x%p, ip = 0x%p, mp=0x%p\n",
+			tlck, ip, sib_mp));
+		dtlck = (dtlock_t *) & tlck->lock;
+
+		/* linelock the page header, chaining a new linelock if full */
+		if (dtlck->index >= dtlck->maxcnt)
+			dtlck = (dtlock_t *) txLinelock(dtlck);
+		hdr_lv = (lv_t *) & dtlck->lv[dtlck->index];
+		dtlck->index++;
+		hdr_lv->length = 1;
+		hdr_lv->offset = 0;
+
+		sib->header.next = cpu_to_le64(nextbn);
+		DT_PUTPAGE(sib_mp);
+	}
+
+	return 0;
+}
+
+
+/*
+ * dtInitRoot()
+ *
+ * initialize directory root (inline in inode)
+ *
+ * parameter:
+ * tid - transaction id handed to xtTruncate() and txLock()
+ * ip - directory inode whose in-inode dtree root is (re)initialized
+ * idotdot - inode number stored in the root header as the '..' entry
+ *
+ * On return the root is flagged DXD_INDEX | BT_ROOT | BT_LEAF, holds no
+ * entries (nextindex = 0), and slots 1 .. DTROOTMAXSLOT-1 are chained on
+ * its free list.
+ */
+void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot)
+{
+ struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+ dtroot_t *p;
+ int fsi;
+ dtslot_t *f;
+ tlock_t *tlck;
+ dtlock_t *dtlck;
+ lv_t *lv;
+ u16 xflag_save;
+
+ /*
+ * If this was previously an non-empty directory, we need to remove
+ * the old directory table.
+ */
+ if (DO_INDEX(ip)) {
+ /* index table has outgrown the inline area: truncate its xtree */
+ if (jfs_ip->next_index > (MAX_INLINE_DIRTABLE_ENTRY + 1)) {
+ tblock_t *tblk = tid_to_tblock(tid);
+ /*
+ * We're playing games with the tid's xflag. If
+ * we're removing a regular file, the file's xtree
+ * is committed with COMMIT_PMAP, but we always
+ * commit the directories xtree with COMMIT_PWMAP.
+ */
+ xflag_save = tblk->xflag;
+ tblk->xflag = 0;
+ /*
+ * xtTruncate isn't guaranteed to fully truncate
+ * the xtree. The caller needs to check i_size
+ * after committing the transaction to see if
+ * additional truncation is needed. The
+ * COMMIT_Stale flag tells caller that we
+ * initiated the truncation.
+ */
+ xtTruncate(tid, ip, 0, COMMIT_PWMAP);
+ set_cflag(COMMIT_Stale, ip);
+
+ /* restore the transaction's original xflag */
+ tblk->xflag = xflag_save;
+ /*
+ * Tells jfs_metapage code that the metadata pages
+ * for the index table are no longer useful, and
+ * remove them from page cache.
+ */
+ invalidate_inode_metapages(ip);
+ } else
+ ip->i_size = 1;
+
+ /*
+ * NOTE(review): presumably indexes 0 and 1 stay reserved for
+ * "." and ".." (jfs_readdir treats them as special cases), so
+ * the first real entry gets index 2 - confirm.
+ */
+ jfs_ip->next_index = 2;
+ } else
+ ip->i_size = IDATASIZE;
+
+ /*
+ * acquire a transaction lock on the root
+ *
+ * action: directory initialization;
+ */
+ tlck = txLock(tid, ip, (metapage_t *) & jfs_ip->bxflag,
+ tlckDTREE | tlckENTRY | tlckBTROOT);
+ dtlck = (dtlock_t *) & tlck->lock;
+
+ /* linelock root */
+ ASSERT(dtlck->index == 0);
+ lv = (lv_t *) & dtlck->lv[0];
+ lv->offset = 0;
+ lv->length = DTROOTMAXSLOT;
+ dtlck->index++;
+
+ p = &jfs_ip->i_dtroot;
+
+ p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF;
+
+ p->header.nextindex = 0;
+
+ /* init freelist */
+ fsi = 1;
+ f = &p->slot[fsi];
+
+ /* init data area of root */
+ /* chain slot 1 -> 2 -> ... -> DTROOTMAXSLOT-1, then terminate with -1 */
+ for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++)
+ f->next = fsi;
+ f->next = -1;
+
+ p->header.freelist = 1;
+ /* NOTE(review): hard-coded; presumably DTROOTMAXSLOT - 1 - confirm */
+ p->header.freecnt = 8;
+
+ /* init '..' entry */
+ p->header.idotdot = cpu_to_le32(idotdot);
+
+#if 0
+ ip->i_blocks = LBLK2PBLK(ip->i_sb,
+ ((jfs_ip->ea.flag & DXD_EXTENT) ?
+ lengthDXD(&jfs_ip->ea) : 0) +
+ ((jfs_ip->acl.flag & DXD_EXTENT) ?
+ lengthDXD(&jfs_ip->acl) : 0));
+#endif
+
+ return;
+}
+
+/*
+ *	jfs_readdir()
+ *
+ * function: read directory entries sequentially
+ *	from the specified entry offset
+ *
+ * parameter:
+ *	filp	- open directory; filp->f_pos is the resume position and is
+ *		  rewritten as entries are emitted (-1 = end of directory)
+ *	dirent	- opaque cookie passed through to filldir
+ *	filldir	- per-entry callback; a non-zero return stops the scan
+ *
+ * return: 0 when the scan stops normally (filldir asked to stop, end of
+ *	directory, or a problem that is reported by ending the directory);
+ *	the get_index() code if the index table lookup fails;
+ *	-rc if dtReadFirst() or reading the next leaf page fails.
+ *
+ * note: the saved offset = (pn, index) of start entry
+ *	of next jfs_readdir()/dtRead()
+ */
+int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct inode *ip = filp->f_dentry->d_inode;
+	struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab;
+	int rc = 0;
+	/* legacy (non-indexed) view of f_pos: page number and entry index */
+	struct dtoffset {
+		s16 pn;
+		s16 index;
+		s32 unused;
+	} *dtoffset = (struct dtoffset *) &filp->f_pos;
+	s64 bn;
+	metapage_t *mp;
+	dtpage_t *p;
+	int index;
+	s8 *stbl;
+	btstack_t btstack;
+	int i, next;
+	ldtentry_t *d;
+	dtslot_t *t;
+	int d_namleft, d_namlen, len, outlen;
+	char *d_name, *name_ptr;
+	u32 dir_index;
+	int do_index = 0;
+	uint loop_count = 0;
+
+	if (filp->f_pos == -1)
+		return 0;
+
+	if (DO_INDEX(ip)) {
+		/*
+		 * persistent index is stored in directory entries.
+		 * Special cases:        0 = .
+		 *                       1 = ..
+		 *                      -1 = End of directory
+		 */
+		do_index = 1;
+
+		dir_index = (u32) filp->f_pos;
+
+		if (dir_index > 1) {
+			dir_table_slot_t dirtab_slot;
+
+			if (dtEmpty(ip)) {
+				filp->f_pos = -1;
+				return 0;
+			}
+		      repeat:
+			rc = get_index(ip, dir_index, &dirtab_slot);
+			if (rc) {
+				filp->f_pos = -1;
+				return rc;
+			}
+			if (dirtab_slot.flag == DIR_INDEX_FREE) {
+				/*
+				 * The entry was deleted; follow the chain of
+				 * freed slots (addr2), bounding the walk by
+				 * next_index so a corrupt chain cannot spin
+				 * forever.
+				 */
+				if (loop_count++ > JFS_IP(ip)->next_index) {
+					jERROR(1, ("jfs_readdir detected "
+						   "infinite loop!\n"));
+					filp->f_pos = -1;
+					return 0;
+				}
+				dir_index = le32_to_cpu(dirtab_slot.addr2);
+				if (dir_index == -1) {
+					filp->f_pos = -1;
+					return 0;
+				}
+				goto repeat;
+			}
+			bn = addressDTS(&dirtab_slot);
+			index = dirtab_slot.slot;
+			DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+			if (rc) {
+				filp->f_pos = -1;
+				return 0;
+			}
+			/* the index table must point at a leaf page */
+			if (p->header.flag & BT_INTERNAL) {
+				jERROR(1,("jfs_readdir: bad index table\n"));
+				DT_PUTPAGE(mp);
+				filp->f_pos = -1;
+				return 0;
+			}
+		} else {
+			if (dir_index == 0) {
+				/*
+				 * self "."
+				 */
+				filp->f_pos = 0;
+				if (filldir(dirent, ".", 1, 0, ip->i_ino,
+					    DT_DIR))
+					return 0;
+			}
+			/*
+			 * parent ".."
+			 */
+			filp->f_pos = 1;
+			if (filldir
+			    (dirent, "..", 2, 1, PARENT(ip), DT_DIR))
+				return 0;
+
+			/*
+			 * Find first entry of left-most leaf
+			 */
+			if (dtEmpty(ip)) {
+				filp->f_pos = -1;
+				return 0;
+			}
+
+			if ((rc = dtReadFirst(ip, &btstack)))
+				return -rc;
+
+			DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+		}
+	} else {
+		/*
+		 * Legacy filesystem - OS/2 & Linux JFS < 0.3.6
+		 *
+		 * pn = index = 0:      First entry "."
+		 * pn = 0; index = 1:   Second entry ".."
+		 * pn > 0:              Real entries, pn=1 -> leftmost page
+		 * pn = index = -1:     No more entries
+		 */
+		if (filp->f_pos == 0) {
+			/* build "." entry */
+
+			if (filldir(dirent, ".", 1, filp->f_pos, ip->i_ino,
+				    DT_DIR))
+				return 0;
+			dtoffset->index = 1;
+		}
+
+		if (dtoffset->pn == 0) {
+			if (dtoffset->index == 1) {
+				/* build ".." entry */
+
+				if (filldir(dirent, "..", 2, filp->f_pos,
+					    PARENT(ip), DT_DIR))
+					return 0;
+			} else {
+				jERROR(1,
+				       ("jfs_readdir called with invalid offset!\n"));
+			}
+			dtoffset->pn = 1;
+			dtoffset->index = 0;
+		}
+
+		if (dtEmpty(ip)) {
+			filp->f_pos = -1;
+			return 0;
+		}
+
+		if ((rc = dtReadNext(ip, &filp->f_pos, &btstack))) {
+			jERROR(1,
+			       ("jfs_readdir: unexpected rc = %d from dtReadNext\n",
+				rc));
+			filp->f_pos = -1;
+			return 0;
+		}
+		/* get start leaf page and index */
+		DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+
+		/* offset beyond directory eof ? */
+		if (bn < 0) {
+			filp->f_pos = -1;
+			return 0;
+		}
+	}
+
+	/* scratch buffer for one converted name */
+	d_name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), GFP_NOFS);
+	if (d_name == NULL) {
+		DT_PUTPAGE(mp);
+		jERROR(1, ("jfs_readdir: kmalloc failed!\n"));
+		filp->f_pos = -1;
+		return 0;
+	}
+	while (1) {
+		stbl = DT_GETSTBL(p);
+
+		for (i = index; i < p->header.nextindex; i++) {
+			d = (ldtentry_t *) & p->slot[stbl[i]];
+
+			d_namleft = d->namlen;
+			name_ptr = d_name;
+
+			if (do_index) {
+				filp->f_pos = le32_to_cpu(d->index);
+				len = min(d_namleft, DTLHDRDATALEN);
+			} else
+				len = min(d_namleft, DTLHDRDATALEN_LEGACY);
+
+			/* copy the name of head/only segment */
+			outlen = jfs_strfromUCS_le(name_ptr, d->name, len,
+						   codepage);
+			d_namlen = outlen;
+
+			/* copy name in the additional segment(s) */
+			next = d->next;
+			while (next >= 0) {
+				t = (dtslot_t *) & p->slot[next];
+				name_ptr += outlen;
+				d_namleft -= len;
+				len = min(d_namleft, DTSLOTDATALEN);
+				outlen = jfs_strfromUCS_le(name_ptr, t->name,
+							   len, codepage);
+				d_namlen+= outlen;
+
+				next = t->next;
+			}
+
+			if (filldir(dirent, d_name, d_namlen, filp->f_pos,
+				    le32_to_cpu(d->inumber), DT_UNKNOWN))
+				goto out;
+			if (!do_index)
+				dtoffset->index++;
+		}
+
+		/*
+		 * get next leaf page
+		 */
+
+		if (p->header.flag & BT_ROOT) {
+			filp->f_pos = -1;
+			break;
+		}
+
+		bn = le64_to_cpu(p->header.next);
+		if (bn == 0) {
+			filp->f_pos = -1;
+			break;
+		}
+
+		/* unpin previous leaf page */
+		DT_PUTPAGE(mp);
+
+		/* get next leaf page */
+		DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+		if (rc) {
+			/* previous page is already unpinned; only free the buffer */
+			kfree(d_name);
+			return -rc;
+		}
+
+		/* update offset (pn:index) for new page */
+		index = 0;
+		if (!do_index) {
+			dtoffset->pn++;
+			dtoffset->index = 0;
+		}
+
+	}
+
+      out:
+	kfree(d_name);
+	DT_PUTPAGE(mp);
+
+	return rc;
+}
+
+
+/*
+ *	dtReadFirst()
+ *
+ * function: get the leftmost page of the directory
+ *
+ * Walks down from the root (bn = 0 by convention), always following
+ * entry 0 of each internal page and pushing each visited internal page
+ * on btstack.  On success the top frame of btstack describes the
+ * leftmost leaf (bn, index 0, mp), which is left pinned.
+ *
+ * return: 0 on success, or the code set by DT_GETPAGE on failure.
+ */
+static int dtReadFirst(struct inode *ip, btstack_t * btstack)
+{
+	int rc = 0;
+	s64 bn = 0;		/* root */
+	int psize = 288;	/* initial in-line directory */
+	metapage_t *mp;
+	dtpage_t *p;
+	s8 *stbl;
+	btframe_t *top;
+	pxd_t *child;
+
+	BT_CLR(btstack);	/* reset stack */
+
+	while (1) {
+		DT_GETPAGE(ip, bn, mp, psize, p, rc);
+		if (rc)
+			return rc;
+
+		/* reached the leftmost leaf page */
+		if (p->header.flag & BT_LEAF)
+			break;
+
+		/* remember the parent page/entry, then step to child 0 */
+		BT_PUSH(btstack, bn, 0);
+
+		stbl = DT_GETSTBL(p);
+		child = (pxd_t *) &p->slot[stbl[0]];
+
+		bn = addressPXD(child);
+		psize = lengthPXD(child) << JFS_SBI(ip->i_sb)->l2bsize;
+
+		/* unpin the parent page */
+		DT_PUTPAGE(mp);
+	}
+
+	/* return leftmost entry */
+	top = btstack->top;
+	top->bn = bn;
+	top->index = 0;
+	top->mp = mp;
+
+	return 0;
+}
+
+
+/*
+ * dtReadNext()
+ *
+ * function: get the page of the specified offset (pn:index)
+ *
+ * parameter:
+ * ip - directory inode
+ * offset - file position, reinterpreted as (pn, index); pn/index are
+ * advanced in place when the scan has to move to the next leaf
+ * btstack - traversal stack; on success its top frame receives
+ * (bn, index, mp) of the target leaf page
+ *
+ * return: 0 on success, otherwise the code from dtReadFirst()/DT_GETPAGE;
+ * if (offset > eof), bn = -1;
+ *
+ * note: if index > nextindex of the target leaf page,
+ * start with 1st entry of next leaf page;
+ */
+static int dtReadNext(struct inode *ip, loff_t * offset, btstack_t * btstack)
+{
+ int rc = 0;
+ struct dtoffset {
+ s16 pn;
+ s16 index;
+ s32 unused;
+ } *dtoffset = (struct dtoffset *) offset;
+ s64 bn;
+ metapage_t *mp;
+ dtpage_t *p;
+ int index;
+ int pn;
+ s8 *stbl;
+ btframe_t *btsp, *parent;
+ pxd_t *xd;
+
+ /*
+ * get leftmost leaf page pinned
+ */
+ if ((rc = dtReadFirst(ip, btstack)))
+ return rc;
+
+ /* get leaf page */
+ DT_GETSEARCH(ip, btstack->top, bn, mp, p, index);
+
+ /* get the start offset (pn:index) */
+ pn = dtoffset->pn - 1; /* Now pn = 0 represents leftmost leaf */
+ index = dtoffset->index;
+
+ /* start at leftmost page ? */
+ if (pn == 0) {
+ /* offset beyond eof ? */
+ if (index < p->header.nextindex)
+ goto out;
+
+ /* a root that is also the leftmost leaf has no sibling to move to */
+ if (p->header.flag & BT_ROOT) {
+ bn = -1;
+ goto out;
+ }
+
+ /* start with 1st entry of next leaf page */
+ dtoffset->pn++;
+ dtoffset->index = index = 0;
+ goto a;
+ }
+
+ /* start at non-leftmost page: scan parent pages for large pn */
+ if (p->header.flag & BT_ROOT) {
+ bn = -1;
+ goto out;
+ }
+
+ /* start after next leaf page ? */
+ if (pn > 1)
+ goto b;
+
+ /* get leaf page pn = 1 */
+ a:
+ bn = le64_to_cpu(p->header.next);
+
+ /* unpin leaf page */
+ DT_PUTPAGE(mp);
+
+ /* offset beyond eof ? */
+ if (bn == 0) {
+ bn = -1;
+ goto out;
+ }
+
+ goto c;
+
+ /*
+ * scan last internal page level to get target leaf page
+ */
+ b:
+ /* unpin leftmost leaf page */
+ DT_PUTPAGE(mp);
+
+ /* get left most parent page */
+ btsp = btstack->top;
+ parent = btsp - 1;
+ bn = parent->bn;
+ DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+ if (rc)
+ return rc;
+
+ /* scan parent pages at last internal page level */
+ /* each parent entry stands for one leaf, so consume pn page by page */
+ while (pn >= p->header.nextindex) {
+ pn -= p->header.nextindex;
+
+ /* get next parent page address */
+ bn = le64_to_cpu(p->header.next);
+
+ /* unpin current parent page */
+ DT_PUTPAGE(mp);
+
+ /* offset beyond eof ? */
+ if (bn == 0) {
+ bn = -1;
+ goto out;
+ }
+
+ /* get next parent page */
+ DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+ if (rc)
+ return rc;
+
+ /* update parent page stack frame */
+ parent->bn = bn;
+ }
+
+ /* get leaf page address */
+ stbl = DT_GETSTBL(p);
+ xd = (pxd_t *) & p->slot[stbl[pn]];
+ bn = addressPXD(xd);
+
+ /* unpin parent page */
+ DT_PUTPAGE(mp);
+
+ /*
+ * get target leaf page
+ */
+ c:
+ DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+ if (rc)
+ return rc;
+
+ /*
+ * leaf page has been completed:
+ * start with 1st entry of next leaf page
+ */
+ if (index >= p->header.nextindex) {
+ bn = le64_to_cpu(p->header.next);
+
+ /* unpin leaf page */
+ DT_PUTPAGE(mp);
+
+ /* offset beyond eof ? */
+ if (bn == 0) {
+ bn = -1;
+ goto out;
+ }
+
+ /* get next leaf page */
+ DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+ if (rc)
+ return rc;
+
+ /* start with 1st entry of next leaf page */
+ dtoffset->pn++;
+ dtoffset->index = 0;
+ }
+
+ out:
+ /* return target leaf page pinned */
+ /*
+ * NOTE(review): on the bn == -1 paths that follow a DT_PUTPAGE(mp),
+ * the mp stored below has already been released; jfs_readdir checks
+ * bn < 0 before touching it - confirm any other caller does too.
+ */
+ btsp = btstack->top;
+ btsp->bn = bn;
+ btsp->index = dtoffset->index;
+ btsp->mp = mp;
+
+ return 0;
+}
+
+
+/*
+ *	dtCompare()
+ *
+ * function: compare search key with an internal entry
+ *
+ * The entry name is stored in segments: a head segment of at most
+ * DTIHDRDATALEN characters in the idtentry_t, followed by chained
+ * dtslot_t segments of at most DTSLOTDATALEN characters each.  The key
+ * is compared segment by segment with UniStrncmp_le(); if every compared
+ * character matches, the remaining lengths decide the order.
+ *
+ * return:
+ *	< 0 if k is < record
+ *	= 0 if k is = record
+ *	> 0 if k is > record
+ */
+static int dtCompare(component_t * key,	/* search key */
+		     dtpage_t * p,	/* directory page */
+		     int si)		/* entry slot index */
+{
+	idtentry_t *head = (idtentry_t *) &p->slot[si];
+	wchar_t *kpos = key->name;
+	int kleft = key->namlen;
+	int nleft = head->namlen;
+	int nextsi = head->next;
+	int n, diff;
+	dtslot_t *seg;
+
+	/* head/only segment */
+	n = min(nleft, DTIHDRDATALEN);
+	n = min(kleft, n);
+	diff = UniStrncmp_le(kpos, head->name, n);
+	if (diff)
+		return diff;
+
+	kleft -= n;
+	nleft -= n;
+	kpos += n;
+
+	/* additional segment(s), while both names still have characters */
+	while (kleft > 0 && nleft > 0) {
+		seg = (dtslot_t *) &p->slot[nextsi];
+
+		n = min(nleft, DTSLOTDATALEN);
+		n = min(kleft, n);
+		diff = UniStrncmp_le(kpos, seg->name, n);
+		if (diff)
+			return diff;
+
+		kleft -= n;
+		nleft -= n;
+		kpos += n;
+		nextsi = seg->next;
+	}
+
+	/* common prefix is equal: the shorter name sorts first */
+	return kleft - nleft;
+}
+
+
+
+
+/*
+ *	ciCompare()
+ *
+ * function: compare search key with an (leaf/internal) entry
+ *
+ * The stored name is converted from little-endian one character at a
+ * time and, when all JFS_OS2 bits are set in flag, upper-cased with
+ * UniToupper() before being compared with the key character.  The key
+ * itself is compared as given.
+ *
+ * The head segment capacity depends on the entry type: DTLHDRDATALEN for
+ * leaf entries when flag has JFS_DIR_INDEX, DTLHDRDATALEN_LEGACY for
+ * other leaf entries, DTIHDRDATALEN for internal entries.
+ *
+ * return:
+ *	< 0 if k is < record
+ *	= 0 if k is = record
+ *	> 0 if k is > record
+ */
+static int ciCompare(component_t * key,	/* search key */
+		     dtpage_t * p,	/* directory page */
+		     int si,		/* entry slot index */
+		     int flag)
+{
+	wchar_t *kname = key->name;
+	wchar_t *name, x;
+	int klen = key->namlen;
+	int namlen, len, hdrmax, i, rc;
+	int fold_case = ((flag & JFS_OS2) == JFS_OS2);
+	dtslot_t *t;
+
+	if (p->header.flag & BT_LEAF) {
+		/* leaf page entry */
+		ldtentry_t *lh = (ldtentry_t *) &p->slot[si];
+
+		si = lh->next;
+		name = lh->name;
+		namlen = lh->namlen;
+		if (flag & JFS_DIR_INDEX)
+			hdrmax = DTLHDRDATALEN;
+		else
+			hdrmax = DTLHDRDATALEN_LEGACY;
+	} else {
+		/* internal page entry */
+		idtentry_t *ih = (idtentry_t *) &p->slot[si];
+
+		si = ih->next;
+		name = ih->name;
+		namlen = ih->namlen;
+		hdrmax = DTIHDRDATALEN;
+	}
+
+	/* length of the head/only segment to compare */
+	len = min(namlen, hdrmax);
+	len = min(klen, len);
+
+	for (;;) {
+		/* compare the current segment character by character */
+		for (i = 0; i < len; i++, kname++, name++) {
+			x = le16_to_cpu(*name);
+			/* only uppercase if case-insensitive support is on */
+			if (fold_case)
+				x = UniToupper(x);
+			rc = *kname - x;
+			if (rc)
+				return rc;
+		}
+
+		klen -= len;
+		namlen -= len;
+		if (klen <= 0 || namlen <= 0)
+			break;
+
+		/* move on to the next name segment */
+		t = (dtslot_t *) &p->slot[si];
+		len = min(namlen, DTSLOTDATALEN);
+		len = min(klen, len);
+		name = t->name;
+		si = t->next;
+	}
+
+	return klen - namlen;
+}
+
+
+/*
+ *	ciGetLeafPrefixKey()
+ *
+ * function: compute prefix of suffix compression
+ *	     from two adjacent leaf entries
+ *	     across page boundary
+ *
+ * Both keys are fetched with dtGetKey(), NUL-terminated, and upper-cased
+ * with ciToUpper() when all JFS_OS2 bits are set in flag.  The result,
+ * written to key->name / key->namlen, is the shortest prefix of the
+ * right key that differs from the left key (or the whole key when the
+ * two are identical).
+ *
+ * return: none; the prefix and its length are stored through key.
+ */
+static void ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp,
+			       int ri, component_t * key, int flag)
+{
+	wchar_t lname[JFS_NAME_MAX + 1];
+	component_t lkey = { 0, lname };
+	wchar_t rname[JFS_NAME_MAX + 1];
+	component_t rkey = { 0, rname };
+	wchar_t *left, *right, *out;
+	int common, n;
+
+	/* get left key */
+	dtGetKey(lp, li, &lkey, flag);
+	lkey.name[lkey.namlen] = 0;
+	if ((flag & JFS_OS2) == JFS_OS2)
+		ciToUpper(&lkey);
+
+	/* get right key */
+	dtGetKey(rp, ri, &rkey, flag);
+	rkey.name[rkey.namlen] = 0;
+	if ((flag & JFS_OS2) == JFS_OS2)
+		ciToUpper(&rkey);
+
+	/* copy right-key characters until the first mismatch */
+	left = lkey.name;
+	right = rkey.name;
+	out = key->name;
+	common = min(lkey.namlen, rkey.namlen);
+	for (n = 0; n < common; n++) {
+		out[n] = right[n];
+		if (left[n] != right[n]) {
+			key->namlen = n + 1;
+			return;
+		}
+	}
+
+	/* l->namlen <= r->namlen since l <= r */
+	if (lkey.namlen < rkey.namlen) {
+		/* one extra character of the right key distinguishes them */
+		out[n] = right[n];
+		key->namlen = n + 1;
+	} else {
+		/* l->namelen == r->namelen */
+		key->namlen = n;
+	}
+}
+
+
+
+/*
+ *	dtGetKey()
+ *
+ * function: get key of the entry
+ *
+ * Copies the name of entry i of page p into key->name (via
+ * UniStrncpy_le) and stores its length in key->namlen.  The head
+ * segment capacity is DTLHDRDATALEN / DTLHDRDATALEN_LEGACY for leaf
+ * entries (chosen by JFS_DIR_INDEX in flag) and DTIHDRDATALEN for
+ * internal entries; remaining characters come from the chained slots.
+ */
+static void dtGetKey(dtpage_t * p, int i,	/* entry index */
+		     component_t * key, int flag)
+{
+	s8 *stbl = DT_GETSTBL(p);
+	int si = stbl[i];
+	int remaining, chunk;
+	wchar_t *src, *dst;
+	dtslot_t *seg;
+
+	/* locate the head segment and its capacity */
+	if (p->header.flag & BT_LEAF) {
+		ldtentry_t *lh = (ldtentry_t *) &p->slot[si];
+
+		si = lh->next;
+		remaining = lh->namlen;
+		src = lh->name;
+		if (flag & JFS_DIR_INDEX)
+			chunk = min(remaining, DTLHDRDATALEN);
+		else
+			chunk = min(remaining, DTLHDRDATALEN_LEGACY);
+	} else {
+		idtentry_t *ih = (idtentry_t *) &p->slot[si];
+
+		si = ih->next;
+		remaining = ih->namlen;
+		src = ih->name;
+		chunk = min(remaining, DTIHDRDATALEN);
+	}
+
+	key->namlen = remaining;
+	dst = key->name;
+
+	/* move head/only segment */
+	UniStrncpy_le(dst, src, chunk);
+
+	/* move additional segment(s) */
+	for (; si >= 0; si = seg->next) {
+		seg = &p->slot[si];
+		dst += chunk;
+		remaining -= chunk;
+		chunk = min(remaining, DTSLOTDATALEN);
+		UniStrncpy_le(dst, seg->name, chunk);
+	}
+}
+
+
+/*
+ * dtInsertEntry()
+ *
+ * function: allocate free slot(s) and
+ * write a leaf/internal entry
+ *
+ * parameter:
+ * p - directory page receiving the entry
+ * index - position in the sorted entry table (stbl) for the new entry
+ * key - name to store (key->name, key->namlen)
+ * data - leaf page: data->leaf.ino/ip/tid; internal page: data->xd
+ * dtlock - in/out: current linelock; updated if a new one was opened
+ *
+ * return: none (the head slot index is recorded in stbl[index])
+ *
+ * NOTE(review): slots are taken from the free list without checking
+ * freecnt - presumably the caller guarantees enough free slots; confirm.
+ */
+static void dtInsertEntry(dtpage_t * p, int index, component_t * key,
+ ddata_t * data, dtlock_t ** dtlock)
+{
+ dtslot_t *h, *t;
+ ldtentry_t *lh = 0;
+ idtentry_t *ih = 0;
+ int hsi, fsi, klen, len, nextindex;
+ wchar_t *kname, *name;
+ s8 *stbl;
+ pxd_t *xd;
+ dtlock_t *dtlck = *dtlock;
+ lv_t *lv;
+ int xsi, n;
+ s64 bn = 0;
+ metapage_t *mp = 0;
+
+ klen = key->namlen;
+ kname = key->name;
+
+ /* allocate a free slot */
+ hsi = fsi = p->header.freelist;
+ h = &p->slot[fsi];
+ p->header.freelist = h->next;
+ --p->header.freecnt;
+
+ /* open new linelock */
+ if (dtlck->index >= dtlck->maxcnt)
+ dtlck = (dtlock_t *) txLinelock(dtlck);
+
+ lv = (lv_t *) & dtlck->lv[dtlck->index];
+ lv->offset = hsi;
+
+ /* write head/only segment */
+ if (p->header.flag & BT_LEAF) {
+ lh = (ldtentry_t *) h;
+ lh->next = h->next;
+ lh->inumber = data->leaf.ino; /* little-endian */
+ lh->namlen = klen;
+ name = lh->name;
+ /* data->leaf.ip set: indexed directory, register entry in index table */
+ if (data->leaf.ip) {
+ len = min(klen, DTLHDRDATALEN);
+ /* bn stays 0 for the root page */
+ if (!(p->header.flag & BT_ROOT))
+ bn = addressPXD(&p->header.self);
+ lh->index = cpu_to_le32(add_index(data->leaf.tid,
+ data->leaf.ip,
+ bn, index));
+ } else
+ len = min(klen, DTLHDRDATALEN_LEGACY);
+ } else {
+ ih = (idtentry_t *) h;
+ ih->next = h->next;
+ /* the child page descriptor occupies the start of the entry */
+ xd = (pxd_t *) ih;
+ *xd = data->xd;
+ ih->namlen = klen;
+ name = ih->name;
+ len = min(klen, DTIHDRDATALEN);
+ }
+
+ UniStrncpy_le(name, kname, len);
+
+ /* n = slots covered by the open linelock, xsi = last slot written */
+ n = 1;
+ xsi = hsi;
+
+ /* write additional segment(s) */
+ t = h;
+ klen -= len;
+ while (klen) {
+ /* get free slot */
+ fsi = p->header.freelist;
+ t = &p->slot[fsi];
+ p->header.freelist = t->next;
+ --p->header.freecnt;
+
+ /* is next slot contiguous ? */
+ if (fsi != xsi + 1) {
+ /* close current linelock */
+ lv->length = n;
+ dtlck->index++;
+
+ /* open new linelock */
+ if (dtlck->index < dtlck->maxcnt)
+ lv++;
+ else {
+ dtlck = (dtlock_t *) txLinelock(dtlck);
+ lv = (lv_t *) & dtlck->lv[0];
+ }
+
+ lv->offset = fsi;
+ n = 0;
+ }
+
+ kname += len;
+ len = min(klen, DTSLOTDATALEN);
+ UniStrncpy_le(t->name, kname, len);
+
+ n++;
+ xsi = fsi;
+ klen -= len;
+ }
+
+ /* close current linelock */
+ lv->length = n;
+ dtlck->index++;
+
+ *dtlock = dtlck;
+
+ /* terminate last/only segment */
+ if (h == t) {
+ /* single segment entry */
+ if (p->header.flag & BT_LEAF)
+ lh->next = -1;
+ else
+ ih->next = -1;
+ } else
+ /* multi-segment entry */
+ t->next = -1;
+
+ /* if insert into middle, shift right succeeding entries in stbl */
+ stbl = DT_GETSTBL(p);
+ nextindex = p->header.nextindex;
+ if (index < nextindex) {
+ memmove(stbl + index + 1, stbl + index, nextindex - index);
+
+ if ((p->header.flag & BT_LEAF) && data->leaf.ip) {
+ /*
+ * Need to update slot number for entries that moved
+ * in the stbl
+ */
+ mp = 0;
+ /* shifted entries now live at stbl[index+1 .. nextindex] */
+ for (n = index + 1; n <= nextindex; n++) {
+ lh = (ldtentry_t *) & (p->slot[stbl[n]]);
+ modify_index(data->leaf.tid, data->leaf.ip,
+ le32_to_cpu(lh->index), bn, n,
+ &mp);
+ }
+ if (mp)
+ release_metapage(mp);
+ }
+ }
+
+ stbl[index] = hsi;
+
+ /* advance next available entry index of stbl */
+ ++p->header.nextindex;
+}
+
+
+/*
+ * dtMoveEntry()
+ *
+ * function: move entries from split/left page to new/right page
+ *
+ * nextindex of dst page and freelist/freecnt of both pages
+ * are updated.
+ *
+ * parameter:
+ * sp - source (split/left) page
+ * si - index in sp's sorted entry table of the first entry to move;
+ * every entry from si up to sp->header.nextindex is moved
+ * dp - destination (new/right) page
+ * sdtlock - in/out: linelock for the source page
+ * ddtlock - in/out: linelock for the destination page
+ * do_index - non-zero: leaf entries carry a directory index field
+ *
+ * NOTE(review): sp->header.nextindex is not changed here - presumably
+ * the caller truncates it; confirm.
+ */
+static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp,
+ dtlock_t ** sdtlock, dtlock_t ** ddtlock,
+ int do_index)
+{
+ int ssi, next; /* src slot index */
+ int di; /* dst entry index */
+ int dsi; /* dst slot index */
+ s8 *sstbl, *dstbl; /* sorted entry table */
+ int snamlen, len;
+ ldtentry_t *slh, *dlh = 0;
+ idtentry_t *sih, *dih = 0;
+ dtslot_t *h, *s, *d;
+ dtlock_t *sdtlck = *sdtlock, *ddtlck = *ddtlock;
+ lv_t *slv, *dlv;
+ int xssi, ns, nd;
+ int sfsi;
+
+ sstbl = (s8 *) & sp->slot[sp->header.stblindex];
+ dstbl = (s8 *) & dp->slot[dp->header.stblindex];
+
+ dsi = dp->header.freelist; /* first (whole page) free slot */
+ sfsi = sp->header.freelist;
+
+ /* linelock destination entry slot */
+ /* destination slots are consumed consecutively, so one lv covers them */
+ dlv = (lv_t *) & ddtlck->lv[ddtlck->index];
+ dlv->offset = dsi;
+
+ /* linelock source entry slot */
+ slv = (lv_t *) & sdtlck->lv[sdtlck->index];
+ slv->offset = sstbl[si];
+ /* prime xssi so the first source slot counts as contiguous */
+ xssi = slv->offset - 1;
+
+ /*
+ * move entries
+ */
+ /* ns = slots in the open source linelock, nd = total slots moved */
+ ns = nd = 0;
+ for (di = 0; si < sp->header.nextindex; si++, di++) {
+ ssi = sstbl[si];
+ dstbl[di] = dsi;
+
+ /* is next slot contiguous ? */
+ if (ssi != xssi + 1) {
+ /* close current linelock */
+ slv->length = ns;
+ sdtlck->index++;
+
+ /* open new linelock */
+ if (sdtlck->index < sdtlck->maxcnt)
+ slv++;
+ else {
+ sdtlck = (dtlock_t *) txLinelock(sdtlck);
+ slv = (lv_t *) & sdtlck->lv[0];
+ }
+
+ slv->offset = ssi;
+ ns = 0;
+ }
+
+ /*
+ * move head/only segment of an entry
+ */
+ /* get dst slot */
+ h = d = &dp->slot[dsi];
+
+ /* get src slot and move */
+ s = &sp->slot[ssi];
+ if (sp->header.flag & BT_LEAF) {
+ /* get source entry */
+ slh = (ldtentry_t *) s;
+ dlh = (ldtentry_t *) h;
+ snamlen = slh->namlen;
+
+ if (do_index) {
+ len = min(snamlen, DTLHDRDATALEN);
+ dlh->index = slh->index; /* little-endian */
+ } else
+ len = min(snamlen, DTLHDRDATALEN_LEGACY);
+
+ /*
+ * NOTE(review): 6 is presumably the size of the fixed
+ * ldtentry_t fields preceding name; len * 2 is the
+ * name in 2-byte characters - confirm against the
+ * struct layout.
+ */
+ memcpy(dlh, slh, 6 + len * 2);
+
+ next = slh->next;
+
+ /* update dst head/only segment next field */
+ dsi++;
+ dlh->next = dsi;
+ } else {
+ sih = (idtentry_t *) s;
+ snamlen = sih->namlen;
+
+ len = min(snamlen, DTIHDRDATALEN);
+ dih = (idtentry_t *) h;
+ /*
+ * NOTE(review): 10 is presumably the size of the fixed
+ * idtentry_t fields preceding name - confirm.
+ */
+ memcpy(dih, sih, 10 + len * 2);
+ next = sih->next;
+
+ dsi++;
+ dih->next = dsi;
+ }
+
+ /* free src head/only segment */
+ /* push the slot on the head of the source free list being rebuilt */
+ s->next = sfsi;
+ s->cnt = 1;
+ sfsi = ssi;
+
+ ns++;
+ nd++;
+ xssi = ssi;
+
+ /*
+ * move additional segment(s) of the entry
+ */
+ snamlen -= len;
+ while ((ssi = next) >= 0) {
+ /* is next slot contiguous ? */
+ if (ssi != xssi + 1) {
+ /* close current linelock */
+ slv->length = ns;
+ sdtlck->index++;
+
+ /* open new linelock */
+ if (sdtlck->index < sdtlck->maxcnt)
+ slv++;
+ else {
+ sdtlck =
+ (dtlock_t *)
+ txLinelock(sdtlck);
+ slv = (lv_t *) & sdtlck->lv[0];
+ }
+
+ slv->offset = ssi;
+ ns = 0;
+ }
+
+ /* get next source segment */
+ s = &sp->slot[ssi];
+
+ /* get next destination free slot */
+ d++;
+
+ len = min(snamlen, DTSLOTDATALEN);
+ UniStrncpy(d->name, s->name, len);
+
+ ns++;
+ nd++;
+ xssi = ssi;
+
+ dsi++;
+ d->next = dsi;
+
+ /* free source segment */
+ next = s->next;
+ s->next = sfsi;
+ s->cnt = 1;
+ sfsi = ssi;
+
+ snamlen -= len;
+ } /* end while */
+
+ /* terminate dst last/only segment */
+ if (h == d) {
+ /* single segment entry */
+ if (dp->header.flag & BT_LEAF)
+ dlh->next = -1;
+ else
+ dih->next = -1;
+ } else
+ /* multi-segment entry */
+ d->next = -1;
+ } /* end for */
+
+ /* close current linelock */
+ slv->length = ns;
+ sdtlck->index++;
+ *sdtlock = sdtlck;
+
+ dlv->length = nd;
+ ddtlck->index++;
+ *ddtlock = ddtlck;
+
+ /* update source header */
+ sp->header.freelist = sfsi;
+ sp->header.freecnt += nd;
+
+ /* update destination header */
+ dp->header.nextindex = di;
+
+ dp->header.freelist = dsi;
+ dp->header.freecnt -= nd;
+}
+
+
+/*
+ * dtDeleteEntry()
+ *
+ * function: free a (leaf/internal) entry
+ *
+ * log freelist header, stbl, and each segment slot of entry
+ * (even though last/only segment next field is modified,
+ * physical image logging requires all segment slots of
+ * the entry logged to avoid applying previous updates
+ * to the same slots)
+ *
+ * parameter:
+ * p - directory page holding the entry
+ * fi - index of the entry in the sorted entry table (stbl)
+ * dtlock - in/out: current linelock; updated if a new one was opened
+ *
+ * All slots of the entry are returned to the head of the page's free
+ * list, the stbl is compacted and header.nextindex is decremented.
+ */
+static void dtDeleteEntry(dtpage_t * p, int fi, dtlock_t ** dtlock)
+{
+ int fsi; /* free entry slot index */
+ s8 *stbl;
+ dtslot_t *t;
+ int si, freecnt;
+ dtlock_t *dtlck = *dtlock;
+ lv_t *lv;
+ int xsi, n;
+
+ /* get free entry slot index */
+ stbl = DT_GETSTBL(p);
+ fsi = stbl[fi];
+
+ /* open new linelock */
+ if (dtlck->index >= dtlck->maxcnt)
+ dtlck = (dtlock_t *) txLinelock(dtlck);
+ lv = (lv_t *) & dtlck->lv[dtlck->index];
+
+ lv->offset = fsi;
+
+ /* get the head/only segment */
+ t = &p->slot[fsi];
+ if (p->header.flag & BT_LEAF)
+ si = ((ldtentry_t *) t)->next;
+ else
+ si = ((idtentry_t *) t)->next;
+ /* rewrite the head through the generic dtslot_t view (next, cnt) */
+ t->next = si;
+ t->cnt = 1;
+
+ /* n = slots in the open linelock, freecnt = slots freed so far */
+ n = freecnt = 1;
+ xsi = fsi;
+
+ /* find the last/only segment */
+ while (si >= 0) {
+ /* is next slot contiguous ? */
+ if (si != xsi + 1) {
+ /* close current linelock */
+ lv->length = n;
+ dtlck->index++;
+
+ /* open new linelock */
+ if (dtlck->index < dtlck->maxcnt)
+ lv++;
+ else {
+ dtlck = (dtlock_t *) txLinelock(dtlck);
+ lv = (lv_t *) & dtlck->lv[0];
+ }
+
+ lv->offset = si;
+ n = 0;
+ }
+
+ n++;
+ xsi = si;
+ freecnt++;
+
+ t = &p->slot[si];
+ t->cnt = 1;
+ si = t->next;
+ }
+
+ /* close current linelock */
+ lv->length = n;
+ dtlck->index++;
+
+ *dtlock = dtlck;
+
+ /* update freelist */
+ /* t is the entry's last segment: splice the whole chain in front */
+ t->next = p->header.freelist;
+ p->header.freelist = fsi;
+ p->header.freecnt += freecnt;
+
+ /* if delete from middle,
+ * shift left the succeeding entries in the stbl
+ */
+ si = p->header.nextindex;
+ if (fi < si - 1)
+ memmove(&stbl[fi], &stbl[fi + 1], si - fi - 1);
+
+ p->header.nextindex--;
+}
+
+
+/*
+ * dtTruncateEntry()
+ *
+ * function: truncate a (leaf/internal) entry
+ *
+ * log freelist header, stbl, and each segment slot of entry
+ * (even though last/only segment next field is modified,
+ * physical image logging requires all segment slots of
+ * the entry logged to avoid applying previous updates
+ * to the same slots)
+ *
+ * parameter:
+ * p - directory page holding the entry; the code asserts BT_INTERNAL
+ * ti - index of the entry in the sorted entry table (stbl)
+ * dtlock - in/out: current linelock; updated if a new one was opened
+ *
+ * The head segment stays in place with namlen = 0 and next = -1; any
+ * additional segments are returned to the head of the page's free list.
+ */
+static void dtTruncateEntry(dtpage_t * p, int ti, dtlock_t ** dtlock)
+{
+ int tsi; /* truncate entry slot index */
+ s8 *stbl;
+ dtslot_t *t;
+ int si, freecnt;
+ dtlock_t *dtlck = *dtlock;
+ lv_t *lv;
+ int fsi, xsi, n;
+
+ /* get slot index of the entry to truncate */
+ stbl = DT_GETSTBL(p);
+ tsi = stbl[ti];
+
+ /* open new linelock */
+ if (dtlck->index >= dtlck->maxcnt)
+ dtlck = (dtlock_t *) txLinelock(dtlck);
+ lv = (lv_t *) & dtlck->lv[dtlck->index];
+
+ lv->offset = tsi;
+
+ /* get the head/only segment */
+ t = &p->slot[tsi];
+ ASSERT(p->header.flag & BT_INTERNAL);
+ ((idtentry_t *) t)->namlen = 0;
+ si = ((idtentry_t *) t)->next;
+ ((idtentry_t *) t)->next = -1;
+
+ /* the head is logged (n = 1) but not freed (freecnt = 0) */
+ n = 1;
+ freecnt = 0;
+ /* fsi = first freed slot, i.e. the first additional segment (or -1) */
+ fsi = si;
+ xsi = tsi;
+
+ /* find the last/only segment */
+ while (si >= 0) {
+ /* is next slot contiguous ? */
+ if (si != xsi + 1) {
+ /* close current linelock */
+ lv->length = n;
+ dtlck->index++;
+
+ /* open new linelock */
+ if (dtlck->index < dtlck->maxcnt)
+ lv++;
+ else {
+ dtlck = (dtlock_t *) txLinelock(dtlck);
+ lv = (lv_t *) & dtlck->lv[0];
+ }
+
+ lv->offset = si;
+ n = 0;
+ }
+
+ n++;
+ xsi = si;
+ freecnt++;
+
+ t = &p->slot[si];
+ t->cnt = 1;
+ si = t->next;
+ }
+
+ /* close current linelock */
+ lv->length = n;
+ dtlck->index++;
+
+ *dtlock = dtlck;
+
+ /* update freelist */
+ /* nothing to free when the entry had only a head segment */
+ if (freecnt == 0)
+ return;
+ t->next = p->header.freelist;
+ p->header.freelist = fsi;
+ p->header.freecnt += freecnt;
+}
+
+
+/*
+ *	dtLinelockFreelist()
+ *
+ * function: record linelocks for the slots on the page's free list.
+ *
+ * Starting at header.freelist, follows the next links while the slot
+ * index stays in [0, m) and opens a new linelock entry whenever the
+ * next free slot is not contiguous with the previous one.  The first
+ * free slot is always locked, without being compared against m.
+ *
+ * parameter:
+ *	p	- directory page
+ *	m	- max slot index (exclusive bound for followed slots)
+ *	dtlock	- in/out: current linelock; updated if a new one was opened
+ */
+static void dtLinelockFreelist(dtpage_t * p,	/* directory page */
+			       int m,	/* max slot index */
+			       dtlock_t ** dtlock)
+{
+	dtlock_t *dtlck = *dtlock;
+	lv_t *lv;
+	int first = p->header.freelist;	/* head of the free list */
+	int prev, cur, run;
+
+	/* open new linelock */
+	if (dtlck->index >= dtlck->maxcnt)
+		dtlck = (dtlock_t *) txLinelock(dtlck);
+	lv = (lv_t *) &dtlck->lv[dtlck->index];
+	lv->offset = first;
+
+	run = 1;
+	prev = first;
+
+	for (cur = p->slot[first].next; cur >= 0 && cur < m;
+	     cur = p->slot[cur].next) {
+		/* a gap ends the current run of contiguous slots */
+		if (cur != prev + 1) {
+			/* close current linelock */
+			lv->length = run;
+			dtlck->index++;
+
+			/* open new linelock */
+			if (dtlck->index < dtlck->maxcnt)
+				lv++;
+			else {
+				dtlck = (dtlock_t *) txLinelock(dtlck);
+				lv = (lv_t *) &dtlck->lv[0];
+			}
+
+			lv->offset = cur;
+			run = 0;
+		}
+
+		run++;
+		prev = cur;
+	}
+
+	/* close current linelock */
+	lv->length = run;
+	dtlck->index++;
+
+	*dtlock = dtlck;
+}
+
+
+/*
+ * NAME:	dtModify
+ *
+ * FUNCTION:	Modify the inode number part of a directory entry
+ *
+ * The entry is located with dtSearch(); its leaf page is marked dirty
+ * and transaction-locked, the head slot of the entry is linelocked, and
+ * the stored inode number is replaced with new_ino (little-endian).
+ *
+ * PARAMETERS:
+ *	tid	 - Transaction id
+ *	ip	 - Inode of parent directory
+ *	key	 - Name of entry to be modified
+ *	orig_ino - Original inode number expected in entry
+ *	new_ino	 - New inode number to put into entry
+ *	flag	 - JFS_RENAME
+ *
+ * RETURNS:
+ *	ESTALE	- If entry found does not match orig_ino passed in
+ *	ENOENT	- If no entry can be found to match key
+ *	0	- If successfully modified entry
+ *	(non-zero codes are whatever dtSearch() returned)
+ */
+int dtModify(tid_t tid, struct inode *ip,
+	     component_t * key, ino_t * orig_ino, ino_t new_ino, int flag)
+{
+	int rc;
+	s64 bn;
+	metapage_t *mp;
+	dtpage_t *p;
+	int index;
+	btstack_t btstack;
+	tlock_t *tlck;
+	dtlock_t *dtlck;
+	lv_t *lv;
+	s8 *stbl;
+	int head_si;		/* slot index of the entry's head segment */
+	ldtentry_t *head;
+
+	/* dtSearch() returns the leaf page pinned and the entry index */
+	rc = dtSearch(ip, key, orig_ino, &btstack, flag);
+	if (rc)
+		return rc;
+
+	/* retrieve search result */
+	DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+
+	BT_MARK_DIRTY(mp, ip);
+
+	/* acquire a transaction lock on the leaf page of named entry */
+	tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
+	dtlck = (dtlock_t *) &tlck->lock;
+
+	/* get slot index of the entry */
+	stbl = DT_GETSTBL(p);
+	head_si = stbl[index];
+
+	/* linelock the single head slot */
+	ASSERT(dtlck->index == 0);
+	lv = (lv_t *) &dtlck->lv[0];
+	lv->offset = head_si;
+	lv->length = 1;
+	dtlck->index++;
+
+	/* substitute the inode number of the entry */
+	head = (ldtentry_t *) &p->slot[head_si];
+	head->inumber = cpu_to_le32(new_ino);
+
+	/* unpin the leaf page */
+	DT_PUTPAGE(mp);
+
+	return 0;
+}
+
+#ifdef _JFS_DEBUG_DTREE
+/*
+ * dtDisplayTree()
+ *
+ * function: traverse forward
+ *
+ * Debug aid (built only under _JFS_DEBUG_DTREE). Walks the directory
+ * B+-tree of ip depth-first starting at the root (bn 0) and prints
+ * every page through dtDisplayPage(). The path from the root is kept
+ * in a local btstack so that, after a leaf has been shown, the walk
+ * can return to the nearest parent that still has unvisited entries.
+ *
+ * parameter:
+ * ip - directory inode whose tree is displayed
+ *
+ * return: 0 when the whole tree has been visited, otherwise the
+ * non-zero rc produced by DT_GETPAGE().
+ */
+int dtDisplayTree(struct inode *ip)
+{
+ int rc;
+ metapage_t *mp;
+ dtpage_t *p;
+ s64 bn, pbn;
+ int index, lastindex, v, h;
+ pxd_t *xd;
+ btstack_t btstack;
+ btframe_t *btsp;
+ btframe_t *parent;
+ u8 *stbl;
+ int psize = 256;
+
+ printk("display B+-tree.\n");
+
+ /* clear stack */
+ btsp = btstack.stack;
+
+ /*
+ * start with root
+ *
+ * root resides in the inode
+ */
+ bn = 0;
+ v = h = 0;
+
+ /*
+ * first access of each page:
+ */
+ newPage:
+ DT_GETPAGE(ip, bn, mp, psize, p, rc);
+ if (rc)
+ return rc;
+
+ /* process entries forward from first index */
+ index = 0;
+ lastindex = p->header.nextindex - 1;
+
+ if (p->header.flag & BT_INTERNAL) {
+ /*
+ * first access of each internal page
+ */
+ printk("internal page ");
+ dtDisplayPage(ip, bn, p);
+
+ goto getChild;
+ } else { /* (p->header.flag & BT_LEAF) */
+
+ /*
+ * first access of each leaf page
+ */
+ printk("leaf page ");
+ dtDisplayPage(ip, bn, p);
+
+ /*
+ * process leaf page entries
+ *
+ for ( ; index <= lastindex; index++)
+ {
+ }
+ */
+
+ /* unpin the leaf page */
+ DT_PUTPAGE(mp);
+ }
+
+ /*
+ * go back up to the parent page
+ */
+ getParent:
+ /* pop/restore parent entry for the current child page */
+ if ((parent = (btsp == btstack.stack ? NULL : --btsp)) == NULL)
+ /* current page must have been root: traversal complete */
+ return 0;
+
+ /*
+ * parent page scan completed
+ */
+ if ((index = parent->index) == (lastindex = parent->lastindex)) {
+ /* go back up to the parent page */
+ goto getParent;
+ }
+
+ /*
+ * parent page has entries remaining
+ */
+ /* get back the parent page */
+ bn = parent->bn;
+ /* v = parent->level; */
+ DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+ if (rc)
+ return rc;
+
+ /* get next parent entry */
+ index++;
+
+ /*
+ * internal page: go down to child page of current entry
+ */
+ getChild:
+ /* push/save current parent entry for the child page */
+ btsp->bn = pbn = bn;
+ btsp->index = index;
+ btsp->lastindex = lastindex;
+ /* btsp->level = v; */
+ /* btsp->node = h; */
+ ++btsp;
+
+ /* get current entry for the child page */
+ stbl = DT_GETSTBL(p);
+ xd = (pxd_t *) & p->slot[stbl[index]];
+
+ /*
+ * first access of each internal entry:
+ */
+
+ /* get child page */
+ bn = addressPXD(xd);
+ psize = lengthPXD(xd) << ip->i_ipmnt->i_l2bsize;
+
+ printk("traverse down 0x%Lx[%d]->0x%Lx\n", pbn, index, bn);
+ v++;
+ h = index;
+
+ /* release parent page */
+ DT_PUTPAGE(mp);
+
+ /* process the child page */
+ goto newPage;
+}
+
+
+/*
+ * dtDisplayPage()
+ *
+ * function: display page
+ *
+ * Debug aid (built only under _JFS_DEBUG_DTREE). Prints the page
+ * header and then every entry listed in the sorted entry table, four
+ * entries per output line. Leaf entries are shown with their inode
+ * number, internal entries with the child page address.
+ *
+ * parameters:
+ * ip - directory inode
+ * bn - block address of the page
+ * p - page to display, or NULL to have the page at bn read in
+ * here (it is then released again before returning)
+ *
+ * return: 0, or the non-zero rc produced by DT_GETPAGE().
+ *
+ * NOTE(review): key.name is a wchar_t array but is printed with %s;
+ * the output is presumably only meaningful for simple names - confirm.
+ */
+int dtDisplayPage(struct inode *ip, s64 bn, dtpage_t * p)
+{
+ int rc;
+ metapage_t *mp = NULL; /* only set when the page is read here */
+ ldtentry_t *lh;
+ idtentry_t *ih;
+ pxd_t *xd;
+ int i, j;
+ u8 *stbl;
+ wchar_t name[JFS_NAME_MAX + 1];
+ component_t key = { 0, name };
+ int freepage = 0;
+
+ if (p == NULL) {
+ freepage = 1;
+ DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+ if (rc)
+ return rc;
+ }
+
+ /* display page control */
+ printk("bn:0x%Lx flag:0x%08x nextindex:%d\n",
+ bn, p->header.flag, p->header.nextindex);
+
+ /* display entries */
+ stbl = DT_GETSTBL(p);
+ for (i = 0, j = 1; i < p->header.nextindex; i++, j++) {
+ dtGetKey(p, i, &key, JFS_SBI(ip->i_sb)->mntflag);
+ key.name[key.namlen] = '\0';
+ if (p->header.flag & BT_LEAF) {
+ lh = (ldtentry_t *) & p->slot[stbl[i]];
+ printk("\t[%d] %s:%d", i, key.name,
+ le32_to_cpu(lh->inumber));
+ } else {
+ ih = (idtentry_t *) & p->slot[stbl[i]];
+ xd = (pxd_t *) ih;
+ bn = addressPXD(xd);
+ printk("\t[%d] %s:0x%Lx", i, key.name, bn);
+ }
+
+ /* break the output line after every fourth entry */
+ if (j == 4) {
+ printk("\n");
+ j = 0;
+ }
+ }
+
+ printk("\n");
+
+ if (freepage)
+ DT_PUTPAGE(mp);
+
+ return 0;
+}
+#endif /* _JFS_DEBUG_DTREE */
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
new file mode 100644
index 000000000000..5ea922aeeb28
--- /dev/null
+++ b/fs/jfs/jfs_dtree.h
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/*
+ * Change History :
+ *
+ */
+
+#ifndef _H_JFS_DTREE
+#define _H_JFS_DTREE
+
+/*
+ * jfs_dtree.h: directory B+-tree manager
+ */
+
+#include "jfs_btree.h"
+
+typedef union {
+ struct {
+ tid_t tid;
+ struct inode *ip;
+ u32 ino;
+ } leaf;
+ pxd_t xd;
+} ddata_t;
+
+
+/*
+ * entry segment/slot
+ *
+ * an entry consists of type dependent head/only segment/slot and
+ * additional segments/slots linked via next field;
+ * N.B. last/only segment of entry is terminated by next = -1;
+ */
+/*
+ * directory page slot
+ */
+typedef struct {
+ s8 next; /* 1: slot index of the next segment/slot; -1 ends the list */
+ s8 cnt; /* 1: NOTE(review): meaning not visible here - confirm */
+ wchar_t name[15]; /* 30: name characters (15 = DTSLOTDATALEN) */
+} dtslot_t; /* (32) */
+
+
+#define DATASLOTSIZE 16
+#define L2DATASLOTSIZE 4
+#define DTSLOTSIZE 32
+#define L2DTSLOTSIZE 5
+#define DTSLOTHDRSIZE 2
+#define DTSLOTDATASIZE 30
+#define DTSLOTDATALEN 15
+
+/*
+ * internal node entry head/only segment
+ */
+typedef struct {
+ pxd_t xd; /* 8: child extent descriptor */
+
+ s8 next; /* 1: */
+ u8 namlen; /* 1: */
+ wchar_t name[11]; /* 22: 2-byte aligned */
+} idtentry_t; /* (32) */
+
+#define DTIHDRSIZE 10
+#define DTIHDRDATALEN 11
+
+/* compute number of slots for entry */
+#define NDTINTERNAL(klen) ( ((4 + (klen)) + (15 - 1)) / 15 )
+
+
+/*
+ * leaf node entry head/only segment
+ *
+ * For legacy filesystems, name contains 13 wchars -- no index field
+ */
+typedef struct {
+ u32 inumber; /* 4: 4-byte aligned; inode number, kept little-endian */
+ s8 next; /* 1: slot index of next segment; -1 if this is the only one */
+ u8 namlen; /* 1: */
+ wchar_t name[11]; /* 22: 2-byte aligned (11 = DTLHDRDATALEN) */
+ u32 index; /* 4: index into dir_table */
+} ldtentry_t; /* (32) */
+
+#define DTLHDRSIZE 6
+#define DTLHDRDATALEN_LEGACY 13 /* Old (OS/2) format */
+#define DTLHDRDATALEN 11
+
+/*
+ * dir_table used for directory traversal during readdir
+ */
+
+/*
+ * Keep persistent index for directory entries
+ */
+#define DO_INDEX(INODE) (JFS_SBI((INODE)->i_sb)->mntflag & JFS_DIR_INDEX)
+
+/*
+ * Maximum entry in inline directory table
+ */
+#define MAX_INLINE_DIRTABLE_ENTRY 13
+
+typedef struct dir_table_slot {
+ u8 rsrvd; /* 1: */
+ u8 flag; /* 1: 0 if free (DIR_INDEX_FREE), else DIR_INDEX_VALID */
+ u8 slot; /* 1: slot within leaf page of entry */
+ u8 addr1; /* 1: upper 8 bits of leaf page address */
+ u32 addr2; /* 4: lower 32 bits of leaf page address -OR-
+ index of next entry when this entry was deleted;
+ DTSaddress()/addressDTS() keep it little-endian */
+} dir_table_slot_t; /* (8) */
+
+/*
+ * flag values
+ */
+#define DIR_INDEX_VALID 1
+#define DIR_INDEX_FREE 0
+
+/*
+ * store a leaf page address in a dir_table_slot: bits above 32 go to
+ * addr1, the low 32 bits go to addr2 in little-endian form.
+ * address64 is parenthesized before the cast so that an expression
+ * argument is converted as a whole.
+ */
+#define DTSaddress(dir_table_slot, address64)\
+{\
+ (dir_table_slot)->addr1 = ((u64)(address64)) >> 32;\
+ (dir_table_slot)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\
+}
+
+#define addressDTS(dts)\
+ ( ((s64)((dts)->addr1)) << 32 | __le32_to_cpu((dts)->addr2) )
+
+/* compute number of slots for entry */
+#define NDTLEAF_LEGACY(klen) ( ((2 + (klen)) + (15 - 1)) / 15 )
+#define NDTLEAF NDTINTERNAL
+
+
+/*
+ * directory root page (in-line in on-disk inode):
+ *
+ * cf. dtpage_t below.
+ */
+typedef union {
+ struct {
+ dasd_t DASD; /* 16: DASD limit/usage info F226941 */
+
+ u8 flag; /* 1: */
+ u8 nextindex; /* 1: next free entry in stbl */
+ s8 freecnt; /* 1: free count */
+ s8 freelist; /* 1: freelist header */
+
+ u32 idotdot; /* 4: parent inode number */
+
+ s8 stbl[8]; /* 8: sorted entry index table */
+ } header; /* (32) */
+
+ dtslot_t slot[9];
+} dtroot_t;
+
+#define PARENT(IP) \
+ (le32_to_cpu(JFS_IP(IP)->i_dtroot.header.idotdot))
+
+#define DTROOTMAXSLOT 9
+
+#define dtEmpty(IP) (JFS_IP(IP)->i_dtroot.header.nextindex == 0)
+
+
+/*
+ * directory regular page:
+ *
+ * entry slot array of 32 byte slot
+ *
+ * sorted entry slot index table (stbl):
+ * contiguous slots at slot specified by stblindex,
+ * 1-byte per entry
+ * 512 byte block: 16 entry tbl (1 slot)
+ * 1024 byte block: 32 entry tbl (1 slot)
+ * 2048 byte block: 64 entry tbl (2 slot)
+ * 4096 byte block: 128 entry tbl (4 slot)
+ *
+ * data area:
+ * 512 byte block: 16 - 2 = 14 slot
+ * 1024 byte block: 32 - 2 = 30 slot
+ * 2048 byte block: 64 - 3 = 61 slot
+ * 4096 byte block: 128 - 5 = 123 slot
+ *
+ * N.B. index is 0-based; index fields refer to slot index
+ * except nextindex which refers to entry index in stbl;
+ * end of entry slot list or freelist is marked with -1.
+ */
+typedef union {
+ struct {
+ s64 next; /* 8: next sibling */
+ s64 prev; /* 8: previous sibling */
+
+ u8 flag; /* 1: page type bits (tested against BT_ROOT, BT_INTERNAL, BT_LEAF) */
+ u8 nextindex; /* 1: next entry index in stbl */
+ s8 freecnt; /* 1: free slot count */
+ s8 freelist; /* 1: slot index of head of freelist */
+
+ u8 maxslot; /* 1: number of slots in page slot[] */
+ u8 stblindex; /* 1: slot index of start of stbl */
+ u8 rsrvd[2]; /* 2: */
+
+ pxd_t self; /* 8: self pxd */
+ } header; /* (32) */
+
+ dtslot_t slot[128]; /* 128 = DTPAGEMAXSLOT; the 32-byte header overlays slot[0] */
+} dtpage_t;
+
+#define DTPAGEMAXSLOT 128
+
+#define DT8THPGNODEBYTES 512
+#define DT8THPGNODETSLOTS 1
+#define DT8THPGNODESLOTS 16
+
+#define DTQTRPGNODEBYTES 1024
+#define DTQTRPGNODETSLOTS 1
+#define DTQTRPGNODESLOTS 32
+
+#define DTHALFPGNODEBYTES 2048
+#define DTHALFPGNODETSLOTS 2
+#define DTHALFPGNODESLOTS 64
+
+#define DTFULLPGNODEBYTES 4096
+#define DTFULLPGNODETSLOTS 4
+#define DTFULLPGNODESLOTS 128
+
+#define DTENTRYSTART 1
+
+/* get sorted entry table of the page */
+#define DT_GETSTBL(p) ( ((p)->header.flag & BT_ROOT) ?\
+ ((dtroot_t *)(p))->header.stbl : \
+ (s8 *)&(p)->slot[(p)->header.stblindex] )
+
+/*
+ * Flags for dtSearch
+ */
+#define JFS_CREATE 1
+#define JFS_LOOKUP 2
+#define JFS_REMOVE 3
+#define JFS_RENAME 4
+
+#define DIRENTSIZ(namlen) \
+ ( (sizeof(struct dirent) - 2*(JFS_NAME_MAX+1) + 2*((namlen)+1) + 3) &~ 3 )
+
+
+/*
+ * external declarations
+ */
+extern void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot);
+
+extern int dtSearch(struct inode *ip, component_t * key,
+ ino_t * data, btstack_t * btstack, int flag);
+
+extern int dtInsert(tid_t tid, struct inode *ip,
+ component_t * key, ino_t * ino, btstack_t * btstack);
+
+extern int dtDelete(tid_t tid,
+ struct inode *ip, component_t * key, ino_t * data, int flag);
+
+extern int dtRelocate(tid_t tid,
+ struct inode *ip, s64 lmxaddr, pxd_t * opxd, s64 nxaddr);
+
+extern int dtModify(tid_t tid, struct inode *ip,
+ component_t * key, ino_t * orig_ino, ino_t new_ino, int flag);
+
+extern int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir);
+
+#ifdef _JFS_DEBUG_DTREE
+extern int dtDisplayTree(struct inode *ip);
+
+extern int dtDisplayPage(struct inode *ip, s64 bn, dtpage_t * p);
+#endif /* _JFS_DEBUG_DTREE */
+
+#endif /* !_H_JFS_DTREE */
diff --git a/fs/jfs/jfs_extendfs.h b/fs/jfs/jfs_extendfs.h
new file mode 100644
index 000000000000..fb697a8d2a30
--- /dev/null
+++ b/fs/jfs/jfs_extendfs.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_EXTENDFS
+#define _H_JFS_EXTENDFS
+
+/*
+ * jfs_extendfs.h
+ */
+/*
+ * extendfs parameter list
+ */
+typedef struct {
+ u32 flag; /* 4: */
+ u8 dev; /* 1: */
+ u8 pad[3]; /* 3: */
+ s64 LVSize; /* 8: LV size in LV block */
+ s64 FSSize; /* 8: FS size in LV block */
+ s32 LogSize; /* 4: inlinelog size in LV block */
+} extendfs_t; /* (28) */
+
+/* plist flag */
+#define EXTENDFS_QUERY 0x00000001
+
+#endif /* _H_JFS_EXTENDFS */
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
new file mode 100644
index 000000000000..2c4a9f931f25
--- /dev/null
+++ b/fs/jfs/jfs_extent.c
@@ -0,0 +1,637 @@
+/*
+ *
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ *
+ *
+ * Module: jfs_extent.c:
+ */
+
+#include <linux/fs.h>
+#include "jfs_incore.h"
+#include "jfs_dmap.h"
+#include "jfs_extent.h"
+#include "jfs_debug.h"
+
+/*
+ * forward references
+ */
+static int extBalloc(struct inode *, s64, s64 *, s64 *);
+static int extBrealloc(struct inode *, s64, s64, s64 *, s64 *);
+int extRecord(struct inode *, xad_t *);
+static s64 extRoundDown(s64 nb);
+
+/*
+ * external references
+ */
+extern int dbExtend(struct inode *, s64, s64, s64);
+extern int jfs_commit_inode(struct inode *, int);
+
+
+/*
+ * debug print helpers
+ *
+ * Each value macro prints the text of its argument expression followed
+ * by the value. The expression text is produced with the # operator
+ * and passed through %s, so it is never interpreted as a format.
+ * DPL/DPL1 print a 64-bit value in hex with a single %Lx conversion.
+ * The "1" variants end with a space instead of a newline.
+ */
+#define DPD(a) (printk("%s: %d\n", #a, (a)))
+#define DPC(a) (printk("%s: %c\n", #a, (a)))
+#define DPL1(a) \
+{ \
+ printk("%s: %Lx ", #a, (unsigned long long) (a)); \
+}
+#define DPL(a) \
+{ \
+ printk("%s: %Lx\n", #a, (unsigned long long) (a)); \
+}
+
+#define DPD1(a) (printk("%s: %d ", #a, (a)))
+#define DPX(a) (printk("%s: %08x\n", #a, (a)))
+#define DPX1(a) (printk("%s: %08x ", #a, (a)))
+#define DPS(a) (printk("%s\n",(a)))
+#define DPE(a) (printk("\nENTERING: %s\n",(a)))
+#define DPE1(a) (printk("\nENTERING: %s",(a)))
+#define DPS1(a) (printk(" %s ",(a)))
+
+
+/*
+ * NAME: extAlloc()
+ *
+ * FUNCTION: allocate an extent for a specified page range within a
+ * file.
+ *
+ * PARAMETERS:
+ * ip - the inode of the file.
+ * xlen - requested extent length (clamped to MAXXLEN).
+ * pno - the starting page number within the file.
+ * xp - pointer to an xad. on entry, xad describes an
+ * extent that is used as an allocation hint if the
+ * xaddr of the xad is non-zero. on successful exit,
+ * the xad describes the newly allocated extent.
+ * abnr - boolean_t indicating whether the newly allocated extent
+ * should be marked as allocated but not recorded.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * EIO - i/o error.
+ * ENOSPC - insufficient disk resources.
+ */
+int
+extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr)
+{
+ struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+ s64 nxlen, nxaddr, xoff, hint, xaddr = 0;
+ int rc, nbperpage;
+ int xflag;
+
+ /* This blocks if we are low on resources */
+ txBeginAnon(ip->i_sb);
+
+ /* validate extent length */
+ if (xlen > MAXXLEN)
+ xlen = MAXXLEN;
+
+ /* get the number of blocks per page
+ * (assigned here but not read again in this function)
+ */
+ nbperpage = sbi->nbperpage;
+
+ /* get the page's starting extent offset */
+ xoff = pno << sbi->l2nbperpage;
+
+ /* check if an allocation hint was provided */
+ if ((hint = addressXAD(xp))) {
+ /* get the size of the extent described by the hint */
+ nxlen = lengthXAD(xp);
+
+ /* check if the hint is for the portion of the file
+ * immediately previous to the current allocation
+ * request and if hint extent has the same abnr
+ * value as the current request. if so, we can
+ * extend the hint extent to include the current
+ * extent if we can allocate the blocks immediately
+ * following the hint extent.
+ */
+ if (offsetXAD(xp) + nxlen == xoff &&
+ abnr == ((xp->flag & XAD_NOTRECORDED) ? TRUE : FALSE))
+ xaddr = hint + nxlen;
+
+ /* adjust the hint to the last block of the extent */
+ hint += (nxlen - 1);
+ }
+
+ /* allocate the disk blocks for the extent. initially, extBalloc()
+ * will try to allocate disk blocks for the requested size (xlen).
+ * if this fails (xlen contiguous free blocks not available), it'll
+ * try to allocate a smaller number of blocks (producing a smaller
+ * extent), with this smaller number of blocks consisting of the
+ * requested number of blocks rounded down to the next smaller
+ * power of 2 number (i.e. 16 -> 8). it'll continue to round down
+ * and retry the allocation until the number of blocks to allocate
+ * is smaller than the number of blocks per page.
+ */
+ nxlen = xlen;
+ if ((rc =
+ extBalloc(ip, hint ? hint : INOHINT(ip), &nxlen, &nxaddr))) {
+ return (rc);
+ }
+
+ /* determine the value of the extent flag */
+ xflag = (abnr == TRUE) ? XAD_NOTRECORDED : 0;
+
+ /* if we can extend the hint extent to cover the current request,
+ * extend it. otherwise, insert a new extent to
+ * cover the current request.
+ * (xaddr is non-zero only when the hint qualified above; the blocks
+ * must also have been allocated exactly at that address.)
+ */
+ if (xaddr && xaddr == nxaddr)
+ rc = xtExtend(0, ip, xoff, (int) nxlen, 0);
+ else
+ rc = xtInsert(0, ip, xflag, xoff, (int) nxlen, &nxaddr, 0);
+
+ /* if the extend or insert failed,
+ * free the newly allocated blocks and return the error.
+ */
+ if (rc) {
+ dbFree(ip, nxaddr, nxlen);
+ return (rc);
+ }
+
+ /* update the number of blocks allocated to the file */
+ ip->i_blocks += LBLK2PBLK(ip->i_sb, nxlen);
+
+ /* set the results of the extent allocation */
+ XADaddress(xp, nxaddr);
+ XADlength(xp, nxlen);
+ XADoffset(xp, xoff);
+ xp->flag = xflag;
+
+ mark_inode_dirty(ip);
+
+ /*
+ * COMMIT_Synclist flags an anonymous tlock on page that is on
+ * sync list.
+ * We need to commit the inode to get the page written to disk.
+ */
+ if (test_and_clear_cflag(COMMIT_Synclist,ip))
+ jfs_commit_inode(ip, 0);
+
+ return (0);
+}
+
+
+/*
+ * NAME: extRealloc()
+ *
+ * FUNCTION: extend the allocation of a file extent containing a
+ * partially backed last page.
+ *
+ * PARAMETERS:
+ * ip - the inode of the file.
+ * nxlen - requested size of the resulting extent (clamped to
+ * MAXXLEN).
+ * xp - pointer to an xad. on entry, the xad describes the
+ * extent to be extended. on successful exit, the xad
+ * describes the newly allocated extent.
+ * abnr - boolean_t indicating whether the newly allocated extent
+ * should be marked as allocated but not recorded.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * EIO - i/o error.
+ * ENOSPC - insufficient disk resources.
+ */
+int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, boolean_t abnr)
+{
+ struct super_block *sb = ip->i_sb;
+ s64 xaddr, xlen, nxaddr, delta, xoff;
+ s64 ntail, nextend, ninsert;
+ int rc, nbperpage = JFS_SBI(sb)->nbperpage;
+ int xflag;
+
+ /* This blocks if we are low on resources */
+ txBeginAnon(ip->i_sb);
+
+ /* validate extent length */
+ if (nxlen > MAXXLEN)
+ nxlen = MAXXLEN;
+
+ /* get the extend (partial) page's disk block address and
+ * number of blocks.
+ */
+ xaddr = addressXAD(xp);
+ xlen = lengthXAD(xp);
+ xoff = offsetXAD(xp);
+
+ /* if the extend page is abnr and if the request is for
+ * the extent to be allocated and recorded,
+ * make the page allocated and recorded.
+ */
+ if ((xp->flag & XAD_NOTRECORDED) && !abnr) {
+ xp->flag = 0;
+ if ((rc = xtUpdate(0, ip, xp)))
+ return (rc);
+ }
+
+ /* try to allocate the requested number of blocks for the
+ * extent. extBrealloc() first tries to satisfy the request
+ * by extending the allocation in place. otherwise, it will
+ * try to allocate a new set of blocks large enough for the
+ * request. in satisfying a request, extBrealloc() may allocate
+ * less than what was requested but will always allocate enough
+ * space as to satisfy the extend page.
+ */
+ if ((rc = extBrealloc(ip, xaddr, xlen, &nxlen, &nxaddr)))
+ return (rc);
+
+ delta = nxlen - xlen; /* number of blocks gained */
+
+ /* check if the extend page is not abnr but the request is abnr
+ * and the allocated disk space is for more than one page. if this
+ * is the case, there is a mismatch of abnr between the extend page
+ * and the one or more pages following the extend page. as a result,
+ * two extents will have to be manipulated. the first will be that
+ * of the extent of the extend page and will be manipulated thru
+ * an xtExtend() or an xtTailgate(), depending upon whether the
+ * disk allocation occurred as an inplace extension. the second
+ * extent will be manipulated (created) through an xtInsert() and
+ * will be for the pages following the extend page.
+ */
+ if (abnr && (!(xp->flag & XAD_NOTRECORDED)) && (nxlen > nbperpage)) {
+ ntail = nbperpage;
+ nextend = ntail - xlen;
+ ninsert = nxlen - nbperpage;
+
+ xflag = XAD_NOTRECORDED;
+ } else {
+ ntail = nxlen;
+ nextend = delta;
+ ninsert = 0;
+
+ xflag = xp->flag;
+ }
+
+ /* if we were able to extend the disk allocation in place,
+ * extend the extent. otherwise, move the extent to a
+ * new disk location.
+ */
+ if (xaddr == nxaddr) {
+ /* extend the extent; on failure give back only the added blocks */
+ if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) {
+ dbFree(ip, xaddr + xlen, delta);
+ return (rc);
+ }
+ } else {
+ /*
+ * move the extent to a new location:
+ *
+ * xtTailgate() accounts for relocated tail extent;
+ * on failure the whole new allocation is given back.
+ */
+ if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) {
+ dbFree(ip, nxaddr, nxlen);
+ return (rc);
+ }
+ }
+
+
+ /* check if we need to also insert a new extent */
+ if (ninsert) {
+ /* perform the insert. if it fails, free the blocks
+ * to be inserted and make it appear that we only did
+ * the xtExtend() or xtTailgate() above.
+ */
+ xaddr = nxaddr + ntail;
+ if (xtInsert (0, ip, xflag, xoff + ntail, (int) ninsert,
+ &xaddr, 0)) {
+ dbFree(ip, xaddr, (s64) ninsert);
+ delta = nextend;
+ nxlen = ntail;
+ xflag = 0;
+ }
+ }
+
+ /* update the inode with the number of blocks allocated */
+ ip->i_blocks += LBLK2PBLK(sb, delta);
+
+ /* set the return results */
+ XADaddress(xp, nxaddr);
+ XADlength(xp, nxlen);
+ XADoffset(xp, xoff);
+ xp->flag = xflag;
+
+ mark_inode_dirty(ip);
+
+ return (0);
+}
+
+
+/*
+ * NAME: extHint()
+ *
+ * FUNCTION: produce an extent allocation hint for a file offset.
+ *
+ * The hint is the extent backing the page that precedes the page
+ * containing offset. The xad address is set to 0 ("no hint") when
+ * offset lies in the first page of the file or when no extent
+ * exists for the previous page.
+ *
+ * PARAMETERS:
+ * ip - the inode of the file.
+ * offset - file offset for which the hint is needed.
+ * xp - pointer to the xad that is to be filled in with
+ * the hint.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * EIO - i/o error.
+ */
+int extHint(struct inode *ip, s64 offset, xad_t * xp)
+{
+ struct super_block *sb = ip->i_sb;
+ xadlist_t xadl;
+ lxdlist_t lxdl;
+ lxd_t lxd;
+ s64 prev;
+ int rc, nbperpage = JFS_SBI(sb)->nbperpage;
+
+ /* init the hint as "no hint provided" */
+ XADaddress(xp, 0);
+
+ /* determine the starting extent offset of the page previous
+ * to the page containing the offset.
+ */
+ prev = ((offset & ~POFFSET) >> JFS_SBI(sb)->l2bsize) - nbperpage;
+
+ /* if the offset is in the first page of the file,
+ * no hint provided.
+ */
+ if (prev < 0)
+ return (0);
+
+ /* prepare to lookup the previous page's extent info */
+ lxdl.maxnlxd = 1;
+ lxdl.nlxd = 1;
+ lxdl.lxd = &lxd;
+ LXDoffset(&lxd, prev);
+ LXDlength(&lxd, nbperpage);
+
+ xadl.maxnxad = 1;
+ xadl.nxad = 0;
+ xadl.xad = xp;
+
+ /* perform the lookup */
+ if ((rc = xtLookupList(ip, &lxdl, &xadl, 0)))
+ return (rc);
+
+ /* check if no extent exists for the previous page.
+ * this is possible for sparse files.
+ */
+ if (xadl.nxad == 0) {
+// assert(ISSPARSE(ip));
+ return (0);
+ }
+
+ /* only preserve the abnr flag within the xad flags
+ * of the returned hint.
+ */
+ xp->flag &= XAD_NOTRECORDED;
+
+ assert(xadl.nxad == 1);
+ assert(lengthXAD(xp) == nbperpage);
+
+ return (0);
+}
+
+
+/*
+ * NAME: extRecord()
+ *
+ * FUNCTION: mark a file page's extent as recorded by pushing the
+ * caller's xad into the file's xtree.
+ *
+ * PARAMETERS:
+ * ip - inode of the file.
+ * xp - xad describing the extent to update.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * EIO - i/o error.
+ * ENOSPC - insufficient disk resources.
+ */
+int extRecord(struct inode *ip, xad_t * xp)
+{
+ int err;
+
+ txBeginAnon(ip->i_sb);
+
+ /* write the xad through to the xtree; stop on any failure */
+ err = xtUpdate(0, ip, xp);
+ if (err != 0)
+ return err;
+
+#ifdef _STILL_TO_PORT
+ /* the cached page is recorded now and has been changed */
+ cp->cm_abnr = FALSE;
+ cp->cm_modified = TRUE;
+#endif /* _STILL_TO_PORT */
+
+ return 0;
+}
+
+
+/*
+ * NAME: extFill()
+ *
+ * FUNCTION: allocate disk space for a file page that represents
+ * a file hole.
+ *
+ * PARAMETERS:
+ * ip - the inode of the file.
+ * xp - xad of the hole. on entry only its offset is used; its
+ * address is cleared so that no allocation hint is given.
+ * on successful exit it describes the new extent.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * EIO - i/o error.
+ * ENOSPC - insufficient disk resources.
+ */
+int extFill(struct inode *ip, xad_t * xp)
+{
+ struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+ int rc, nbperpage = sbi->nbperpage;
+ /* extAlloc() takes a page number and converts it back with
+ * "pno << l2nbperpage", so the block offset held in the xad is
+ * scaled down by the same shift count here.
+ */
+ s64 pno = offsetXAD(xp) >> sbi->l2nbperpage;
+
+// assert(ISSPARSE(ip));
+
+ /* initialize the extent allocation hint */
+ XADaddress(xp, 0);
+
+ /* allocate an extent to fill the hole */
+ if ((rc = extAlloc(ip, nbperpage, pno, xp, FALSE)))
+ return (rc);
+
+ assert(lengthXAD(xp) == nbperpage);
+
+ return (0);
+}
+
+
+/*
+ * NAME: extBalloc()
+ *
+ * FUNCTION: allocate disk blocks to form an extent.
+ *
+ * initially, we will try to allocate disk blocks for the
+ * requested size (nblocks). if this fails (nblocks
+ * contiguous free blocks not available), we'll try to allocate
+ * a smaller number of blocks (producing a smaller extent), with
+ * this smaller number of blocks consisting of the requested
+ * number of blocks rounded down to the next smaller power of 2
+ * number (i.e. 16 -> 8). we'll continue to round down and
+ * retry the allocation until the number of blocks to allocate
+ * is smaller than the number of blocks per page.
+ *
+ * PARAMETERS:
+ * ip - the inode of the file.
+ * hint - disk block number to be used as an allocation hint.
+ * *nblocks - pointer to an s64 value. on entry, this value specifies
+ * the desired number of block to be allocated. on successful
+ * exit, this value is set to the number of blocks actually
+ * allocated.
+ * blkno - pointer to a block address that is filled in on successful
+ * return with the starting block number of the newly
+ * allocated block range.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * EIO - i/o error.
+ * ENOSPC - insufficient disk resources.
+ */
+static int
+extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
+{
+ s64 nb, nblks, daddr, max;
+ int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage;
+ bmap_t *mp = JFS_SBI(ip->i_sb)->bmap;
+
+ /* db_maxfreebud is used as a shift count below, so a negative
+ * value must never reach the shift. it is presumably the marker
+ * for "no free blocks in the map" (confirm against jfs_dmap);
+ * report that as out of space.
+ */
+ if (mp->db_maxfreebud < 0)
+ return (ENOSPC);
+
+ /* get the number of blocks to initially attempt to allocate.
+ * we'll first try the number of blocks requested unless this
+ * number is greater than the maximum number of contiguous free
+ * blocks in the map. in that case, we'll start off with the
+ * maximum free.
+ */
+ max = (s64) 1 << mp->db_maxfreebud;
+ if (*nblocks >= max && *nblocks > nbperpage)
+ nb = nblks = (max > nbperpage) ? max : nbperpage;
+ else
+ nb = nblks = *nblocks;
+
+ /* try to allocate blocks */
+ while ((rc = dbAlloc(ip, hint, nb, &daddr))) {
+ /* if something other than an out of space error,
+ * stop and return this error.
+ */
+ if (rc != ENOSPC)
+ return (rc);
+
+ /* decrease the allocation request size */
+ nb = min(nblks, extRoundDown(nb));
+
+ /* give up if we cannot cover a page */
+ if (nb < nbperpage)
+ return (rc);
+ }
+
+ *nblocks = nb;
+ *blkno = daddr;
+
+ return (0);
+}
+
+
+/*
+ * NAME: extBrealloc()
+ *
+ * FUNCTION: grow an extent's block allocation.
+ *
+ * the extent is first extended where it already lives
+ * (dbExtend()). only when that reports ENOSPC is a fresh
+ * block range requested from extBalloc(), using the current
+ * starting block as the allocation hint; extBalloc() may
+ * then settle for fewer blocks than requested.
+ *
+ * PARAMETERS:
+ * ip - the inode of the file.
+ * blkno - first block of the extent's current allocation.
+ * nblks - size in blocks of the extent's current allocation.
+ * newnblks - in: desired new extent size in blocks.
+ * out: updated by extBalloc() when the extent is moved;
+ * left untouched when it is extended in place.
+ * newblkno - out: first block of the extent's new allocation.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * EIO - i/o error.
+ * ENOSPC - insufficient disk resources.
+ */
+static int
+extBrealloc(struct inode *ip,
+ s64 blkno, s64 nblks, s64 * newnblks, s64 * newblkno)
+{
+ int err = dbExtend(ip, blkno, nblks, *newnblks - nblks);
+
+ /* no room behind the extent: move it to a new set of blocks */
+ if (err == ENOSPC)
+ return extBalloc(ip, blkno, newnblks, newblkno);
+
+ /* grown in place: the starting block is unchanged */
+ if (err == 0)
+ *newblkno = blkno;
+
+ /* 0 on success, otherwise the dbExtend() error */
+ return err;
+}
+
+
+/*
+ * NAME: extRoundDown()
+ *
+ * FUNCTION: round down a specified number of blocks to the next
+ * smallest power of 2 number.
+ *
+ * a value that is not a power of 2 is rounded down to the
+ * power of 2 of its highest set bit (17 -> 16); a value that
+ * already is a power of 2 is halved (16 -> 8, 1 -> 0).
+ *
+ * PARAMETERS:
+ * nb - number of blocks to round down.
+ *
+ * RETURN VALUES:
+ * next smallest power of 2 number; 0 if nb is not positive.
+ */
+static s64 extRoundDown(s64 nb)
+{
+ int i;
+ u64 m, k;
+
+ /* nothing smaller to offer; this also keeps the shift count below
+ * from going negative when no bit is set.
+ */
+ if (nb <= 0)
+ return (0);
+
+ /* locate the highest set bit of nb */
+ for (i = 0, m = (u64) 1 << 63; i < 64; i++, m >>= 1) {
+ if (m & nb)
+ break;
+ }
+
+ i = 63 - i;
+ k = (u64) 1 << i;
+ /* lower bits set: k is already smaller than nb; otherwise halve */
+ k = ((k - 1) & nb) ? k : k >> 1;
+
+ return (k);
+}
diff --git a/fs/jfs/jfs_extent.h b/fs/jfs/jfs_extent.h
new file mode 100644
index 000000000000..e2284c5992cf
--- /dev/null
+++ b/fs/jfs/jfs_extent.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_EXTENT
+#define _H_JFS_EXTENT
+
+/*
+ * get block allocation hint as location of disk inode:
+ * evaluates to the last block of the inode's ixpxd extent
+ * (extent start address + extent length - 1).
+ */
+#define INOHINT(ip) \
+ (addressPXD(&(JFS_IP(ip)->ixpxd)) + lengthPXD(&(JFS_IP(ip)->ixpxd)) - 1)
+
+/* extent routines declared for use outside the defining file */
+extern int extAlloc(struct inode *, s64, s64, xad_t *, boolean_t);
+extern int extFill(struct inode *, xad_t *);
+extern int extHint(struct inode *, s64, xad_t *);
+extern int extRealloc(struct inode *, s64, xad_t *, boolean_t);
+extern int extRecord(struct inode *, xad_t *);
+
+#endif /* _H_JFS_EXTENT */
diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h
new file mode 100644
index 000000000000..8c82a5c171b7
--- /dev/null
+++ b/fs/jfs/jfs_filsys.h
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+*/
+
+#ifndef _H_JFS_FILSYS
+#define _H_JFS_FILSYS
+
+/*
+ * jfs_filsys.h
+ *
+ * file system (implementation-dependent) constants
+ *
+ * refer to <limits.h> for system wide implementation-dependent constants
+ */
+
+/*
+ * file system option (superblock flag)
+ *
+ * each option below is a distinct bit of one 32-bit flag word, so
+ * options can be combined with bitwise OR; JFS_COMMIT is a mask
+ * covering several bits rather than an option of its own.
+ */
+/* platform option (conditional compilation) */
+#define JFS_AIX 0x80000000 /* AIX support */
+/* POSIX name/directory support */
+
+#define JFS_OS2 0x40000000 /* OS/2 support */
+/* case-insensitive name/directory support */
+
+#define JFS_DFS 0x20000000 /* DCE DFS LFS support */
+
+#define JFS_LINUX 0x10000000 /* Linux support */
+/* case-sensitive name/directory support */
+
+/* directory option */
+#define JFS_UNICODE 0x00000001 /* unicode name */
+
+/* commit option */
+#define JFS_COMMIT 0x00000f00 /* commit option mask */
+#define JFS_GROUPCOMMIT 0x00000100 /* group (of 1) commit */
+#define JFS_LAZYCOMMIT 0x00000200 /* lazy commit */
+#define JFS_TMPFS 0x00000400 /* temporary file system -
+ * do not log/commit:
+ */
+
+/*
+ * log logical volume option
+ *
+ * NOTE(review): JFS_INLINELOG (0x800) falls inside the JFS_COMMIT
+ * mask (0xf00) even though it is listed as a log option - confirm
+ * that code masking with JFS_COMMIT expects this.
+ */
+#define JFS_INLINELOG 0x00000800 /* inline log within file system */
+#define JFS_INLINEMOVE 0x00001000 /* inline log being moved */
+
+/* Secondary aggregate inode table */
+#define JFS_BAD_SAIT 0x00010000 /* current secondary ait is bad */
+
+/* sparse regular file support */
+#define JFS_SPARSE 0x00020000 /* sparse regular file */
+
+/* DASD Limits F226941 */
+#define JFS_DASD_ENABLED 0x00040000 /* DASD limits enabled */
+#define JFS_DASD_PRIME 0x00080000 /* Prime DASD usage on boot */
+
+/* big endian flag */
+#define JFS_SWAP_BYTES 0x00100000 /* running on big endian computer */
+
+/* Directory index */
+#define JFS_DIR_INDEX 0x00200000 /* Persistent index for */
+ /* directory entries */
+
+
+
+/*
+ * buffer cache configuration
+ */
+/* page size: any earlier definition of PSIZE is discarded first */
+#ifdef PSIZE
+#undef PSIZE
+#endif
+#define PSIZE 4096 /* page size (in byte) */
+#define L2PSIZE 12 /* log2(PSIZE) */
+#define POFFSET 4095 /* offset within page (PSIZE - 1, usable as a mask) */
+
+/* buffer page size */
+#define BPSIZE PSIZE
+
+/*
+ * fs fundamental size
+ *
+ * PSIZE >= file system block size >= PBSIZE >= DISIZE
+ */
+#define PBSIZE 512 /* physical block size (in byte) */
+#define L2PBSIZE 9 /* log2(PBSIZE) */
+
+#define DISIZE 512 /* on-disk inode size (in byte) */
+#define L2DISIZE 9 /* log2(DISIZE) */
+
+#define IDATASIZE 256 /* inode inline data size */
+#define IXATTRSIZE 128 /* inode inline extended attribute size */
+
+/* same values as PSIZE / L2PSIZE */
+#define XTPAGE_SIZE 4096
+#define log2_PAGESIZE 12
+
+/* inode allocation group (iag) and inode extent geometry */
+#define IAG_SIZE 4096
+#define IAG_EXTENT_SIZE 4096
+#define INOSPERIAG 4096 /* number of disk inodes per iag */
+#define L2INOSPERIAG 12 /* l2 number of disk inodes per iag */
+#define INOSPEREXT 32 /* number of disk inode per extent */
+#define L2INOSPEREXT 5 /* l2 number of disk inode per extent */
+#define IXSIZE (DISIZE * INOSPEREXT) /* inode extent size (512 * 32 = 16K) */
+#define INOSPERPAGE 8 /* number of disk inodes per 4K page (PSIZE / DISIZE) */
+#define L2INOSPERPAGE 3 /* log2(INOSPERPAGE) */
+
+/* NOTE(review): presumably the low-water mark of the iag free list - confirm at its users */
+#define IAGFREELIST_LWM 64
+
+/* alternate names for the inode extent constants above */
+#define INODE_EXTENT_SIZE IXSIZE /* inode extent size */
+#define NUM_INODE_PER_EXTENT INOSPEREXT
+#define NUM_INODE_PER_IAG INOSPERIAG
+
+/* file system block size limits (in byte) and maximum file size (2^52) */
+#define MINBLOCKSIZE 512
+#define MAXBLOCKSIZE 4096
+#define MAXFILESIZE ((s64)1 << 52)
+
+#define JFS_LINK_MAX 65535 /* nlink_t is unsigned short */
+
+/*
+ * Minimum number of bytes supported for a JFS partition
+ * (0x1000000 = 16M; MINJFSTEXT is presumably the same limit in
+ * megabytes for use in messages - confirm at its users)
+ */
+#define MINJFS (0x1000000)
+#define MINJFSTEXT "16"
+
+/*
+ * file system block size -> physical block size
+ *
+ * every macro argument is parenthesized in the expansion so that an
+ * argument which is itself an expression (e.g. a cast or a pointer
+ * computed inline) expands with the intended precedence.
+ */
+/* byte offset within a physical block */
+#define LBOFFSET(x) ((x) & (PBSIZE - 1))
+/* byte offset -> physical block number */
+#define LBNUMBER(x) ((x) >> L2PBSIZE)
+/* file system block number -> physical block number, per sb's block size */
+#define LBLK2PBLK(sb,b) ((b) << ((sb)->s_blocksize_bits - L2PBSIZE))
+/* physical block number -> file system block number, per sb's block size */
+#define PBLK2LBLK(sb,b) ((b) >> ((sb)->s_blocksize_bits - L2PBSIZE))
+/* size in byte -> last page number */
+#define SIZE2PN(size) ( ((s64)((size) - 1)) >> (L2PSIZE) )
+/* size in byte -> last file system block number */
+#define SIZE2BN(size, l2bsize) ( ((s64)((size) - 1)) >> (l2bsize) )
+
+/*
+ * fixed physical block address (physical block size = 512 byte)
+ *
+ * NOTE: since we can't guarantee a physical block size of 512 bytes the use of
+ * these macros should be removed and the byte offset macros used instead.
+ *
+ * with 512-byte blocks each value below matches the corresponding
+ * *_OFF byte offset further down (e.g. 64 * 512 = 0x8000).
+ */
+#define SUPER1_B 64 /* primary superblock */
+#define AIMAP_B (SUPER1_B + 8) /* 1st extent of aggregate inode map (block 72) */
+#define AITBL_B (AIMAP_B + 16) /*
+ * 1st extent of aggregate inode table (block 88)
+ */
+#define SUPER2_B (AITBL_B + 32) /* 2ndary superblock pbn (block 120) */
+#define BMAP_B (SUPER2_B + 8) /* block allocation map (block 128) */
+
+/*
+ * SIZE_OF_SUPER defines the total amount of space reserved on disk for the
+ * superblock. This is not the same as the superblock structure, since all of
+ * this space is not currently being used.
+ */
+#define SIZE_OF_SUPER PSIZE
+
+/*
+ * SIZE_OF_AG_TABLE defines the amount of space reserved to hold the AG table
+ */
+#define SIZE_OF_AG_TABLE PSIZE
+
+/*
+ * SIZE_OF_MAP_PAGE defines the amount of disk space reserved for each page of
+ * the inode allocation map (to hold iag)
+ */
+#define SIZE_OF_MAP_PAGE PSIZE
+
+/*
+ * fixed byte offset address
+ *
+ * resulting layout: superblock at 32K, inode map control page at 36K,
+ * aggregate inode table at 44K, secondary superblock at 60K and block
+ * allocation map at 64K.
+ */
+#define SUPER1_OFF 0x8000 /* primary superblock */
+#define AIMAP_OFF (SUPER1_OFF + SIZE_OF_SUPER)
+ /*
+ * Control page of aggregate inode map
+ * followed by 1st extent of map
+ */
+#define AITBL_OFF (AIMAP_OFF + (SIZE_OF_MAP_PAGE << 1))
+ /*
+ * 1st extent of aggregate inode table
+ */
+#define SUPER2_OFF (AITBL_OFF + INODE_EXTENT_SIZE)
+ /*
+ * secondary superblock
+ */
+#define BMAP_OFF (SUPER2_OFF + SIZE_OF_SUPER)
+ /*
+ * block allocation map
+ */
+
+/*
+ * The following macro is used to indicate the number of reserved disk blocks at
+ * the front of an aggregate, in terms of physical blocks. This value is
+ * currently defined to be 32K. This turns out to be the same as the primary
+ * superblock's address, since it directly follows the reserved blocks.
+ */
+#define AGGR_RSVD_BLOCKS SUPER1_B
+
+/*
+ * The following macro is used to indicate the number of reserved bytes at the
+ * front of an aggregate. This value is currently defined to be 32K. This
+ * turns out to be the same as the primary superblock's byte offset, since it
+ * directly follows the reserved blocks.
+ */
+#define AGGR_RSVD_BYTES SUPER1_OFF
+
+/*
+ * The following macro defines the byte offset for the first inode extent in
+ * the aggregate inode table. This allows us to find the self inode to find the
+ * rest of the table. Currently this value is 44K.
+ */
+#define AGGR_INODE_TABLE_START AITBL_OFF
+
+/*
+ * fixed reserved inode number
+ *
+ * aggregate inodes and fileset inodes are numbered independently, so
+ * the same small numbers appear in both groups below.
+ */
+/* aggregate inode */
+#define AGGR_RESERVED_I 0 /* aggregate inode (reserved) */
+#define AGGREGATE_I 1 /* aggregate inode map inode */
+#define BMAP_I 2 /* aggregate block allocation map inode */
+#define LOG_I 3 /* aggregate inline log inode */
+#define BADBLOCK_I 4 /* aggregate bad block inode */
+#define FILESYSTEM_I 16 /* 1st/only fileset inode in ait:
+ * fileset inode map inode
+ */
+
+/* per fileset inode */
+#define FILESET_RSVD_I 0 /* fileset inode (reserved) */
+#define FILESET_EXT_I 1 /* fileset inode extension */
+#define ROOT_I 2 /* fileset root inode */
+#define ACL_I 3 /* fileset ACL inode */
+
+#define FILESET_OBJECT_I 4 /* the first fileset inode available for a file
+ * or directory or link...
+ */
+#define FIRST_FILESET_INO 16 /* the first aggregate inode which describes
+ * an inode. (To fsck this is also the first
+ * inode in part 2 of the agg inode table.)
+ * Same value as FILESYSTEM_I.
+ */
+
+/*
+ * directory configuration
+ *
+ * JFS_PATH_MAX resolves to BPSIZE, i.e. one 4096-byte page.
+ */
+#define JFS_NAME_MAX 255
+#define JFS_PATH_MAX BPSIZE
+
+
+/*
+ * file system state (superblock state)
+ *
+ * FM_CLEAN is the all-zero value; the remaining states are single bits.
+ */
+#define FM_CLEAN 0x00000000 /* file system is unmounted and clean */
+#define FM_MOUNT 0x00000001 /* file system is mounted cleanly */
+#define FM_DIRTY 0x00000002 /* file system was not unmounted and clean
+ * when mounted or
+ * commit failure occurred while being mounted:
+ * fsck() must be run to repair
+ */
+#define FM_LOGREDO 0x00000004 /* log based recovery (logredo()) failed:
+ * fsck() must be run to repair
+ */
+#define FM_EXTENDFS 0x00000008 /* file system extendfs() in progress */
+
+#endif /* _H_JFS_FILSYS */
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
new file mode 100644
index 000000000000..ada3818a7d85
--- /dev/null
+++ b/fs/jfs/jfs_imap.c
@@ -0,0 +1,3212 @@
+/*
+
+ *
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+/*
+ * Change History :
+ *
+ */
+
+/*
+ * jfs_imap.c: inode allocation map manager
+ *
+ * Serialization:
+ * Each AG has a simple lock which is used to control the serialization of
+ * the AG level lists. This lock should be taken first whenever an AG
+ * level list will be modified or accessed.
+ *
+ * Each IAG is locked by obtaining the buffer for the IAG page.
+ *
+ * There is also an inode lock for the inode map inode. A read lock needs to
+ * be taken whenever an IAG is read from the map or the global level
+ * information is read. A write lock needs to be taken whenever the global
+ * level information is modified or an atomic operation needs to be used.
+ *
+ * If more than one IAG is read at one time, the read lock may not
+ * be given up until all of the IAG's are read. Otherwise, a deadlock
+ * may occur when trying to obtain the read lock while another thread
+ * holding the read lock is waiting on the IAG already being held.
+ *
+ * The control page of the inode map is read into memory by diMount().
+ * Thereafter it should only be modified in memory and then it will be
+ * written out when the filesystem is unmounted by diUnmount().
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/locks.h>
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_dinode.h"
+#include "jfs_dmap.h"
+#include "jfs_imap.h"
+#include "jfs_metapage.h"
+#include "jfs_superblock.h"
+#include "jfs_debug.h"
+
+/*
+ * imap locks
+ *
+ * each lock is a semaphore embedded in the imap_t: *_INIT sets it up
+ * with init_MUTEX(), *_LOCK acquires it with down() and *_UNLOCK
+ * releases it with up().  macro arguments are parenthesized in the
+ * expansions so that an argument which is an expression still
+ * expands with the intended precedence.
+ */
+/* iag free list lock */
+#define IAGFREE_LOCK_INIT(imap) init_MUTEX(&(imap)->im_freelock)
+#define IAGFREE_LOCK(imap) down(&(imap)->im_freelock)
+#define IAGFREE_UNLOCK(imap) up(&(imap)->im_freelock)
+
+/* per ag iag list locks: one lock per allocation group index */
+#define AG_LOCK_INIT(imap,index) init_MUTEX(&((imap)->im_aglock[(index)]))
+#define AG_LOCK(imap,agno) down(&(imap)->im_aglock[(agno)])
+#define AG_UNLOCK(imap,agno) up(&(imap)->im_aglock[(agno)])
+
+/*
+ * external references
+ *
+ * jfs_aops is defined outside this file; diReadSpecial() installs it
+ * as the address space operations of the special inodes it creates.
+ */
+extern struct address_space_operations jfs_aops;
+
+/*
+ * forward references
+ *
+ * prototypes for file-local helpers, declared up front so they can be
+ * called before their definitions appear.
+ */
+static int diAllocAG(imap_t *, int, boolean_t, struct inode *);
+static int diAllocAny(imap_t *, int, boolean_t, struct inode *);
+static int diAllocBit(imap_t *, iag_t *, int);
+static int diAllocExt(imap_t *, int, struct inode *);
+static int diAllocIno(imap_t *, int, struct inode *);
+static int diFindFree(u32, int);
+static int diNewExt(imap_t *, iag_t *, int);
+static int diNewIAG(imap_t *, int *, int, metapage_t **);
+static void duplicateIXtree(struct super_block *, s64, int, s64 *);
+
+/* diIAGRead() reads an iag for diRead(); the copy_* helpers move data
+ * between an on-disk dinode_t and an in-memory inode.
+ */
+static int diIAGRead(imap_t * imap, int, metapage_t **);
+static int copy_from_dinode(dinode_t *, struct inode *);
+static void copy_to_dinode(dinode_t *, struct inode *);
+
+/*
+ * debug code for double-checking inode map
+ *
+ * the DBG_DI* hooks forward to the DBGdi*() routines only when
+ * _JFS_DEBUG_IMAP is defined; otherwise they expand to nothing.
+ * the define below is commented out, so the hooks are disabled
+ * unless the symbol is supplied some other way.
+ */
+/* #define _JFS_DEBUG_IMAP 1 */
+
+#ifdef _JFS_DEBUG_IMAP
+#define DBG_DIINIT(imap) DBGdiInit(imap)
+#define DBG_DIALLOC(imap, ino) DBGdiAlloc(imap, ino)
+#define DBG_DIFREE(imap, ino) DBGdiFree(imap, ino)
+
+static void *DBGdiInit(imap_t * imap);
+static void DBGdiAlloc(imap_t * imap, ino_t ino);
+static void DBGdiFree(imap_t * imap, ino_t ino);
+#else
+/* debugging disabled: the hooks compile away */
+#define DBG_DIINIT(imap)
+#define DBG_DIALLOC(imap, ino)
+#define DBG_DIFREE(imap, ino)
+#endif /* _JFS_DEBUG_IMAP */
+
+/*
+ * NAME:	diMount()
+ *
+ * FUNCTION:	build the in-memory inode map control structure (imap_t)
+ *		for a fileset or aggregate at mount time.
+ *
+ *		the page holding the on-disk control structure
+ *		(dinomap_t) is read, every little-endian field is
+ *		converted to cpu byte order and stored in a newly
+ *		allocated imap_t, the imap locks are initialized, and
+ *		the imap_t and the inode map inode are pointed at each
+ *		other.
+ *
+ * PARAMETERS:
+ *	ipimap	- pointer to inode map inode for the aggregate or fileset.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	ENOMEM	- insufficient free virtual memory.
+ *	EIO	- i/o error.
+ */
+int diMount(struct inode *ipimap)
+{
+	imap_t *imap;
+	metapage_t *mp;
+	dinomap_t *disk_map;
+	int agno;
+
+	/* get memory for the in-memory control structure */
+	imap = (imap_t *) kmalloc(sizeof(imap_t), GFP_KERNEL);
+	if (!imap) {
+		jERROR(1, ("diMount: kmalloc returned NULL!\n"));
+		return ENOMEM;
+	}
+
+	/* bring in the page that holds the on-disk control structure */
+	mp = read_metapage(ipimap,
+			   IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage,
+			   PSIZE, 0);
+	if (!mp) {
+		kfree(imap);
+		return EIO;
+	}
+	disk_map = (dinomap_t *) mp->data;
+
+	/* map-wide fields */
+	imap->im_freeiag = le32_to_cpu(disk_map->in_freeiag);
+	imap->im_nextiag = le32_to_cpu(disk_map->in_nextiag);
+	atomic_set(&imap->im_numinos, le32_to_cpu(disk_map->in_numinos));
+	atomic_set(&imap->im_numfree, le32_to_cpu(disk_map->in_numfree));
+	imap->im_nbperiext = le32_to_cpu(disk_map->in_nbperiext);
+	imap->im_l2nbperiext = le32_to_cpu(disk_map->in_l2nbperiext);
+
+	/* per allocation group fields */
+	for (agno = 0; agno < MAXAG; agno++) {
+		imap->im_agctl[agno].inofree =
+		    le32_to_cpu(disk_map->in_agctl[agno].inofree);
+		imap->im_agctl[agno].extfree =
+		    le32_to_cpu(disk_map->in_agctl[agno].extfree);
+		imap->im_agctl[agno].numinos =
+		    le32_to_cpu(disk_map->in_agctl[agno].numinos);
+		imap->im_agctl[agno].numfree =
+		    le32_to_cpu(disk_map->in_agctl[agno].numfree);
+	}
+
+	/* everything has been copied; the page can go */
+	release_metapage(mp);
+
+	/* iag free list lock, then one list lock per allocation group */
+	IAGFREE_LOCK_INIT(imap);
+	for (agno = 0; agno < MAXAG; agno++)
+		AG_LOCK_INIT(imap, agno);
+
+	/* tie the control structure and the inode map inode together */
+	imap->im_ipimap = ipimap;
+	JFS_IP(ipimap)->i_imap = imap;
+
+	return 0;
+}
+
+
+/*
+ * NAME:	diUnmount()
+ *
+ * FUNCTION:	tear down the incore inode map control structure of a
+ *		fileset or aggregate at unmount time, writing it to
+ *		disk first when that is appropriate.
+ *
+ * PARAMETERS:
+ *	ipimap		- pointer to inode map inode for the aggregate
+ *			  or fileset.
+ *	mounterror	- non-zero suppresses the write to disk.
+ *
+ * RETURN VALUES:
+ *	0	- always; the result of diSync() is not examined.
+ */
+int diUnmount(struct inode *ipimap, int mounterror)
+{
+	imap_t *imap = JFS_IP(ipimap)->i_imap;
+
+	/*
+	 * bring the on-disk control structure up to date, but only
+	 * for a writable map that is not being torn down because of
+	 * a mount error
+	 */
+	if (!mounterror && !isReadOnly(ipimap))
+		diSync(ipimap);
+
+	/* drop every cached page of the map */
+	truncate_inode_pages(ipimap->i_mapping, 0);
+
+	/* the in-memory control structure is no longer needed */
+	kfree(imap);
+
+	return 0;
+}
+
+
+/*
+ * NAME:	diSync()
+ *
+ * FUNCTION:	copy the in-memory inode map control structure into its
+ *		on-disk control page (converting each field to
+ *		little-endian), write that page, flush the map's dirty
+ *		data buffers and write the map inode itself through
+ *		diWriteSpecial().
+ *
+ * PARAMETERS:
+ *	ipimap	- pointer to inode map inode for the aggregate or fileset.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	EIO	- the control page could not be obtained.
+ */
+int diSync(struct inode *ipimap)
+{
+	imap_t *imap = JFS_IP(ipimap)->i_imap;
+	dinomap_t *disk_map;
+	metapage_t *mp;
+	int agno;
+
+	/* get hold of the imap global control page */
+	mp = get_metapage(ipimap,
+			  IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage,
+			  PSIZE, 0);
+	if (!mp) {
+		jERROR(1,("diSync: get_metapage failed!\n"));
+		return EIO;
+	}
+	disk_map = (dinomap_t *) mp->data;
+
+	/* map-wide fields, in-memory -> on-disk */
+	disk_map->in_freeiag = cpu_to_le32(imap->im_freeiag);
+	disk_map->in_nextiag = cpu_to_le32(imap->im_nextiag);
+	disk_map->in_numinos = cpu_to_le32(atomic_read(&imap->im_numinos));
+	disk_map->in_numfree = cpu_to_le32(atomic_read(&imap->im_numfree));
+	disk_map->in_nbperiext = cpu_to_le32(imap->im_nbperiext);
+	disk_map->in_l2nbperiext = cpu_to_le32(imap->im_l2nbperiext);
+
+	/* per allocation group fields */
+	for (agno = 0; agno < MAXAG; agno++) {
+		disk_map->in_agctl[agno].inofree =
+		    cpu_to_le32(imap->im_agctl[agno].inofree);
+		disk_map->in_agctl[agno].extfree =
+		    cpu_to_le32(imap->im_agctl[agno].extfree);
+		disk_map->in_agctl[agno].numinos =
+		    cpu_to_le32(imap->im_agctl[agno].numinos);
+		disk_map->in_agctl[agno].numfree =
+		    cpu_to_le32(imap->im_agctl[agno].numfree);
+	}
+
+	/* hand the updated control page back for writing */
+	write_metapage(mp);
+
+	/* push out the dirty pages of the map ... */
+	fsync_inode_data_buffers(ipimap);
+
+	/* ... and the map inode itself */
+	diWriteSpecial(ipimap);
+
+	return 0;
+}
+
+
+/*
+ * NAME:	diRead()
+ *
+ * FUNCTION:	initialize an incore inode from disk.
+ *
+ *		on entry, the specified incore inode must already carry
+ *		the number of the disk inode it stands for (ip->i_ino).
+ *
+ *		the iag describing the disk inode is read through the
+ *		fileset inode map to find the inode extent that holds
+ *		it.  the page of that extent which contains the disk
+ *		inode is then read, the disk inode is sanity checked
+ *		(matching inode number, non-zero link count) and copied
+ *		into the incore inode.
+ *
+ * PARAMETERS:
+ *	ip	- pointer to incore inode to be initialized from disk.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	EIO	- i/o error, or the disk inode carries another number.
+ *	ESTALE	- bad inode extent descriptor or zero link count.
+ *	other	- whatever diIAGRead() or copy_from_dinode() reported.
+ */
+int diRead(struct inode *ip)
+{
+	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+	struct inode *ipimap = sbi->ipimap;
+	imap_t *imap;
+	metapage_t *mp;
+	iag_t *iagp;
+	dinode_t *dp;
+	s64 blkno, agstart;
+	int iagno, ino, extno, rc;
+	int blk_in_page;
+	int inos_first_page;
+	int slot;
+	uint page;
+
+	jFYI(1, ("diRead: ino = %ld\n", ip->i_ino));
+
+	JFS_IP(ip)->ipimap = ipimap;
+
+	/* which iag covers this inode number? */
+	iagno = INOTOIAG(ip->i_ino);
+
+	/* fetch that iag under the map's read lock */
+	imap = JFS_IP(ipimap)->i_imap;
+	IREAD_LOCK(ipimap);
+	rc = diIAGRead(imap, iagno, &mp);
+	IREAD_UNLOCK(ipimap);
+	if (rc) {
+		jERROR(1, ("diRead: diIAGRead returned %d\n", rc));
+		return rc;
+	}
+	iagp = (iag_t *) mp->data;
+
+	/* position of the inode inside the iag, and its extent */
+	ino = ip->i_ino & (INOSPERIAG - 1);
+	extno = ino >> L2INOSPEREXT;
+
+	/* the extent descriptor must have the expected length and a
+	 * non-zero address; otherwise the map is damaged.
+	 */
+	if ((lengthPXD(&iagp->inoext[extno]) != imap->im_nbperiext) ||
+	    (addressPXD(&iagp->inoext[extno]) == 0)) {
+		jERROR(1, ("diRead: Bad inoext: 0x%lx, 0x%lx\n",
+			   (ulong) addressPXD(&iagp->inoext[extno]),
+			   (ulong) lengthPXD(&iagp->inoext[extno])));
+		release_metapage(mp);
+		updateSuper(ip->i_sb, FM_DIRTY);
+		return ESTALE;
+	}
+
+	/* disk block of the page, within the extent, holding the inode */
+	blkno = INOPBLK(&iagp->inoext[extno], ino, sbi->l2nbperpage);
+
+	/* remember where the iag's ag starts before letting the iag go */
+	agstart = le64_to_cpu(iagp->agstart);
+	release_metapage(mp);
+
+	slot = (ino & (INOSPERPAGE - 1));
+	page = blkno >> sbi->l2nbperpage;
+
+	blk_in_page = ((u32) blkno & (sbi->nbperpage - 1));
+	if (blk_in_page) {
+		/*
+		 * OS/2 didn't always align inode extents on page
+		 * boundaries: the extent starts part way into the page,
+		 * so the inode is either shifted within this page or
+		 * spills over into the next one.
+		 */
+		inos_first_page =
+		    (sbi->nbperpage - blk_in_page) << sbi->l2niperblk;
+
+		if (slot < inos_first_page) {
+			slot += blk_in_page << sbi->l2niperblk;
+		} else {
+			page += 1;
+			slot -= inos_first_page;
+		}
+	}
+
+	/* read the page of disk inodes */
+	mp = read_metapage(ipimap, page << sbi->l2nbperpage, PSIZE, 1);
+	if (!mp) {
+		jERROR(1, ("diRead: read_metapage failed\n"));
+		return EIO;
+	}
+
+	/* step to the requested disk inode within the page */
+	dp = (dinode_t *) mp->data + slot;
+
+	if (ip->i_ino != le32_to_cpu(dp->di_number)) {
+		/* the slot holds some other inode */
+		jERROR(1, ("diRead: i_ino != di_number\n"));
+		updateSuper(ip->i_sb, FM_DIRTY);
+		rc = EIO;
+	} else if (le32_to_cpu(dp->di_nlink) == 0) {
+		/* the disk inode has no links left */
+		jERROR(1,
+		       ("diRead: di_nlink is zero. ino=%ld\n", ip->i_ino));
+		updateSuper(ip->i_sb, FM_DIRTY);
+		rc = ESTALE;
+	} else {
+		/* looks sane: fill in the in-memory inode */
+		rc = copy_from_dinode(dp, ip);
+	}
+
+	release_metapage(mp);
+
+	/* record the ag of the inode (done on the error paths above too) */
+	JFS_IP(ip)->agno = BLKTOAG(agstart, sbi);
+
+	return rc;
+}
+
+
+/*
+ * NAME:	diReadSpecial()
+ *
+ * FUNCTION:	initialize a 'special' inode from disk.
+ *
+ *		this routine handles the aggregate level inodes.  the
+ *		inode cache cannot tell aggregate inodes from
+ *		filesystem inodes, so they are created here with
+ *		new_inode().  the aggregate inode map is not consulted:
+ *		these inodes live at a fixed location, and in some
+ *		cases the map is not initialized yet.
+ *
+ *		inode numbers of INOSPEREXT (32) and above select the
+ *		secondary aggregate inode table; the number stored in
+ *		the new inode is then relative to that table.
+ *
+ * PARAMETERS:
+ *	sb	- filesystem superblock
+ *	inum	- aggregate inode number
+ *
+ * RETURN VALUES:
+ *	new inode	- success
+ *	NULL		- no inode could be allocated, i/o error, or the
+ *			  disk inode could not be copied in.
+ */
+struct inode *diReadSpecial(struct super_block *sb, ino_t inum)
+{
+	struct jfs_sb_info *sbi = JFS_SBI(sb);
+	struct inode *ip;
+	metapage_t *mp;
+	dinode_t *dp;
+	uint address;
+
+	ip = new_inode(sb);
+	if (!ip) {
+		jERROR(1,
+		       ("diReadSpecial: new_inode returned NULL!\n"));
+		return NULL;
+	}
+
+	/* pick the table (primary or secondary) and its first page */
+	if (inum < INOSPEREXT) {
+		address = AITBL_OFF >> L2PSIZE;
+		JFS_IP(ip)->ipimap = sbi->ipaimap;
+	} else {
+		address =
+		    addressPXD(&sbi->ait2) >> sbi->l2nbperpage;
+		inum -= INOSPEREXT;
+		ASSERT(inum < INOSPEREXT);
+		JFS_IP(ip)->ipimap = sbi->ipaimap2;
+	}
+	ip->i_ino = inum;
+
+	/* advance to the page holding the inode (8 inodes per 4K page) */
+	address += inum >> L2INOSPERPAGE;
+
+	/* read the page of fixed disk inode (AIT) in raw mode */
+	jEVENT(0,
+	       ("Reading aggregate inode %d from block %d\n", (uint) inum,
+		address));
+	mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1);
+	if (!mp)
+		goto discard;
+
+	/* the disk inode of interest within that page */
+	dp = (dinode_t *) (mp->data) + inum % INOSPERPAGE;
+
+	/* fill in the in-memory inode from the disk inode */
+	if (copy_from_dinode(dp, ip) != 0)
+		goto discard;
+
+	ip->i_mapping->a_ops = &jfs_aops;
+	ip->i_mapping->gfp_mask = GFP_NOFS;
+
+	/* the fileset inode of the primary table also carries the
+	 * generation and inode stamp values kept in the superblock info
+	 */
+	if ((inum == FILESYSTEM_I) && (JFS_IP(ip)->ipimap == sbi->ipaimap)) {
+		sbi->gengen = le32_to_cpu(dp->di_gengen);
+		sbi->inostamp = le32_to_cpu(dp->di_inostamp);
+	}
+
+	/* release the page */
+	release_metapage(mp);
+
+	return ip;
+
+discard:
+	/* hand NULL back to the caller after giving up the new inode */
+	ip->i_sb = NULL;
+	ip->i_nlink = 1;	/* Don't want iput() deleting it */
+	iput(ip);
+	if (mp)
+		release_metapage(mp);
+	return NULL;
+}
+
+/*
+ * NAME:	diWriteSpecial()
+ *
+ * FUNCTION:	Write the special inode to disk
+ *
+ *		nothing happens unless the inode is marked I_DIRTY.
+ *		the dirty state is cleared, the page of the aggregate
+ *		inode table holding the disk inode is read, the
+ *		in-memory inode (including its xtree root) is copied
+ *		into the disk inode and the page is written.
+ *
+ *		NOTE(review): diReadSpecial() stores the inode number
+ *		after subtracting INOSPEREXT, so for inodes created
+ *		there the secondary table branch below does not look
+ *		reachable - confirm how secondary inodes get written.
+ *
+ * PARAMETERS:
+ *	ip	- special inode
+ *
+ * RETURN VALUES: none
+ */
+
+void diWriteSpecial(struct inode *ip)
+{
+	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+	ino_t inum = ip->i_ino;
+	metapage_t *mp;
+	dinode_t *dp;
+	uint address;
+
+	/* a clean inode needs no work */
+	if ((ip->i_state & I_DIRTY) == 0)
+		return;
+
+	ip->i_state &= ~I_DIRTY;
+
+	/*
+	 * inode numbers of 32 (INOSPEREXT) and above belong to the
+	 * secondary aggregate inode table.
+	 */
+	if (inum < INOSPEREXT) {
+		address = AITBL_OFF >> L2PSIZE;
+	} else {
+		address =
+		    addressPXD(&sbi->ait2) >> sbi->l2nbperpage;
+		inum -= INOSPEREXT;
+		ASSERT(inum < INOSPEREXT);
+	}
+
+	/* advance to the page holding the inode (8 inodes per 4K page) */
+	address += inum >> 3;
+
+	/* read the page of fixed disk inode (AIT) in raw mode */
+	jEVENT(0,
+	       ("Reading aggregate inode %d from block %d\n", (uint) inum,
+		address));
+	mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1);
+	if (!mp) {
+		jERROR(1,
+		       ("diWriteSpecial: failed to read aggregate inode extent!\n"));
+		return;
+	}
+
+	/* the disk inode of interest within that page */
+	dp = (dinode_t *) (mp->data) + inum % 8;
+
+	/* copy the in-memory inode, then its xtree root, to the disk inode */
+	copy_to_dinode(dp, ip);
+	memcpy(&dp->di_xtroot, &JFS_IP(ip)->i_xtroot, 288);
+
+	/* the fileset inode also carries the generation value */
+	if (inum == FILESYSTEM_I)
+		dp->di_gengen = cpu_to_le32(sbi->gengen);
+
+	/* write the page */
+	write_metapage(mp);
+}
+
+/*
+ * NAME:	diFreeSpecial()
+ *
+ * FUNCTION:	Free allocated space for special inode: flush its data
+ *		buffers, drop its cached pages and release the inode
+ *		with iput().  a NULL inode is reported and ignored.
+ *
+ * PARAMETERS:
+ *	ip	- special inode, may be NULL
+ *
+ * RETURN VALUES: none
+ */
+void diFreeSpecial(struct inode *ip)
+{
+	if (!ip) {
+		jERROR(1, ("diFreeSpecial called with NULL ip!\n"));
+		return;
+	}
+
+	/* write back what is still dirty before the pages are dropped */
+	fsync_inode_data_buffers(ip);
+	truncate_inode_pages(ip->i_mapping, 0);
+
+	iput(ip);
+}
+
+
+
+/*
+ * NAME: diWrite()
+ *
+ * FUNCTION: write the on-disk inode portion of the in-memory inode
+ * to its corresponding on-disk inode.
+ *
+ * on entry, the specified incore inode should itself
+ * specify the disk inode number corresponding to the
+ * incore inode (i.e. i_number should be initialized).
+ *
+ * the inode contains the inode extent address for the disk
+ * inode. with the inode extent address in hand, the
+ * page of the extent that contains the disk inode is
+ * read and the disk inode portion of the incore inode
+ * is copied to the disk inode.
+ *
+ * PARAMETERS:
+ * tid - transaction id
+ * ip - pointer to incore inode to be written to the inode extent.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * EIO - i/o error.
+ */
+int diWrite(tid_t tid, struct inode *ip)
+{
+ struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+ struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+ int rc = 0;
+ s32 ino;
+ dinode_t *dp;
+ s64 blkno;
+ int block_offset;
+ int inodes_left;
+ metapage_t *mp;
+ uint pageno;
+ int rel_inode;
+ int dioffset;
+ struct inode *ipimap;
+ uint type;
+ lid_t lid;
+ tlock_t *ditlck, *tlck;
+ linelock_t *dilinelock, *ilinelock;
+ lv_t *lv;
+ int n;
+
+ ipimap = jfs_ip->ipimap;
+
+ ino = ip->i_ino & (INOSPERIAG - 1);
+
+ assert(lengthPXD(&(jfs_ip->ixpxd)) ==
+ JFS_IP(ipimap)->i_imap->im_nbperiext);
+ assert(addressPXD(&(jfs_ip->ixpxd)));
+
+ /*
+ * read the page of disk inode containing the specified inode:
+ */
+ /* compute the block address of the page */
+ blkno = INOPBLK(&(jfs_ip->ixpxd), ino, sbi->l2nbperpage);
+
+ rel_inode = (ino & (INOSPERPAGE - 1));
+ pageno = blkno >> sbi->l2nbperpage;
+
+ if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) {
+ /*
+ * OS/2 didn't always align inode extents on page boundaries
+ */
+ inodes_left =
+ (sbi->nbperpage - block_offset) << sbi->l2niperblk;
+
+ if (rel_inode < inodes_left)
+ rel_inode += block_offset << sbi->l2niperblk;
+ else {
+ pageno += 1;
+ rel_inode -= inodes_left;
+ }
+ }
+ /* read the page of disk inode */
+ retry:
+ mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1);
+ if (mp == 0)
+ return (EIO);
+
+ /* get the pointer to the disk inode */
+ dp = (dinode_t *) mp->data;
+ dp += rel_inode;
+
+ dioffset = (ino & (INOSPERPAGE - 1)) << L2DISIZE;
+
+ /*
+ * acquire transaction lock on the on-disk inode;
+ * N.B. tlock is acquired on ipimap not ip;
+ */
+ if ((ditlck =
+ txLock(tid, ipimap, mp, tlckINODE | tlckENTRY)) == NULL)
+ goto retry;
+ dilinelock = (linelock_t *) & ditlck->lock;
+
+ /*
+ * copy btree root from in-memory inode to on-disk inode
+ *
+ * (tlock is taken from inline B+-tree root in in-memory
+ * inode when the B+-tree root is updated, which is pointed
+ * by jfs_ip->blid as well as being on tx tlock list)
+ *
+ * further processing of btree root is based on the copy
+ * in in-memory inode, where txLog() will log from, and,
+ * for xtree root, txUpdateMap() will update map and reset
+ * XAD_NEW bit;
+ */
+
+ if (S_ISDIR(ip->i_mode) && (lid = jfs_ip->xtlid)) {
+ /*
+ * This is the special xtree inside the directory for storing
+ * the directory table
+ */
+ xtpage_t *p, *xp;
+ xad_t *xad;
+
+ jfs_ip->xtlid = 0;
+ tlck = lid_to_tlock(lid);
+ assert(tlck->type & tlckXTREE);
+ tlck->type |= tlckBTROOT;
+ tlck->mp = mp;
+ ilinelock = (linelock_t *) & tlck->lock;
+
+ /*
+ * copy xtree root from inode to dinode:
+ */
+ p = &jfs_ip->i_xtroot;
+ xp = (xtpage_t *) &dp->di_dirtable;
+ lv = (lv_t *) & ilinelock->lv;
+ for (n = 0; n < ilinelock->index; n++, lv++) {
+ memcpy(&xp->xad[lv->offset], &p->xad[lv->offset],
+ lv->length << L2XTSLOTSIZE);
+ }
+
+ /* reset on-disk (metadata page) xtree XAD_NEW bit */
+ xad = &xp->xad[XTENTRYSTART];
+ for (n = XTENTRYSTART;
+ n < le16_to_cpu(xp->header.nextindex); n++, xad++)
+ if (xad->flag & (XAD_NEW | XAD_EXTENDED))
+ xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
+ }
+
+ if ((lid = jfs_ip->blid) == 0)
+ goto inlineData;
+ jfs_ip->blid = 0;
+
+ tlck = lid_to_tlock(lid);
+ type = tlck->type;
+ tlck->type |= tlckBTROOT;
+ tlck->mp = mp;
+ ilinelock = (linelock_t *) & tlck->lock;
+
+ /*
+ * regular file: 16 byte (XAD slot) granularity
+ */
+ if (type & tlckXTREE) {
+ xtpage_t *p, *xp;
+ xad_t *xad;
+
+ /*
+ * copy xtree root from inode to dinode:
+ */
+ p = &jfs_ip->i_xtroot;
+ xp = &dp->di_xtroot;
+ lv = (lv_t *) & ilinelock->lv;
+ for (n = 0; n < ilinelock->index; n++, lv++) {
+ memcpy(&xp->xad[lv->offset], &p->xad[lv->offset],
+ lv->length << L2XTSLOTSIZE);
+ }
+
+ /* reset on-disk (metadata page) xtree XAD_NEW bit */
+ xad = &xp->xad[XTENTRYSTART];
+ for (n = XTENTRYSTART;
+ n < le16_to_cpu(xp->header.nextindex); n++, xad++)
+ if (xad->flag & (XAD_NEW | XAD_EXTENDED))
+ xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
+ }
+ /*
+ * directory: 32 byte (directory entry slot) granularity
+ */
+ else if (type & tlckDTREE) {
+ dtpage_t *p, *xp;
+
+ /*
+ * copy dtree root from inode to dinode:
+ */
+ p = (dtpage_t *) &jfs_ip->i_dtroot;
+ xp = (dtpage_t *) & dp->di_dtroot;
+ lv = (lv_t *) & ilinelock->lv;
+ for (n = 0; n < ilinelock->index; n++, lv++) {
+ memcpy(&xp->slot[lv->offset], &p->slot[lv->offset],
+ lv->length << L2DTSLOTSIZE);
+ }
+ } else {
+ jERROR(1, ("diWrite: UFO tlock\n"));
+ }
+
+ inlineData:
+ /*
+ * copy inline symlink from in-memory inode to on-disk inode
+ */
+ if (S_ISLNK(ip->i_mode) && ip->i_size < IDATASIZE) {
+ lv = (lv_t *) & dilinelock->lv[dilinelock->index];
+ lv->offset = (dioffset + 2 * 128) >> L2INODESLOTSIZE;
+ lv->length = 2;
+ memcpy(&dp->di_fastsymlink, jfs_ip->i_inline, IDATASIZE);
+ dilinelock->index++;
+ }
+#ifdef _STILL_TO_PORT
+ /*
+ * copy inline data from in-memory inode to on-disk inode:
+ * 128 byte slot granularity
+ */
+ if (test_cflag(COMMIT_Inlineea, ip))
+ lv = (lv_t *) & dilinelock->lv[dilinelock->index];
+ lv->offset = (dioffset + 3 * 128) >> L2INODESLOTSIZE;
+ lv->length = 1;
+ memcpy(&dp->di_inlineea, &ip->i_inlineea, INODESLOTSIZE);
+ dilinelock->index++;
+
+ clear_cflag(COMMIT_Inlineea, ip);
+ }
+#endif /* _STILL_TO_PORT */
+
+ /*
+ * lock/copy inode base: 128 byte slot granularity
+ */
+// baseDinode:
+ lv = (lv_t *) & dilinelock->lv[dilinelock->index];
+ lv->offset = dioffset >> L2INODESLOTSIZE;
+ copy_to_dinode(dp, ip);
+ if (test_and_clear_cflag(COMMIT_Dirtable, ip)) {
+ lv->length = 2;
+ memcpy(&dp->di_dirtable, &jfs_ip->i_dirtable, 96);
+ } else
+ lv->length = 1;
+ dilinelock->index++;
+
+#ifdef _JFS_FASTDASD
+ /*
+ * We aren't logging changes to the DASD used in directory inodes,
+ * but we need to write them to disk. If we don't unmount cleanly,
+ * mount will recalculate the DASD used.
+ */
+ if (S_ISDIR(ip->i_mode)
+ && (ip->i_ipmnt->i_mntflag & JFS_DASD_ENABLED))
+ bcopy(&ip->i_DASD, &dp->di_DASD, sizeof(dasd_t));
+#endif /* _JFS_FASTDASD */
+
+ /* release the buffer holding the updated on-disk inode.
+ * the buffer will be later written by commit processing.
+ */
+ write_metapage(mp);
+
+ return (rc);
+}
+
+
+/*
+ * NAME: diFree(ip)
+ *
+ * FUNCTION: free a specified inode from the inode working map
+ * for a fileset or aggregate.
+ *
+ * if the inode to be freed represents the first (only)
+ * free inode within the iag, the iag will be placed on
+ * the ag free inode list.
+ *
+ * freeing the inode will cause the inode extent to be
+ * freed if the inode is the only allocated inode within
+ * the extent. in this case all the disk resource backing
+ * up the inode extent will be freed. in addition, the iag
+ * will be placed on the ag extent free list if the extent
+ * is the first free extent in the iag. if freeing the
+ * extent also means that no free inodes will exist for
+ * the iag, the iag will also be removed from the ag free
+ * inode list.
+ *
+ * the iag describing the inode will be freed if the extent
+ * is to be freed and it is the only backed extent within
+ * the iag. in this case, the iag will be removed from the
+ * ag free extent list and ag free inode list and placed on
+ * the inode map's free iag list.
+ *
+ * a careful update approach is used to provide consistency
+ * in the face of updates to multiple buffers. under this
+ * approach, all required buffers are obtained before making
+ * any updates and are held until all updates are complete.
+ *
+ * PARAMETERS:
+ * ip - inode to be freed.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * EIO - i/o error.
+ */
+int diFree(struct inode *ip)
+{
+ /*
+ * Overview of the body below:
+ * 1. validate the iag number, take the AG lock and the imap read
+ * lock, and read the iag that describes the inode;
+ * 2. if the inode extent is to be kept, just clear the inode's
+ * bit in the working map, fix up the ag free inode list and
+ * the counters, and return;
+ * 3. otherwise read every neighbouring iag that will be touched,
+ * update the free lists and maps, and commit a transaction
+ * that logs the freed extent.
+ */
+ int rc;
+ ino_t inum = ip->i_ino;
+ iag_t *iagp, *aiagp, *biagp, *ciagp, *diagp;
+ metapage_t *mp, *amp, *bmp, *cmp, *dmp;
+ int iagno, ino, extno, bitno, sword, agno;
+ int back, fwd;
+ u32 bitmap, mask;
+ struct inode *ipimap = JFS_SBI(ip->i_sb)->ipimap;
+ imap_t *imap = JFS_IP(ipimap)->i_imap;
+ s64 xaddr;
+ s64 xlen;
+ pxd_t freepxd;
+ tid_t tid;
+ struct inode *iplist[3];
+ tlock_t *tlck;
+ pxdlock_t *pxdlock;
+
+ /*
+ * This is just to suppress compiler warnings. The same logic that
+ * references these variables is used to initialize them.
+ */
+ aiagp = biagp = ciagp = diagp = NULL;
+
+ /* get the iag number containing the inode.
+ */
+ iagno = INOTOIAG(inum);
+
+ /* make sure that the iag is contained within
+ * the map.
+ */
+ //assert(iagno < imap->im_nextiag);
+ if (iagno >= imap->im_nextiag) {
+ jERROR(1, ("diFree: inum = %d, iagno = %d, nextiag = %d\n",
+ (uint) inum, iagno, imap->im_nextiag));
+ dump_mem("imap", imap, 32);
+ updateSuper(ip->i_sb, FM_DIRTY);
+ return EIO;
+ }
+
+ /* get the allocation group for this ino.
+ */
+ agno = JFS_IP(ip)->agno;
+
+ /* Lock the AG specific inode map information
+ */
+ AG_LOCK(imap, agno);
+
+ /* Obtain read lock in imap inode. Don't release it until we have
+ * read all of the IAG's that we are going to.
+ */
+ IREAD_LOCK(ipimap);
+
+ /* read the iag.
+ */
+ if ((rc = diIAGRead(imap, iagno, &mp))) {
+ IREAD_UNLOCK(ipimap);
+ AG_UNLOCK(imap, agno);
+ return (rc);
+ }
+ iagp = (iag_t *) mp->data;
+
+ /* get the inode number and extent number of the inode within
+ * the iag and the inode number within the extent.
+ * mask selects the inode's bit in the extent's map word,
+ * counting bitno positions down from HIGHORDER.
+ */
+ ino = inum & (INOSPERIAG - 1);
+ extno = ino >> L2INOSPEREXT;
+ bitno = ino & (INOSPEREXT - 1);
+ mask = HIGHORDER >> bitno;
+
+ assert(le32_to_cpu(iagp->wmap[extno]) & mask);
+#ifdef _STILL_TO_PORT
+ assert((le32_to_cpu(iagp->pmap[extno]) & mask) == 0);
+#endif /* _STILL_TO_PORT */
+ assert(addressPXD(&iagp->inoext[extno]));
+
+ /* compute the bitmap for the extent reflecting the freed inode.
+ */
+ bitmap = le32_to_cpu(iagp->wmap[extno]) & ~mask;
+
+ /* sanity check of the ag counters; an inconsistency marks the
+ * superblock dirty and fails the request.
+ */
+ if (imap->im_agctl[agno].numfree > imap->im_agctl[agno].numinos) {
+ jERROR(1,("diFree: numfree > numinos\n"));
+ release_metapage(mp);
+ IREAD_UNLOCK(ipimap);
+ AG_UNLOCK(imap, agno);
+ updateSuper(ip->i_sb, FM_DIRTY);
+ return EIO;
+ }
+ /*
+ * inode extent still has some inodes or below low water mark:
+ * keep the inode extent;
+ *
+ * the extent is kept when other inodes in it are still allocated
+ * (bitmap != 0), when the ag has fewer than 96 free inodes, or
+ * when it has fewer than 288 free inodes and those make up at
+ * most 25% of the ag's backed inodes.
+ */
+ if (bitmap ||
+ imap->im_agctl[agno].numfree < 96 ||
+ (imap->im_agctl[agno].numfree < 288 &&
+ (((imap->im_agctl[agno].numfree * 100) /
+ imap->im_agctl[agno].numinos) <= 25))) {
+ /* if the iag currently has no free inodes (i.e.,
+ * the inode being freed is the first free inode of iag),
+ * insert the iag at head of the inode free list for the ag.
+ */
+ if (iagp->nfreeinos == 0) {
+ /* check if there are any iags on the ag inode
+ * free list. if so, read the first one so that
+ * we can link the current iag onto the list at
+ * the head.
+ */
+ if ((fwd = imap->im_agctl[agno].inofree) >= 0) {
+ /* read the iag that currently is the head
+ * of the list.
+ */
+ if ((rc = diIAGRead(imap, fwd, &amp))) {
+ IREAD_UNLOCK(ipimap);
+ AG_UNLOCK(imap, agno);
+ release_metapage(mp);
+ return (rc);
+ }
+ aiagp = (iag_t *) amp->data;
+
+ /* make current head point back to the iag.
+ */
+ aiagp->inofreeback = cpu_to_le32(iagno);
+
+ write_metapage(amp);
+ }
+
+ /* iag points forward to current head and iag
+ * becomes the new head of the list.
+ */
+ iagp->inofreefwd =
+ cpu_to_le32(imap->im_agctl[agno].inofree);
+ iagp->inofreeback = -1;
+ imap->im_agctl[agno].inofree = iagno;
+ }
+ IREAD_UNLOCK(ipimap);
+
+ /* update the free inode summary map for the extent if
+ * freeing the inode means the extent will now have free
+ * inodes (i.e., the inode being freed is the first free
+ * inode of extent),
+ */
+ if (iagp->wmap[extno] == ONES) {
+ sword = extno >> L2EXTSPERSUM;
+ bitno = extno & (EXTSPERSUM - 1);
+ iagp->inosmap[sword] &=
+ cpu_to_le32(~(HIGHORDER >> bitno));
+ }
+
+ /* update the bitmap.
+ */
+ iagp->wmap[extno] = cpu_to_le32(bitmap);
+ DBG_DIFREE(imap, inum);
+
+ /* update the free inode counts at the iag, ag and
+ * map level.
+ */
+ iagp->nfreeinos =
+ cpu_to_le32(le32_to_cpu(iagp->nfreeinos) + 1);
+ imap->im_agctl[agno].numfree += 1;
+ atomic_inc(&imap->im_numfree);
+
+ /* release the AG inode map lock
+ */
+ AG_UNLOCK(imap, agno);
+
+ /* write the iag */
+ write_metapage(mp);
+
+ return (0);
+ }
+
+
+ /*
+ * inode extent has become free and above low water mark:
+ * free the inode extent;
+ */
+
+ /*
+ * prepare to update iag list(s) (careful update step 1)
+ *
+ * amp/bmp hold the neighbours on the ag extent free list and
+ * cmp/dmp the neighbours on the ag inode free list; a NULL
+ * metapage pointer means that buffer was not read here.
+ */
+ amp = bmp = cmp = dmp = NULL;
+ fwd = back = -1;
+
+ /* check if the iag currently has no free extents. if so,
+ * it will be placed on the head of the ag extent free list.
+ */
+ if (iagp->nfreeexts == 0) {
+ /* check if the ag extent free list has any iags.
+ * if so, read the iag at the head of the list now.
+ * this (head) iag will be updated later to reflect
+ * the addition of the current iag at the head of
+ * the list.
+ */
+ if ((fwd = imap->im_agctl[agno].extfree) >= 0) {
+ if ((rc = diIAGRead(imap, fwd, &amp)))
+ goto error_out;
+ aiagp = (iag_t *) amp->data;
+ }
+ } else {
+ /* iag has free extents. check if the addition of a free
+ * extent will cause all extents to be free within this
+ * iag. if so, the iag will be removed from the ag extent
+ * free list and placed on the inode map's free iag list.
+ */
+ if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) {
+ /* in preparation for removing the iag from the
+ * ag extent free list, read the iags preceeding
+ * and following the iag on the ag extent free
+ * list.
+ */
+ if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) {
+ if ((rc = diIAGRead(imap, fwd, &amp)))
+ goto error_out;
+ aiagp = (iag_t *) amp->data;
+ }
+
+ if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) {
+ if ((rc = diIAGRead(imap, back, &bmp)))
+ goto error_out;
+ biagp = (iag_t *) bmp->data;
+ }
+ }
+ }
+
+ /* remove the iag from the ag inode free list if freeing
+ * this extent cause the iag to have no free inodes.
+ */
+ if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) {
+ int inofreeback = le32_to_cpu(iagp->inofreeback);
+ int inofreefwd = le32_to_cpu(iagp->inofreefwd);
+
+ /* in preparation for removing the iag from the
+ * ag inode free list, read the iags preceeding
+ * and following the iag on the ag inode free
+ * list. before reading these iags, we must make
+ * sure that we already don't have them in hand
+ * from up above, since re-reading an iag (buffer)
+ * we are currently holding would cause a deadlock.
+ */
+ if (inofreefwd >= 0) {
+
+ if (inofreefwd == fwd)
+ ciagp = (iag_t *) amp->data;
+ else if (inofreefwd == back)
+ ciagp = (iag_t *) bmp->data;
+ else {
+ if ((rc =
+ diIAGRead(imap, inofreefwd, &cmp)))
+ goto error_out;
+ assert(cmp != NULL);
+ ciagp = (iag_t *) cmp->data;
+ }
+ assert(ciagp != NULL);
+ }
+
+ if (inofreeback >= 0) {
+ if (inofreeback == fwd)
+ diagp = (iag_t *) amp->data;
+ else if (inofreeback == back)
+ diagp = (iag_t *) bmp->data;
+ else {
+ if ((rc =
+ diIAGRead(imap, inofreeback, &dmp)))
+ goto error_out;
+ assert(dmp != NULL);
+ diagp = (iag_t *) dmp->data;
+ }
+ assert(diagp != NULL);
+ }
+ }
+
+ /* all iags that will be touched have been read. */
+ IREAD_UNLOCK(ipimap);
+
+ /*
+ * invalidate any page of the inode extent freed from buffer cache;
+ * freepxd keeps a copy of the extent descriptor because the
+ * descriptor in the iag is zeroed further down.
+ */
+ freepxd = iagp->inoext[extno];
+ xaddr = addressPXD(&iagp->inoext[extno]);
+ xlen = lengthPXD(&iagp->inoext[extno]);
+ invalidate_metapages(JFS_SBI(ip->i_sb)->direct_inode, xaddr, xlen);
+
+ /*
+ * update iag list(s) (careful update step 2)
+ */
+ /* add the iag to the ag extent free list if this is the
+ * first free extent for the iag.
+ */
+ if (iagp->nfreeexts == 0) {
+ if (fwd >= 0)
+ aiagp->extfreeback = cpu_to_le32(iagno);
+
+ iagp->extfreefwd =
+ cpu_to_le32(imap->im_agctl[agno].extfree);
+ iagp->extfreeback = -1;
+ imap->im_agctl[agno].extfree = iagno;
+ } else {
+ /* remove the iag from the ag extent list if all extents
+ * are now free and place it on the inode map iag free list.
+ */
+ if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) {
+ if (fwd >= 0)
+ aiagp->extfreeback = iagp->extfreeback;
+
+ if (back >= 0)
+ biagp->extfreefwd = iagp->extfreefwd;
+ else
+ imap->im_agctl[agno].extfree =
+ le32_to_cpu(iagp->extfreefwd);
+
+ iagp->extfreefwd = iagp->extfreeback = -1;
+
+ IAGFREE_LOCK(imap);
+ iagp->iagfree = cpu_to_le32(imap->im_freeiag);
+ imap->im_freeiag = iagno;
+ IAGFREE_UNLOCK(imap);
+ }
+ }
+
+ /* remove the iag from the ag inode free list if freeing
+ * this extent causes the iag to have no free inodes.
+ */
+ if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) {
+ if ((int) le32_to_cpu(iagp->inofreefwd) >= 0)
+ ciagp->inofreeback = iagp->inofreeback;
+
+ if ((int) le32_to_cpu(iagp->inofreeback) >= 0)
+ diagp->inofreefwd = iagp->inofreefwd;
+ else
+ imap->im_agctl[agno].inofree =
+ le32_to_cpu(iagp->inofreefwd);
+
+ iagp->inofreefwd = iagp->inofreeback = -1;
+ }
+
+ /* update the inode extent address and working map
+ * to reflect the free extent.
+ * the permanent map should have been updated already
+ * for the inode being freed.
+ */
+ assert(iagp->pmap[extno] == 0);
+ iagp->wmap[extno] = 0;
+ DBG_DIFREE(imap, inum);
+ PXDlength(&iagp->inoext[extno], 0);
+ PXDaddress(&iagp->inoext[extno], 0);
+
+ /* update the free extent and free inode summary maps
+ * to reflect the freed extent.
+ * the inode summary map is marked to indicate no inodes
+ * available for the freed extent.
+ */
+ sword = extno >> L2EXTSPERSUM;
+ bitno = extno & (EXTSPERSUM - 1);
+ mask = HIGHORDER >> bitno;
+ iagp->inosmap[sword] |= cpu_to_le32(mask);
+ iagp->extsmap[sword] &= cpu_to_le32(~mask);
+
+ /* update the number of free inodes and number of free extents
+ * for the iag.
+ */
+ iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) -
+ (INOSPEREXT - 1));
+ iagp->nfreeexts = cpu_to_le32(le32_to_cpu(iagp->nfreeexts) + 1);
+
+ /* update the number of free inodes and backed inodes
+ * at the ag and inode map level.
+ */
+ imap->im_agctl[agno].numfree -= (INOSPEREXT - 1);
+ imap->im_agctl[agno].numinos -= INOSPEREXT;
+ atomic_sub(INOSPEREXT - 1, &imap->im_numfree);
+ atomic_sub(INOSPEREXT, &imap->im_numinos);
+
+ /* write back whichever neighbouring iags were read above. */
+ if (amp)
+ write_metapage(amp);
+ if (bmp)
+ write_metapage(bmp);
+ if (cmp)
+ write_metapage(cmp);
+ if (dmp)
+ write_metapage(dmp);
+
+ /*
+ * start transaction to update block allocation map
+ * for the inode extent freed;
+ *
+ * N.B. AG_LOCK is released and iag will be released below, and
+ * other thread may allocate inode from/reusing the ixad freed
+ * BUT with new/different backing inode extent from the extent
+ * to be freed by the transaction;
+ */
+ tid = txBegin(ipimap->i_sb, COMMIT_FORCE);
+
+ /* acquire tlock of the iag page of the freed ixad
+ * to force the page NOHOMEOK (even though no data is
+ * logged from the iag page) until NOREDOPAGE|FREEXTENT log
+ * for the free of the extent is committed;
+ * write FREEXTENT|NOREDOPAGE log record
+ * N.B. linelock is overlaid as freed extent descriptor;
+ */
+ tlck = txLock(tid, ipimap, mp, tlckINODE | tlckFREE);
+ pxdlock = (pxdlock_t *) & tlck->lock;
+ pxdlock->flag = mlckFREEPXD;
+ pxdlock->pxd = freepxd;
+ pxdlock->index = 1;
+
+ write_metapage(mp);
+
+ iplist[0] = ipimap;
+
+ /*
+ * logredo needs the IAG number and IAG extent index in order
+ * to ensure that the IMap is consistent. The least disruptive
+ * way to pass these values through to the transaction manager
+ * is in the iplist array.
+ *
+ * It's not pretty, but it works.
+ */
+ iplist[1] = (struct inode *) (size_t)iagno;
+ iplist[2] = (struct inode *) (size_t)extno;
+
+ /* NOTE(review): the txCommit() result is stored in rc but never
+ * examined; this path returns 0 regardless -- confirm intended.
+ */
+ rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE); // D233382
+
+ txEnd(tid);
+
+ /* unlock the AG inode map information */
+ AG_UNLOCK(imap, agno);
+
+ return (0);
+
+ /* failure while reading a neighbouring iag in step 1: nothing has
+ * been modified yet, so drop the locks and release every buffer
+ * that was obtained.
+ */
+ error_out:
+ IREAD_UNLOCK(ipimap);
+
+ if (amp)
+ release_metapage(amp);
+ if (bmp)
+ release_metapage(bmp);
+ if (cmp)
+ release_metapage(cmp);
+ if (dmp)
+ release_metapage(dmp);
+
+ AG_UNLOCK(imap, agno);
+
+ release_metapage(mp);
+
+ return (rc);
+}
+
+/*
+ * There are several places in the diAlloc* routines where we initialize
+ * the inode.
+ */
+static inline void
+diInitInode(struct inode *ip, int iagno, int ino, int extno, iag_t * iagp)
+{
+	struct jfs_inode_info *ji = JFS_IP(ip);
+	struct jfs_sb_info *sb_info = JFS_SBI(ip->i_sb);
+
+	/* inode number: iag number in the high bits, index within the iag below */
+	ip->i_ino = (iagno << L2INOSPERIAG) + ino;
+	DBG_DIALLOC(JFS_IP(ipimap)->i_imap, ip->i_ino);
+
+	/* remember the owning ag and the descriptor of the backing extent */
+	ji->agno = BLKTOAG(le64_to_cpu(iagp->agstart), sb_info);
+	ji->ixpxd = iagp->inoext[extno];
+}
+
+
+/*
+ * NAME: diAlloc(pip,dir,ip)
+ *
+ * FUNCTION: allocate a disk inode from the inode working map
+ * for a fileset or aggregate.
+ *
+ * PARAMETERS:
+ * pip - pointer to incore inode for the parent inode.
+ * dir - TRUE if the new disk inode is for a directory.
+ * ip - pointer to a new inode
+ *
+ * RETURN VALUES:
+ * 0 - success.
+ * ENOSPC - insufficient disk resources.
+ * EIO - i/o error.
+ */
+int diAlloc(struct inode *pip, boolean_t dir, struct inode *ip)
+{
+	int rc, ino, iagno, addext, extno, bitno, sword;
+	int nwords, rem, i, agno;
+	u32 mask, inosmap, extsmap;
+	struct inode *ipimap;
+	metapage_t *mp;
+	ino_t inum;
+	iag_t *iagp;
+	imap_t *imap;
+
+	/* get the pointers to the inode map inode and the
+	 * corresponding imap control structure.
+	 */
+	ipimap = JFS_SBI(pip->i_sb)->ipimap;
+	imap = JFS_IP(ipimap)->i_imap;
+	JFS_IP(ip)->ipimap = ipimap;
+	JFS_IP(ip)->fileset = FILESYSTEM_I;
+
+	/* for a directory, the allocation policy is to start
+	 * at the ag level using the preferred ag.
+	 */
+	if (dir == TRUE) {
+		agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap);
+		AG_LOCK(imap, agno);
+		goto tryag;
+	}
+
+	/* for files, the policy starts off by trying to allocate from
+	 * the same iag containing the parent disk inode:
+	 * try to allocate the new disk inode close to the parent disk
+	 * inode, using parent disk inode number + 1 as the allocation
+	 * hint. (we use a left-to-right policy to attempt to avoid
+	 * moving backward on the disk.) compute the hint within the
+	 * file system and the iag.
+	 */
+	inum = pip->i_ino + 1;
+	ino = inum & (INOSPERIAG - 1);
+
+	/* back off the hint if it is outside of the iag */
+	if (ino == 0)
+		inum = pip->i_ino;
+
+	/* get the ag number of this iag */
+	agno = JFS_IP(pip)->agno;
+
+	/* lock the AG inode map information */
+	AG_LOCK(imap, agno);
+
+	/* Get read lock on imap inode */
+	IREAD_LOCK(ipimap);
+
+	/* get the iag number and read the iag */
+	iagno = INOTOIAG(inum);
+	if ((rc = diIAGRead(imap, iagno, &mp))) {
+		/* drop both locks taken above; the AG lock must not be
+		 * left held when the read of the iag fails.
+		 */
+		IREAD_UNLOCK(ipimap);
+		AG_UNLOCK(imap, agno);
+		return (rc);
+	}
+	iagp = (iag_t *) mp->data;
+
+	/* determine if new inode extent is allowed to be added to the iag.
+	 * new inode extent can be added to the iag if the ag
+	 * has less than 32 free disk inodes and the iag has free extents.
+	 */
+	addext = (imap->im_agctl[agno].numfree < 32 && iagp->nfreeexts);
+
+	/*
+	 *      try to allocate from the IAG
+	 */
+	/* check if the inode may be allocated from the iag
+	 * (i.e. the inode has free inodes or new extent can be added).
+	 */
+	if (iagp->nfreeinos || addext) {
+		/* determine the extent number of the hint.
+		 */
+		extno = ino >> L2INOSPEREXT;
+
+		/* check if the extent containing the hint has backed
+		 * inodes.  if so, try to allocate within this extent.
+		 */
+		if (addressPXD(&iagp->inoext[extno])) {
+			bitno = ino & (INOSPEREXT - 1);
+			if ((bitno =
+			     diFindFree(le32_to_cpu(iagp->wmap[extno]),
+					bitno))
+			    < INOSPEREXT) {
+				ino = (extno << L2INOSPEREXT) + bitno;
+
+				/* a free inode (bit) was found within this
+				 * extent, so allocate it.
+				 */
+				rc = diAllocBit(imap, iagp, ino);
+				IREAD_UNLOCK(ipimap);
+				if (rc) {
+					assert(rc == EIO);
+				} else {
+					/* set the results of the allocation
+					 * and write the iag.
+					 */
+					diInitInode(ip, iagno, ino, extno,
+						    iagp);
+					mark_metapage_dirty(mp);
+				}
+				release_metapage(mp);
+
+				/* free the AG lock and return.
+				 */
+				AG_UNLOCK(imap, agno);
+				return (rc);
+			}
+
+			if (!addext)
+				extno =
+				    (extno ==
+				     EXTSPERIAG - 1) ? 0 : extno + 1;
+		}
+
+		/*
+		 * no free inodes within the extent containing the hint.
+		 *
+		 * try to allocate from the backed extents following
+		 * hint or, if appropriate (i.e. addext is true), allocate
+		 * an extent of free inodes at or following the extent
+		 * containing the hint.
+		 *
+		 * the free inode and free extent summary maps are used
+		 * here, so determine the starting summary map position
+		 * and the number of words we'll have to examine.  again,
+		 * the approach is to allocate following the hint, so we
+		 * might have to initially ignore prior bits of the summary
+		 * map that represent extents prior to the extent containing
+		 * the hint and later revisit these bits.
+		 */
+		bitno = extno & (EXTSPERSUM - 1);
+		nwords = (bitno == 0) ? SMAPSZ : SMAPSZ + 1;
+		sword = extno >> L2EXTSPERSUM;
+
+		/* mask any prior bits for the starting words of the
+		 * summary map.  when the hint is the first extent of the
+		 * word (bitno == 0) there are no prior bits to hide; the
+		 * shift count would equal the word width, which is
+		 * undefined in C, so the empty mask is spelled out.
+		 */
+		mask = (bitno == 0) ? 0 : (ONES << (EXTSPERSUM - bitno));
+		inosmap = le32_to_cpu(iagp->inosmap[sword]) | mask;
+		extsmap = le32_to_cpu(iagp->extsmap[sword]) | mask;
+
+		/* scan the free inode and free extent summary maps for
+		 * free resources.
+		 */
+		for (i = 0; i < nwords; i++) {
+			/* check if this word of the free inode summary
+			 * map describes an extent with free inodes.
+			 */
+			if (~inosmap) {
+				/* an extent with free inodes has been
+				 * found. determine the extent number
+				 * and the inode number within the extent.
+				 */
+				rem = diFindFree(inosmap, 0);
+				extno = (sword << L2EXTSPERSUM) + rem;
+				rem =
+				    diFindFree(le32_to_cpu
+					       (iagp->wmap[extno]), 0);
+				assert(rem < INOSPEREXT);
+
+				/* determine the inode number within the
+				 * iag and allocate the inode from the
+				 * map.
+				 */
+				ino = (extno << L2INOSPEREXT) + rem;
+				rc = diAllocBit(imap, iagp, ino);
+				IREAD_UNLOCK(ipimap);
+				if (rc) {
+					assert(rc == EIO);
+				} else {
+					/* set the results of the allocation
+					 * and write the iag.
+					 */
+					diInitInode(ip, iagno, ino, extno,
+						    iagp);
+					mark_metapage_dirty(mp);
+				}
+				release_metapage(mp);
+
+				/* free the AG lock and return.
+				 */
+				AG_UNLOCK(imap, agno);
+				return (rc);
+
+			}
+
+			/* check if we may allocate an extent of free
+			 * inodes and whether this word of the free
+			 * extents summary map describes a free extent.
+			 */
+			if (addext && ~extsmap) {
+				/* a free extent has been found.  determine
+				 * the extent number.
+				 */
+				rem = diFindFree(extsmap, 0);
+				extno = (sword << L2EXTSPERSUM) + rem;
+
+				/* allocate an extent of free inodes.
+				 */
+				if ((rc = diNewExt(imap, iagp, extno))) {
+					/* if there is no disk space for a
+					 * new extent, try to allocate the
+					 * disk inode from somewhere else.
+					 */
+					if (rc == ENOSPC)
+						break;
+
+					assert(rc == EIO);
+				} else {
+					/* set the results of the allocation
+					 * and write the iag.
+					 */
+					diInitInode(ip, iagno,
+						    extno << L2INOSPEREXT,
+						    extno, iagp);
+					mark_metapage_dirty(mp);
+				}
+				release_metapage(mp);
+				/* free the imap inode & the AG lock & return.
+				 */
+				IREAD_UNLOCK(ipimap);
+				AG_UNLOCK(imap, agno);
+				return (rc);
+			}
+
+			/* move on to the next set of summary map words.
+			 */
+			sword = (sword == SMAPSZ - 1) ? 0 : sword + 1;
+			inosmap = le32_to_cpu(iagp->inosmap[sword]);
+			extsmap = le32_to_cpu(iagp->extsmap[sword]);
+		}
+	}
+	/* unlock imap inode */
+	IREAD_UNLOCK(ipimap);
+
+	/* nothing doing in this iag, so release it. */
+	release_metapage(mp);
+
+      tryag:
+	/*
+	 * try to allocate anywhere within the same AG as the parent inode.
+	 * the AG lock for agno is held on every path that reaches here.
+	 */
+	rc = diAllocAG(imap, agno, dir, ip);
+
+	AG_UNLOCK(imap, agno);
+
+	if (rc != ENOSPC)
+		return (rc);
+
+	/*
+	 * try to allocate in any AG.
+	 */
+	return (diAllocAny(imap, agno, dir, ip));
+}
+
+
+/*
+ * NAME: diAllocAG(imap,agno,dir,ip)
+ *
+ * FUNCTION: allocate a disk inode from the allocation group.
+ *
+ * this routine first determines if a new extent of free
+ * inodes should be added for the allocation group, with
+ * the current request satisfied from this extent. if this
+ * is the case, an attempt will be made to do just that. if
+ * this attempt fails or it has been determined that a new
+ * extent should not be added, an attempt is made to satisfy
+ * the request by allocating an existing (backed) free inode
+ * from the allocation group.
+ *
+ * PRE CONDITION: Already have the AG lock for this AG.
+ *
+ * PARAMETERS:
+ * imap - pointer to inode map control structure.
+ * agno - allocation group to allocate from.
+ * dir - TRUE if the new disk inode is for a directory.
+ * ip - pointer to the new inode to be filled in on successful return
+ * with the disk inode number allocated, its extent address
+ * and the start of the ag.
+ *
+ * RETURN VALUES:
+ * 0 - success.
+ * ENOSPC - insufficient disk resources.
+ * EIO - i/o error.
+ */
+static int
+diAllocAG(imap_t * imap, int agno, boolean_t dir, struct inode *ip)
+{
+	/* snapshot of the ag's free and backed disk inode counts */
+	int nfree = imap->im_agctl[agno].numfree;
+	int nbacked = imap->im_agctl[agno].numinos;
+	int want_ext, err;
+
+	/* more free inodes than backed inodes means the counters are
+	 * inconsistent: flag the superblock and fail.
+	 */
+	if (nfree > nbacked) {
+		jERROR(1,("diAllocAG: numfree > numinos\n"));
+		updateSuper(ip->i_sb, FM_DIRTY);
+		return EIO;
+	}
+
+	/* decide whether a fresh extent of free inodes should be added:
+	 *   - non-directory: only when the ag has no free inodes at all;
+	 *   - directory: when fewer than 64 inodes are free, or fewer
+	 *     than 256 are free and they are at most 20% of the backed
+	 *     inodes.
+	 */
+	if (dir != TRUE)
+		want_ext = (nfree == 0);
+	else if (nfree < 64)
+		want_ext = 1;
+	else
+		want_ext = (nfree < 256 && ((nfree * 100) / nbacked) <= 20);
+
+	if (want_ext) {
+		/* anything other than "no space" (success included) is
+		 * final; on ENOSPC fall back to an already backed inode.
+		 */
+		err = diAllocExt(imap, agno, ip);
+		if (err != ENOSPC)
+			return (err);
+	}
+
+	/* take an existing free inode from the ag. */
+	return (diAllocIno(imap, agno, ip));
+}
+
+
+/*
+ * NAME: diAllocAny(imap,agno,dir,iap)
+ *
+ * FUNCTION: allocate a disk inode from any other allocation group.
+ *
+ * this routine is called when an allocation attempt within
+ * the primary allocation group has failed. it attempts to
+ * allocate an inode from any allocation group other than the
+ * specified primary group.
+ *
+ * PARAMETERS:
+ * imap - pointer to inode map control structure.
+ * agno - primary allocation group (to avoid).
+ * dir - TRUE if the new disk inode is for a directory.
+ * ip - pointer to a new inode to be filled in on successful return
+ * with the disk inode number allocated, its extent address
+ * and the start of the ag.
+ *
+ * RETURN VALUES:
+ * 0 - success.
+ * ENOSPC - insufficient disk resources.
+ * EIO - i/o error.
+ */
+static int
+diAllocAny(imap_t * imap, int agno, boolean_t dir, struct inode *ip)
+{
+	int last_ag = JFS_SBI(imap->im_ipimap->i_sb)->bmap->db_maxag;
+	int cur, err;
+
+	/* first pass: the ags after agno, up to and including the
+	 * maximum ag number.  each ag is locked only for the duration
+	 * of its own attempt.
+	 */
+	cur = agno + 1;
+	while (cur <= last_ag) {
+		AG_LOCK(imap, cur);
+		err = diAllocAG(imap, cur, dir, ip);
+		AG_UNLOCK(imap, cur);
+
+		if (err != ENOSPC)
+			return (err);
+		cur++;
+	}
+
+	/* second pass: the ags in front of agno.
+	 */
+	cur = 0;
+	while (cur < agno) {
+		AG_LOCK(imap, cur);
+		err = diAllocAG(imap, cur, dir, ip);
+		AG_UNLOCK(imap, cur);
+
+		if (err != ENOSPC)
+			return (err);
+		cur++;
+	}
+
+	/* every other ag reported ENOSPC: no free disk inodes.
+	 */
+	return (ENOSPC);
+}
+
+
+/*
+ * NAME: diAllocIno(imap,agno,ip)
+ *
+ * FUNCTION: allocate a disk inode from the allocation group's free
+ * inode list, returning an error if this free list is
+ * empty (i.e. no iags on the list).
+ *
+ * allocation occurs from the first iag on the list using
+ * the iag's free inode summary map to find the leftmost
+ * free inode in the iag.
+ *
+ * PRE CONDITION: Already have AG lock for this AG.
+ *
+ * PARAMETERS:
+ * imap - pointer to inode map control structure.
+ * agno - allocation group.
+ * ip - pointer to new inode to be filled in on successful return
+ * with the disk inode number allocated, its extent address
+ * and the start of the ag.
+ *
+ * RETURN VALUES:
+ * 0 - success.
+ * ENOSPC - insufficient disk resources.
+ * EIO - i/o error.
+ */
+static int diAllocIno(imap_t * imap, int agno, struct inode *ip)
+{
+	int iagno, ino, rc, rem, extno, sword;
+	metapage_t *mp;
+	iag_t *iagp;
+
+	/* check if there are iags on the ag's free inode list.
+	 */
+	if ((iagno = imap->im_agctl[agno].inofree) < 0)
+		return (ENOSPC);
+
+	/* obtain read lock on imap inode */
+	IREAD_LOCK(imap->im_ipimap);
+
+	/* read the iag at the head of the list.
+	 */
+	if ((rc = diIAGRead(imap, iagno, &mp))) {
+		IREAD_UNLOCK(imap->im_ipimap);
+		return (rc);
+	}
+	iagp = (iag_t *) mp->data;
+
+	/* better be free inodes in this iag if it is on the
+	 * list.
+	 */
+	//assert(iagp->nfreeinos);
+	if (!iagp->nfreeinos) {
+		jERROR(1,
+		       ("diAllocIno: nfreeinos = 0, but iag on freelist\n"));
+		jERROR(1, ("  agno = %d, iagno = %d\n", agno, iagno));
+		dump_mem("iag", iagp, 64);
+
+		/* the iag is dumped above while its buffer is still held;
+		 * now give back the read lock and the buffer so that this
+		 * error return does not leak them.
+		 */
+		IREAD_UNLOCK(imap->im_ipimap);
+		release_metapage(mp);
+
+		updateSuper(ip->i_sb, FM_DIRTY);
+		return EIO;
+	}
+
+	/* scan the free inode summary map to find an extent
+	 * with free inodes.
+	 */
+	for (sword = 0;; sword++) {
+		assert(sword < SMAPSZ);
+
+		if (~iagp->inosmap[sword])
+			break;
+	}
+
+	/* found a extent with free inodes. determine
+	 * the extent number.
+	 */
+	rem = diFindFree(le32_to_cpu(iagp->inosmap[sword]), 0);
+	assert(rem < EXTSPERSUM);
+	extno = (sword << L2EXTSPERSUM) + rem;
+
+	/* find the first free inode in the extent.
+	 */
+	rem = diFindFree(le32_to_cpu(iagp->wmap[extno]), 0);
+	assert(rem < INOSPEREXT);
+
+	/* compute the inode number within the iag.
+	 */
+	ino = (extno << L2INOSPEREXT) + rem;
+
+	/* allocate the inode.
+	 */
+	rc = diAllocBit(imap, iagp, ino);
+	IREAD_UNLOCK(imap->im_ipimap);
+	if (rc) {
+		release_metapage(mp);
+		return (rc);
+	}
+
+	/* set the results of the allocation and write the iag.
+	 */
+	diInitInode(ip, iagno, ino, extno, iagp);
+	write_metapage(mp);
+
+	return (0);
+}
+
+
+/*
+ * NAME: diAllocExt(imap,agno,ip)
+ *
+ * FUNCTION: add a new extent of free inodes to an iag, allocating
+ * an inode from this extent to satisfy the current allocation
+ * request.
+ *
+ * this routine first tries to find an existing iag with free
+ * extents through the ag free extent list. if list is not
+ * empty, the head of the list will be selected as the home
+ * of the new extent of free inodes. otherwise (the list is
+ * empty), a new iag will be allocated for the ag to contain
+ * the extent.
+ *
+ * once an iag has been selected, the free extent summary map
+ * is used to locate a free extent within the iag and diNewExt()
+ * is called to initialize the extent, with initialization
+ * including the allocation of the first inode of the extent
+ * for the purpose of satisfying this request.
+ *
+ * PARAMETERS:
+ * imap - pointer to inode map control structure.
+ * agno - allocation group number.
+ * ip - pointer to new inode to be filled in on successful return
+ * with the disk inode number allocated, its extent address
+ * and the start of the ag.
+ *
+ * RETURN VALUES:
+ * 0 - success.
+ * ENOSPC - insufficient disk resources.
+ * EIO - i/o error.
+ */
+static int diAllocExt(imap_t * imap, int agno, struct inode *ip)
+{
+	int rem, iagno, sword, extno, rc;
+	metapage_t *mp;
+	iag_t *iagp;
+
+	/* check if the ag has any iags with free extents.  if not,
+	 * allocate a new iag for the ag.
+	 */
+	if ((iagno = imap->im_agctl[agno].extfree) < 0) {
+		/* If successful, diNewIAG will obtain the read lock on the
+		 * imap inode.
+		 */
+		if ((rc = diNewIAG(imap, &iagno, agno, &mp))) {
+			return (rc);
+		}
+		iagp = (iag_t *) mp->data;
+
+		/* set the ag number if this a brand new iag
+		 */
+		iagp->agstart =
+		    cpu_to_le64(AGTOBLK(agno, imap->im_ipimap));
+	} else {
+		/* read the iag.
+		 */
+		IREAD_LOCK(imap->im_ipimap);
+		if ((rc = diIAGRead(imap, iagno, &mp))) {
+			/* a failed read leaves mp unusable: give back the
+			 * read lock taken just above and report the error
+			 * to the caller instead of asserting.
+			 */
+			IREAD_UNLOCK(imap->im_ipimap);
+			jERROR(1, ("diAllocExt: error reading iag\n"));
+			return (rc);
+		}
+		iagp = (iag_t *) mp->data;
+	}
+
+	/* using the free extent summary map, find a free extent.
+	 */
+	for (sword = 0;; sword++) {
+		assert(sword < SMAPSZ);
+		if (~iagp->extsmap[sword])
+			break;
+	}
+
+	/* determine the extent number of the free extent.
+	 */
+	rem = diFindFree(le32_to_cpu(iagp->extsmap[sword]), 0);
+	assert(rem < EXTSPERSUM);
+	extno = (sword << L2EXTSPERSUM) + rem;
+
+	/* initialize the new extent.  the imap read lock is held on both
+	 * branches above and is dropped once the extent has been set up.
+	 */
+	rc = diNewExt(imap, iagp, extno);
+	IREAD_UNLOCK(imap->im_ipimap);
+	if (rc) {
+		/* something bad happened.  if a new iag was allocated,
+		 * place it back on the inode map's iag free list, and
+		 * clear the ag number information.
+		 */
+		if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
+			IAGFREE_LOCK(imap);
+			iagp->iagfree = cpu_to_le32(imap->im_freeiag);
+			imap->im_freeiag = iagno;
+			IAGFREE_UNLOCK(imap);
+		}
+		write_metapage(mp);
+		return (rc);
+	}
+
+	/* set the results of the allocation and write the iag.
+	 * the first inode of the new extent satisfies this request.
+	 */
+	diInitInode(ip, iagno, extno << L2INOSPEREXT, extno, iagp);
+
+	write_metapage(mp);
+
+	return (0);
+}
+
+
+/*
+ * NAME: diAllocBit(imap,iagp,ino)
+ *
+ * FUNCTION: allocate a backed inode from an iag.
+ *
+ * this routine performs the mechanics of allocating a
+ * specified inode from a backed extent.
+ *
+ * if the inode to be allocated represents the last free
+ * inode within the iag, the iag will be removed from the
+ * ag free inode list.
+ *
+ * a careful update approach is used to provide consistency
+ * in the face of updates to multiple buffers. under this
+ * approach, all required buffers are obtained before making
+ * any updates and are held until all updates are complete.
+ *
+ * PRE CONDITION: Already have buffer lock on iagp. Already have AG lock on
+ * this AG. Must have read lock on imap inode.
+ *
+ * PARAMETERS:
+ * imap - pointer to inode map control structure.
+ * iagp - pointer to iag.
+ * ino - inode number to be allocated within the iag.
+ *
+ * RETURN VALUES:
+ * 0 - success.
+ * EIO - i/o error reading a neighbouring iag (the only failure
+ * this routine propagates is the one returned by diIAGRead).
+ */
+static int diAllocBit(imap_t * imap, iag_t * iagp, int ino)
+{
+ int extno, bitno, agno, sword, rc;
+ metapage_t *amp, *bmp;
+ iag_t *aiagp = 0, *biagp = 0;
+ u32 mask;
+
+ /* check if this is the last free inode within the iag.
+ * if so, it will have to be removed from the ag free
+ * inode list, so get the iags preceding and following
+ * it on the list.
+ *
+ * note: amp and bmp are only assigned inside this branch; the
+ * unlink step further down tests the same nfreeinos condition
+ * before looking at them.
+ */
+ if (iagp->nfreeinos == cpu_to_le32(1)) {
+ amp = bmp = NULL;
+
+ /* a negative link value marks the end of the list */
+ if ((int) le32_to_cpu(iagp->inofreefwd) >= 0) {
+ if ((rc =
+ diIAGRead(imap, le32_to_cpu(iagp->inofreefwd),
+ &amp)))
+ return (rc);
+ aiagp = (iag_t *) amp->data;
+ }
+
+ if ((int) le32_to_cpu(iagp->inofreeback) >= 0) {
+ if ((rc =
+ diIAGRead(imap,
+ le32_to_cpu(iagp->inofreeback),
+ &bmp))) {
+ /* nothing has been modified yet, so just drop
+ * the forward neighbour if it was read.
+ */
+ if (amp)
+ release_metapage(amp);
+ return (rc);
+ }
+ biagp = (iag_t *) bmp->data;
+ }
+ }
+
+ /* get the ag number, extent number, inode number within
+ * the extent.
+ */
+ agno = BLKTOAG(le64_to_cpu(iagp->agstart), JFS_SBI(imap->im_ipimap->i_sb));
+ extno = ino >> L2INOSPEREXT;
+ bitno = ino & (INOSPEREXT - 1);
+
+ /* compute the mask for setting the map.
+ */
+ mask = HIGHORDER >> bitno;
+
+ /* the inode should be free and backed.
+ */
+ assert((le32_to_cpu(iagp->pmap[extno]) & mask) == 0);
+ assert((le32_to_cpu(iagp->wmap[extno]) & mask) == 0);
+ assert(addressPXD(&iagp->inoext[extno]) != 0);
+
+ /* mark the inode as allocated in the working map.
+ */
+ iagp->wmap[extno] |= cpu_to_le32(mask);
+
+ /* check if all inodes within the extent are now
+ * allocated. if so, update the free inode summary
+ * map to reflect this.
+ */
+ if (iagp->wmap[extno] == ONES) {
+ sword = extno >> L2EXTSPERSUM;
+ bitno = extno & (EXTSPERSUM - 1);
+ iagp->inosmap[sword] |= cpu_to_le32(HIGHORDER >> bitno);
+ }
+
+ /* if this was the last free inode in the iag, remove the
+ * iag from the ag free inode list.
+ */
+ if (iagp->nfreeinos == cpu_to_le32(1)) {
+ if (amp) {
+ aiagp->inofreeback = iagp->inofreeback;
+ write_metapage(amp);
+ }
+
+ if (bmp) {
+ biagp->inofreefwd = iagp->inofreefwd;
+ write_metapage(bmp);
+ } else {
+ /* no predecessor: this iag was the list head, so the
+ * in-memory ag control entry takes over its successor.
+ */
+ imap->im_agctl[agno].inofree =
+ le32_to_cpu(iagp->inofreefwd);
+ }
+ iagp->inofreefwd = iagp->inofreeback = -1;
+ }
+
+ /* update the free inode count at the iag, ag, inode
+ * map levels.
+ */
+ iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) - 1);
+ imap->im_agctl[agno].numfree -= 1;
+ atomic_dec(&imap->im_numfree);
+
+ return (0);
+}
+
+
+/*
+ * NAME: diNewExt(imap,iagp,extno)
+ *
+ * FUNCTION: initialize a new extent of inodes for an iag, allocating
+ * the first inode of the extent for use for the current
+ * allocation request.
+ *
+ * disk resources are allocated for the new extent of inodes
+ * and the inodes themselves are initialized to reflect their
+ * existence within the extent (i.e. their inode numbers and
+ * inode extent addresses are set) and their initial state
+ * (mode and link count are set to zero).
+ *
+ * if the iag is new, it is not yet on an ag extent free list
+ * but will now be placed on this list.
+ *
+ * if the allocation of the new extent causes the iag to
+ * have no free extent, the iag will be removed from the
+ * ag extent free list.
+ *
+ * if the iag has no free backed inodes, it will be placed
+ * on the ag free inode list, since the addition of the new
+ * extent will now cause it to have free inodes.
+ *
+ * a careful update approach is used to provide consistency
+ * (i.e. list consistency) in the face of updates to multiple
+ * buffers. under this approach, all required buffers are
+ * obtained before making any updates and are held until all
+ * updates are complete.
+ *
+ * PRE CONDITION: Already have buffer lock on iagp. Already have AG lock on
+ * this AG. Must have read lock on imap inode.
+ *
+ * PARAMETERS:
+ * imap - pointer to inode map control structure.
+ * iagp - pointer to iag.
+ * extno - extent number.
+ *
+ * RETURN VALUES:
+ * 0 - success.
+ * EIO - i/o error (from diIAGRead, or when get_metapage fails).
+ * otherwise the error returned by dbAlloc (documented by the
+ * original authors as ENOSPC - insufficient disk resources).
+ */
+static int diNewExt(imap_t * imap, iag_t * iagp, int extno)
+{
+ int agno, iagno, fwd, back, freei = 0, sword, rc;
+ iag_t *aiagp = 0, *biagp = 0, *ciagp = 0;
+ metapage_t *amp, *bmp, *cmp, *dmp;
+ struct inode *ipimap;
+ s64 blkno, hint;
+ int i, j;
+ u32 mask;
+ ino_t ino;
+ dinode_t *dp;
+ struct jfs_sb_info *sbi;
+
+ /* better have free extents.
+ */
+ assert(iagp->nfreeexts);
+
+ /* get the inode map inode.
+ */
+ ipimap = imap->im_ipimap;
+ sbi = JFS_SBI(ipimap->i_sb);
+
+ /* amp/bmp/cmp stay NULL unless the corresponding neighbour iag is
+ * read below; both the final write-out and error_out test them.
+ */
+ amp = bmp = cmp = NULL;
+
+ /* get the ag and iag numbers for this iag.
+ */
+ agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi);
+ iagno = le32_to_cpu(iagp->iagnum);
+
+ /* check if this is the last free extent within the
+ * iag. if so, the iag must be removed from the ag
+ * free extent list, so get the iags preceding and
+ * following the iag on this list.
+ */
+ if (iagp->nfreeexts == cpu_to_le32(1)) {
+ if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) {
+ /* nothing is held yet, so a plain return is enough */
+ if ((rc = diIAGRead(imap, fwd, &amp)))
+ return (rc);
+ aiagp = (iag_t *) amp->data;
+ }
+
+ if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) {
+ if ((rc = diIAGRead(imap, back, &bmp)))
+ goto error_out;
+ biagp = (iag_t *) bmp->data;
+ }
+ } else {
+ /* the iag has free extents. if all extents are free
+ * (as is the case for a newly allocated iag), the iag
+ * must be added to the ag free extent list, so get
+ * the iag at the head of the list in preparation for
+ * adding this iag to this list.
+ */
+ fwd = back = -1;
+ if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
+ if ((fwd = imap->im_agctl[agno].extfree) >= 0) {
+ if ((rc = diIAGRead(imap, fwd, &amp)))
+ goto error_out;
+ aiagp = (iag_t *) amp->data;
+ }
+ }
+ }
+
+ /* check if the iag has no free inodes. if so, the iag
+ * will have to be added to the ag free inode list, so get
+ * the iag at the head of the list in preparation for
+ * adding this iag to this list. in doing this, we must
+ * check if we already have the iag at the head of
+ * the list in hand.
+ */
+ if (iagp->nfreeinos == 0) {
+ freei = imap->im_agctl[agno].inofree;
+
+ if (freei >= 0) {
+ if (freei == fwd) {
+ ciagp = aiagp;
+ } else if (freei == back) {
+ ciagp = biagp;
+ } else {
+ if ((rc = diIAGRead(imap, freei, &cmp)))
+ goto error_out;
+ ciagp = (iag_t *) cmp->data;
+ }
+ assert(ciagp != NULL);
+ }
+ }
+
+ /* allocate disk space for the inode extent.
+ *
+ * the hint handed to dbAlloc is the block just before the start
+ * of the ag when there is no preceding backed extent in this iag,
+ * otherwise the last block of the preceding extent.
+ */
+ if ((extno == 0) || (addressPXD(&iagp->inoext[extno - 1]) == 0))
+ hint = ((s64) agno << sbi->bmap->db_agl2size) - 1;
+ else
+ hint = addressPXD(&iagp->inoext[extno - 1]) +
+ lengthPXD(&iagp->inoext[extno - 1]) - 1;
+
+ if ((rc = dbAlloc(ipimap, hint, (s64) imap->im_nbperiext, &blkno)))
+ goto error_out;
+
+ /* compute the inode number of the first inode within the
+ * extent.
+ */
+ ino = (iagno << L2INOSPERIAG) + (extno << L2INOSPEREXT);
+
+ /* initialize the inodes within the newly allocated extent a
+ * page at a time.
+ */
+ for (i = 0; i < imap->im_nbperiext; i += sbi->nbperpage) {
+ /* get a buffer for this page of disk inodes.
+ */
+ dmp = get_metapage(ipimap, blkno + i, PSIZE, 1);
+ if (dmp == NULL) {
+ /* NOTE(review): the blocks obtained from dbAlloc above
+ * are not handed back on this path - error_out only
+ * releases the iag buffers. confirm whether a dbFree
+ * is wanted here.
+ */
+ rc = EIO;
+ goto error_out;
+ }
+ dp = (dinode_t *) dmp->data;
+
+ /* initialize the inode number, mode, link count and
+ * inode extent address.
+ */
+ for (j = 0; j < INOSPERPAGE; j++, dp++, ino++) {
+ dp->di_inostamp = cpu_to_le32(sbi->inostamp);
+ dp->di_number = cpu_to_le32(ino);
+ dp->di_fileset = cpu_to_le32(FILESYSTEM_I);
+ dp->di_mode = 0;
+ dp->di_nlink = 0;
+ PXDaddress(&(dp->di_ixpxd), blkno);
+ PXDlength(&(dp->di_ixpxd), imap->im_nbperiext);
+ }
+ write_metapage(dmp);
+ }
+
+ /* from here on nothing can fail: all buffers are in hand, so the
+ * list and map updates below are applied together.
+ */
+
+ /* if this is the last free extent within the iag, remove the
+ * iag from the ag free extent list.
+ */
+ if (iagp->nfreeexts == cpu_to_le32(1)) {
+ if (fwd >= 0)
+ aiagp->extfreeback = iagp->extfreeback;
+
+ if (back >= 0)
+ biagp->extfreefwd = iagp->extfreefwd;
+ else
+ imap->im_agctl[agno].extfree =
+ le32_to_cpu(iagp->extfreefwd);
+
+ iagp->extfreefwd = iagp->extfreeback = -1;
+ } else {
+ /* if the iag has all free extents (newly allocated iag),
+ * add the iag to the ag free extent list.
+ */
+ if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
+ if (fwd >= 0)
+ aiagp->extfreeback = cpu_to_le32(iagno);
+
+ iagp->extfreefwd = cpu_to_le32(fwd);
+ iagp->extfreeback = -1;
+ imap->im_agctl[agno].extfree = iagno;
+ }
+ }
+
+ /* if the iag has no free inodes, add the iag to the
+ * ag free inode list.
+ */
+ if (iagp->nfreeinos == 0) {
+ if (freei >= 0)
+ ciagp->inofreeback = cpu_to_le32(iagno);
+
+ iagp->inofreefwd =
+ cpu_to_le32(imap->im_agctl[agno].inofree);
+ iagp->inofreeback = -1;
+ imap->im_agctl[agno].inofree = iagno;
+ }
+
+ /* initialize the extent descriptor of the extent. */
+ PXDlength(&iagp->inoext[extno], imap->im_nbperiext);
+ PXDaddress(&iagp->inoext[extno], blkno);
+
+ /* initialize the working and persistent map of the extent.
+ * the working map will be initialized such that
+ * it indicates the first inode of the extent is allocated.
+ */
+ iagp->wmap[extno] = cpu_to_le32(HIGHORDER);
+ iagp->pmap[extno] = 0;
+
+ /* update the free inode and free extent summary maps
+ * for the extent to indicate the extent has free inodes
+ * and no longer represents a free extent.
+ */
+ sword = extno >> L2EXTSPERSUM;
+ mask = HIGHORDER >> (extno & (EXTSPERSUM - 1));
+ iagp->extsmap[sword] |= cpu_to_le32(mask);
+ iagp->inosmap[sword] &= cpu_to_le32(~mask);
+
+ /* update the free inode and free extent counts for the
+ * iag. one inode of the new extent is consumed by the
+ * current request, hence INOSPEREXT - 1 free inodes.
+ */
+ iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) +
+ (INOSPEREXT - 1));
+ iagp->nfreeexts = cpu_to_le32(le32_to_cpu(iagp->nfreeexts) - 1);
+
+ /* update the free and backed inode counts for the ag.
+ */
+ imap->im_agctl[agno].numfree += (INOSPEREXT - 1);
+ imap->im_agctl[agno].numinos += INOSPEREXT;
+
+ /* update the free and backed inode counts for the inode map.
+ */
+ atomic_add(INOSPEREXT - 1, &imap->im_numfree);
+ atomic_add(INOSPEREXT, &imap->im_numinos);
+
+ /* write the iags.
+ */
+ if (amp)
+ write_metapage(amp);
+ if (bmp)
+ write_metapage(bmp);
+ if (cmp)
+ write_metapage(cmp);
+
+ return (0);
+
+ error_out:
+
+ /* release the iags. none of them has been modified at any point
+ * that jumps here, so they are released rather than written.
+ */
+ if (amp)
+ release_metapage(amp);
+ if (bmp)
+ release_metapage(bmp);
+ if (cmp)
+ release_metapage(cmp);
+
+ return (rc);
+}
+
+
+/*
+ * NAME: diNewIAG(imap,iagnop,agno,mpp)
+ *
+ * FUNCTION: allocate a new iag for an allocation group.
+ *
+ * first tries to allocate the iag from the inode map
+ * iagfree list:
+ * if the list has free iags, the head of the list is removed
+ * and returned to satisfy the request.
+ * if the inode map's iag free list is empty, the inode map
+ * is extended to hold a new iag. this new iag is initialized
+ * and returned to satisfy the request.
+ *
+ * PARAMETERS:
+ * imap - pointer to inode map control structure.
+ * iagnop - pointer to an iag number set with the number of the
+ * newly allocated iag upon successful return.
+ * agno - allocation group number (not referenced by this
+ * routine itself).
+ * mpp - buffer pointer to be filled in with new IAG's buffer
+ *
+ * RETURN VALUES:
+ * 0 - success.
+ * ENOSPC - insufficient disk resources.
+ * EIO - i/o error.
+ *
+ * serialization:
+ * AG lock held on entry/exit;
+ * write lock on the map is held inside;
+ * read lock on the map is held on successful completion;
+ *
+ * note: new iag transaction:
+ * . synchronously write iag;
+ * . write log of xtree and inode of imap;
+ * . commit;
+ * . synchronous write of xtree (right to left, bottom to top);
+ * . at start of logredo(): init in-memory imap with one additional iag page;
+ * . at end of logredo(): re-read imap inode to determine
+ * new imap size;
+ */
+static int
+diNewIAG(imap_t * imap, int *iagnop, int agno, metapage_t ** mpp)
+{
+	int rc;
+	int iagno, i, xlen;
+	struct inode *ipimap;
+	struct super_block *sb;
+	struct jfs_sb_info *sbi;
+	metapage_t *mp;
+	iag_t *iagp;
+	s64 xaddr = 0;
+	s64 blkno;
+	tid_t tid;
+#ifdef _STILL_TO_PORT
+	xad_t xad;
+#endif				/*  _STILL_TO_PORT */
+	struct inode *iplist[1];
+
+	/* pick up pointers to the inode map and mount inodes */
+	ipimap = imap->im_ipimap;
+	sb = ipimap->i_sb;
+	sbi = JFS_SBI(sb);
+
+	/* acquire the free iag lock; it is held until the "out" label */
+	IAGFREE_LOCK(imap);
+
+	/* if there are any iags on the inode map free iag list,
+	 * allocate the iag from the head of the list.
+	 */
+	if (imap->im_freeiag >= 0) {
+		/* pick up the iag number at the head of the list */
+		iagno = imap->im_freeiag;
+
+		/* determine the logical block number of the iag */
+		blkno = IAGTOLBLK(iagno, sbi->l2nbperpage);
+	} else {
+		/* no free iags. the inode map will have to be extended
+		 * to include a new iag.
+		 */
+
+		/* acquire inode map lock */
+		IWRITE_LOCK(ipimap);
+
+		assert(ipimap->i_size >> L2PSIZE == imap->im_nextiag + 1);
+
+		/* get the next available iag number */
+		iagno = imap->im_nextiag;
+
+		/* make sure that we have not exceeded the maximum inode
+		 * number limit.
+		 */
+		if (iagno > (MAXIAGS - 1)) {
+			/* release the inode map lock */
+			IWRITE_UNLOCK(ipimap);
+
+			rc = ENOSPC;
+			goto out;
+		}
+
+		/*
+		 * synchronously append new iag page.
+		 */
+		/* determine the logical address of iag page to append */
+		blkno = IAGTOLBLK(iagno, sbi->l2nbperpage);
+
+		/* Allocate extent for new iag page */
+		xlen = sbi->nbperpage;
+		if ((rc = dbAlloc(ipimap, 0, (s64) xlen, &xaddr))) {
+			/* release the inode map lock */
+			IWRITE_UNLOCK(ipimap);
+
+			goto out;
+		}
+
+		/* assign a buffer for the page */
+		mp = get_metapage(ipimap, xaddr, PSIZE, 1);
+		//bp = bmAssign(ipimap, blkno, xaddr, PSIZE, bmREAD_PAGE);
+		if (!mp) {
+			/* Free the blocks allocated for the iag since it was
+			 * not successfully added to the inode map
+			 */
+			dbFree(ipimap, xaddr, (s64) xlen);
+
+			/* release the inode map lock */
+			IWRITE_UNLOCK(ipimap);
+
+			rc = EIO;
+			goto out;
+		}
+		iagp = (iag_t *) mp->data;
+
+		/* init the iag: no list membership (-1 links), no backed
+		 * inodes, every extent free.
+		 */
+		memset(iagp, 0, sizeof(iag_t));
+		iagp->iagnum = cpu_to_le32(iagno);
+		iagp->inofreefwd = iagp->inofreeback = -1;
+		iagp->extfreefwd = iagp->extfreeback = -1;
+		iagp->iagfree = -1;
+		iagp->nfreeinos = 0;
+		iagp->nfreeexts = cpu_to_le32(EXTSPERIAG);
+
+		/* initialize the free inode summary map (free extent
+		 * summary map initialization handled by the memset above).
+		 */
+		for (i = 0; i < SMAPSZ; i++)
+			iagp->inosmap[i] = ONES;
+
+		flush_metapage(mp);
+#ifdef _STILL_TO_PORT
+		/* synchronously write the iag page */
+		if (bmWrite(bp)) {
+			/* Free the blocks allocated for the iag since it was
+			 * not successfully added to the inode map
+			 */
+			dbFree(ipimap, xaddr, (s64) xlen);
+
+			/* release the inode map lock */
+			IWRITE_UNLOCK(ipimap);
+
+			rc = EIO;
+			goto out;
+		}
+
+		/* Now the iag is on disk */
+
+		/*
+		 * start transaction of update of the inode map
+		 * addressing structure pointing to the new iag page;
+		 */
+#endif				/*  _STILL_TO_PORT */
+		tid = txBegin(sb, COMMIT_FORCE);
+
+		/* update the inode map addressing structure to point to it */
+		if ((rc =
+		     xtInsert(tid, ipimap, 0, blkno, xlen, &xaddr, 0))) {
+			/* close the transaction opened by txBegin above so
+			 * that every exit pairs txBegin with txEnd, as the
+			 * success path below and duplicateIXtree() do.
+			 */
+			txEnd(tid);
+
+			/* Free the blocks allocated for the iag since it was
+			 * not successfully added to the inode map
+			 */
+			dbFree(ipimap, xaddr, (s64) xlen);
+
+			/* release the inode map lock */
+			IWRITE_UNLOCK(ipimap);
+
+			goto out;
+		}
+
+		/* update the inode map's inode to reflect the extension */
+		ipimap->i_size += PSIZE;
+		ipimap->i_blocks += LBLK2PBLK(sb, xlen);
+
+		/*
+		 * txCommit(COMMIT_FORCE) will synchronously write address
+		 * index pages and inode after commit in careful update order
+		 * of address index pages (right to left, bottom up);
+		 *
+		 * NOTE(review): the value returned by txCommit is stored in
+		 * rc but overwritten by the diIAGRead below without being
+		 * examined.
+		 */
+		iplist[0] = ipimap;
+		rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE);
+
+		txEnd(tid);
+
+		duplicateIXtree(sb, blkno, xlen, &xaddr);
+
+		/* update the next available iag number */
+		imap->im_nextiag += 1;
+
+		/* Add the iag to the iag free list so we don't lose the iag
+		 * if a failure happens now.
+		 */
+		imap->im_freeiag = iagno;
+
+		/* Until we have logredo working, we want the imap inode &
+		 * control page to be up to date.
+		 */
+		diSync(ipimap);
+
+		/* release the inode map lock */
+		IWRITE_UNLOCK(ipimap);
+	}
+
+	/* obtain read lock on map; on success it stays held for the
+	 * caller.
+	 */
+	IREAD_LOCK(ipimap);
+
+	/* read the iag */
+	if ((rc = diIAGRead(imap, iagno, &mp))) {
+		IREAD_UNLOCK(ipimap);
+		rc = EIO;
+		goto out;
+	}
+	iagp = (iag_t *) mp->data;
+
+	/* remove the iag from the iag free list */
+	imap->im_freeiag = le32_to_cpu(iagp->iagfree);
+	iagp->iagfree = -1;
+
+	/* set the return iag number and buffer pointer */
+	*iagnop = iagno;
+	*mpp = mp;
+
+      out:
+	/* release the iag free lock */
+	IAGFREE_UNLOCK(imap);
+
+	return (rc);
+}
+
+/*
+ * NAME: diIAGRead()
+ *
+ * FUNCTION: fetch the metapage buffer holding a given iag of a
+ * fileset or aggregate inode map.
+ *
+ * PARAMETERS:
+ * imap - pointer to inode map control structure.
+ * iagno - iag number.
+ * mpp - where the buffer pointer is stored; it receives NULL
+ * when the read fails.
+ *
+ * SERIALIZATION:
+ * must have read lock on imap inode
+ * (When called by diExtendFS, the filesystem is quiesced, therefore
+ * the read lock is unnecessary.)
+ *
+ * RETURN VALUES:
+ * 0 - success.
+ * EIO - i/o error.
+ */
+static int diIAGRead(imap_t * imap, int iagno, metapage_t ** mpp)
+{
+	struct inode *map_inode = imap->im_ipimap;
+	/* logical block number of the iag page within the map */
+	s64 lblk = IAGTOLBLK(iagno, JFS_SBI(map_inode->i_sb)->l2nbperpage);
+
+	/* hand the read result straight to the caller */
+	*mpp = read_metapage(map_inode, lblk, PSIZE, 0);
+
+	return (*mpp != NULL) ? 0 : EIO;
+}
+
+/*
+ * NAME: diFindFree()
+ *
+ * FUNCTION: locate the first free (zero) bit in a word, scanning from
+ * the given bit position towards the low-order end. bit 0
+ * is the high-order bit.
+ *
+ * PARAMETERS:
+ * word - word to be examined.
+ * start - starting bit position (must be less than 32).
+ *
+ * RETURN VALUES:
+ * bit position of first free bit in the word or 32 if
+ * no free bits were found.
+ */
+static int diFindFree(u32 word, int start)
+{
+	int pos = start;
+
+	assert(start < 32);
+
+	/* shift the starting bit into the high-order position, then keep
+	 * shifting while the bit under test is still set.
+	 */
+	word <<= start;
+	while (pos < 32 && (word & HIGHORDER) != 0) {
+		word <<= 1;
+		pos++;
+	}
+
+	return (pos);
+}
+
+/*
+ * NAME: diUpdatePMap()
+ *
+ * FUNCTION: Update the persistent map in an IAG for the allocation or
+ * freeing of the specified inode, and tie the IAG's buffer
+ * into the log sync list of the committing transaction.
+ *
+ * PRE CONDITIONS: Working map has already been updated for allocate.
+ *
+ * PARAMETERS:
+ * ipimap - Incore inode map inode
+ * inum - Number of inode to mark in permanent map
+ * is_free - If TRUE indicates inode should be marked freed, otherwise
+ * indicates inode should be marked allocated.
+ * tblk - transaction block; its lsn, clsn, sb and synclist fields
+ * are used to update the IAG buffer's log sync state.
+ *
+ * RETURNS: 0 for success, otherwise the error from diIAGRead (EIO).
+ */
+int
+diUpdatePMap(struct inode *ipimap,
+ unsigned long inum, boolean_t is_free, tblock_t * tblk)
+{
+ int rc;
+ iag_t *iagp;
+ metapage_t *mp;
+ int iagno, ino, extno, bitno;
+ imap_t *imap;
+ u32 mask;
+ log_t *log;
+ int lsn, difft, diffp;
+
+ imap = JFS_IP(ipimap)->i_imap;
+ /* get the iag number containing the inode */
+ iagno = INOTOIAG(inum);
+ /* make sure that the iag is contained within the map */
+ assert(iagno < imap->im_nextiag);
+ /* read the iag; the read lock is only held across the read itself */
+ IREAD_LOCK(ipimap);
+ rc = diIAGRead(imap, iagno, &mp);
+ IREAD_UNLOCK(ipimap);
+ if (rc)
+ return (rc);
+ iagp = (iag_t *) mp->data;
+ /* get the inode number and extent number of the inode within
+ * the iag and the inode number within the extent.
+ */
+ ino = inum & (INOSPERIAG - 1);
+ extno = ino >> L2INOSPEREXT;
+ bitno = ino & (INOSPEREXT - 1);
+ mask = HIGHORDER >> bitno;
+ /*
+ * mark the inode free in persistent map:
+ */
+ if (is_free == TRUE) {
+ /* The inode should have been allocated both in working
+ * map and in persistent map;
+ * the inode will be freed from working map at the release
+ * of last reference release;
+ *
+ * an inconsistency is logged and the superblock is flagged
+ * FM_DIRTY instead of asserting; the persistent map bit is
+ * cleared in either case.
+ */
+// assert(le32_to_cpu(iagp->wmap[extno]) & mask);
+ if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
+ jERROR(1,
+ ("diUpdatePMap: inode %ld not marked as allocated in wmap!\n",
+ inum));
+ updateSuper(ipimap->i_sb, FM_DIRTY);
+ }
+// assert(le32_to_cpu(iagp->pmap[extno]) & mask);
+ if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) {
+ jERROR(1,
+ ("diUpdatePMap: inode %ld not marked as allocated in pmap!\n",
+ inum));
+ updateSuper(ipimap->i_sb, FM_DIRTY);
+ }
+ /* update the bitmap for the extent of the freed inode */
+ iagp->pmap[extno] &= cpu_to_le32(~mask);
+ }
+ /*
+ * mark the inode allocated in persistent map:
+ */
+ else {
+ /* The inode should be already allocated in the working map
+ * and should be free in persistent map;
+ */
+ assert(le32_to_cpu(iagp->wmap[extno]) & mask);
+ assert((le32_to_cpu(iagp->pmap[extno]) & mask) == 0);
+ /* update the bitmap for the extent of the allocated inode */
+ iagp->pmap[extno] |= cpu_to_le32(mask);
+ }
+ /*
+ * update iag lsn
+ *
+ * NOTE(review): mp->lsn is tested before LOGSYNC_LOCK is taken;
+ * confirm that callers serialize access to this buffer.
+ */
+ lsn = tblk->lsn;
+ log = JFS_SBI(tblk->sb)->log;
+ if (mp->lsn != 0) {
+ /* the buffer is already on a logsync list */
+ /* inherit older/smaller lsn */
+ logdiff(difft, lsn, log);
+ logdiff(diffp, mp->lsn, log);
+ if (difft < diffp) {
+ mp->lsn = lsn;
+ /* move mp after tblock in logsync list */
+ LOGSYNC_LOCK(log);
+ list_del(&mp->synclist);
+ list_add(&mp->synclist, &tblk->synclist);
+ LOGSYNC_UNLOCK(log);
+ }
+ /* inherit younger/larger clsn */
+ LOGSYNC_LOCK(log);
+ assert(mp->clsn);
+ logdiff(difft, tblk->clsn, log);
+ logdiff(diffp, mp->clsn, log);
+ if (difft > diffp)
+ mp->clsn = tblk->clsn;
+ LOGSYNC_UNLOCK(log);
+ } else {
+ /* first logged update of this buffer: adopt the transaction's
+ * lsn/clsn and count the buffer against the log.
+ */
+ mp->log = log;
+ mp->lsn = lsn;
+ /* insert mp after tblock in logsync list */
+ LOGSYNC_LOCK(log);
+ log->count++;
+ list_add(&mp->synclist, &tblk->synclist);
+ mp->clsn = tblk->clsn;
+ LOGSYNC_UNLOCK(log);
+ }
+// bmLazyWrite(mp, log->flag & JFS_COMMIT);
+ write_metapage(mp);
+ return (0);
+}
+
+/*
+ * diExtendFS()
+ *
+ * function: update imap for extendfs();
+ *
+ * rebuilds the per-AG control information (free inode list, free
+ * extent list, backed and free inode counts) by walking every iag
+ * of the map.
+ *
+ * parameters:
+ * ipimap - incore inode map inode
+ * ipbmap - incore block map inode; only its superblock's bmap
+ * (db_agl2size) is consulted here.
+ *
+ * returns: 0 on success, otherwise the last error returned by
+ * diIAGRead; iags that cannot be read are skipped.
+ *
+ * note: AG size has been increased s.t. each k old contiguous AGs are
+ * coalesced into a new AG;
+ */
+int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
+{
+	int rc, rcx = 0;
+	imap_t *imap = JFS_IP(ipimap)->i_imap;
+	iag_t *iagp = 0, *hiagp = 0;
+	bmap_t *mp = JFS_SBI(ipbmap->i_sb)->bmap;
+	metapage_t *bp, *hbp;
+	int i, n, head;
+	int numinos, xnuminos = 0, xnumfree = 0;
+	s64 agstart;
+
+	jEVENT(0, ("diExtendFS: nextiag:%d numinos:%d numfree:%d\n",
+		   imap->im_nextiag, atomic_read(&imap->im_numinos),
+		   atomic_read(&imap->im_numfree)));
+
+	/*
+	 *      reconstruct imap
+	 *
+	 * coalesce contiguous k (newAGSize/oldAGSize) AGs;
+	 * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn;
+	 * note: new AG size = old AG size * (2**x).
+	 */
+
+	/* init per AG control information im_agctl[] */
+	for (i = 0; i < MAXAG; i++) {
+		imap->im_agctl[i].inofree = -1;	/* free inode list */
+		imap->im_agctl[i].extfree = -1;	/* free extent list */
+		imap->im_agctl[i].numinos = 0;	/* number of backed inodes */
+		imap->im_agctl[i].numfree = 0;	/* number of free backed inodes */
+	}
+
+	/*
+	 *      process each iag_t page of the map.
+	 *
+	 * rebuild AG Free Inode List, AG Free Inode Extent List;
+	 */
+	for (i = 0; i < imap->im_nextiag; i++) {
+		if ((rc = diIAGRead(imap, i, &bp))) {
+			rcx = rc;
+			continue;
+		}
+		iagp = (iag_t *) bp->data;
+		assert(le32_to_cpu(iagp->iagnum) == i);
+
+		/* leave free iag in the free iag list */
+		if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
+			release_metapage(bp);
+			continue;
+		}
+
+		/* agstart that computes to the same ag is treated as same; */
+		agstart = le64_to_cpu(iagp->agstart);
+		/* iagp->agstart = agstart & ~(mp->db_agsize - 1); */
+		n = agstart >> mp->db_agl2size;
+
+		/* compute backed inodes */
+		numinos = (EXTSPERIAG - le32_to_cpu(iagp->nfreeexts))
+		    << L2INOSPEREXT;
+		if (numinos > 0) {
+			/* merge AG backed inodes */
+			imap->im_agctl[n].numinos += numinos;
+			xnuminos += numinos;
+		}
+
+		/* if any backed free inodes, insert at AG free inode list */
+		if ((int) le32_to_cpu(iagp->nfreeinos) > 0) {
+			if ((head = imap->im_agctl[n].inofree) == -1)
+				iagp->inofreefwd = iagp->inofreeback = -1;
+			else {
+				if ((rc = diIAGRead(imap, head, &hbp))) {
+					rcx = rc;
+					goto nextiag;
+				}
+				hiagp = (iag_t *) hbp->data;
+				/* iag link fields hold little-endian values,
+				 * so copy iagnum as stored rather than
+				 * converting it to cpu order (same as the
+				 * extfreeback update below).
+				 */
+				hiagp->inofreeback = iagp->iagnum;
+				iagp->inofreefwd = cpu_to_le32(head);
+				iagp->inofreeback = -1;
+				write_metapage(hbp);
+			}
+
+			imap->im_agctl[n].inofree =
+			    le32_to_cpu(iagp->iagnum);
+
+			/* merge AG backed free inodes */
+			imap->im_agctl[n].numfree +=
+			    le32_to_cpu(iagp->nfreeinos);
+			xnumfree += le32_to_cpu(iagp->nfreeinos);
+		}
+
+		/* if any free extents, insert at AG free extent list */
+		if (le32_to_cpu(iagp->nfreeexts) > 0) {
+			if ((head = imap->im_agctl[n].extfree) == -1)
+				iagp->extfreefwd = iagp->extfreeback = -1;
+			else {
+				if ((rc = diIAGRead(imap, head, &hbp))) {
+					rcx = rc;
+					goto nextiag;
+				}
+				hiagp = (iag_t *) hbp->data;
+				hiagp->extfreeback = iagp->iagnum;
+				iagp->extfreefwd = cpu_to_le32(head);
+				iagp->extfreeback = -1;
+				write_metapage(hbp);
+			}
+
+			imap->im_agctl[n].extfree =
+			    le32_to_cpu(iagp->iagnum);
+		}
+
+	      nextiag:
+		/* the iag is written back even when a list head could not
+		 * be read; the failure is reported through rcx.
+		 */
+		write_metapage(bp);
+	}
+
+	/* the rebuilt totals must agree with the map-wide counters */
+	ASSERT(xnuminos == atomic_read(&imap->im_numinos) &&
+	       xnumfree == atomic_read(&imap->im_numfree));
+
+	return rcx;
+}
+
+
+/*
+ * duplicateIXtree()
+ *
+ * function: mirror an inode map extension into the special inode
+ * read via diReadSpecial(sb, FILESYSTEM_I + INOSPEREXT). any failure
+ * sets JFS_BAD_SAIT in the mount flags, after which further calls
+ * return immediately.
+ *
+ * serialization: IWRITE_LOCK held on entry/exit
+ *
+ * note: shadow page with regular inode (rel.2);
+ */
+static void
+duplicateIXtree(struct super_block *sb, s64 blkno, int xlen, s64 * xaddr)
+{
+	struct jfs_sb_info *sbi = JFS_SBI(sb);
+	struct jfs_superblock *j_sb;
+	metapage_t *mpsuper;
+	struct inode *ip;
+	tid_t tid;
+
+	/* if AIT2 ipmap2 is bad, do not try to update it */
+	if (sbi->mntflag & JFS_BAD_SAIT)	/* s_flag */
+		return;
+
+	ip = diReadSpecial(sb, FILESYSTEM_I + INOSPEREXT);
+	if (ip == 0) {
+		/* remember the failure in memory and, when the superblock
+		 * can be read, in its s_flag as well.
+		 */
+		sbi->mntflag |= JFS_BAD_SAIT;
+		if (readSuper(sb, &mpsuper))
+			return;
+		j_sb = (struct jfs_superblock *) (mpsuper->data);
+		j_sb->s_flag |= JFS_BAD_SAIT;
+		write_metapage(mpsuper);
+		return;
+	}
+
+	/* start transaction */
+	tid = txBegin(sb, COMMIT_FORCE);
+
+	/* update the inode map addressing structure to point to it */
+	if (xtInsert(tid, ip, 0, blkno, xlen, xaddr, 0)) {
+		sbi->mntflag |= JFS_BAD_SAIT;
+		txAbort(tid, 1);
+	} else {
+		/* update the inode map's inode to reflect the extension */
+		ip->i_size += PSIZE;
+		ip->i_blocks += LBLK2PBLK(sb, xlen);
+		txCommit(tid, 1, &ip, COMMIT_FORCE);
+	}
+
+	/* both outcomes end the transaction and drop the special inode */
+	txEnd(tid);
+	diFreeSpecial(ip);
+}
+
+/*
+ * NAME: copy_from_dinode()
+ *
+ * FUNCTION: Fills the in-memory inode (and its JFS-private part) from
+ * a disk inode, converting little-endian fields to cpu order,
+ * and clears the fields that exist only in memory.
+ *
+ * RETURN VALUES:
+ * 0 - always
+ */
+static int copy_from_dinode(dinode_t * dip, struct inode *ip)
+{
+	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+
+	/* generic inode fields */
+	ip->i_mode = le32_to_cpu(dip->di_mode) & 0xffff;
+	ip->i_nlink = le32_to_cpu(dip->di_nlink);
+	ip->i_uid = le32_to_cpu(dip->di_uid);
+	ip->i_gid = le32_to_cpu(dip->di_gid);
+	ip->i_size = le64_to_cpu(dip->di_size);
+	ip->i_atime = le32_to_cpu(dip->di_atime.tv_sec);
+	ip->i_mtime = le32_to_cpu(dip->di_mtime.tv_sec);
+	ip->i_ctime = le32_to_cpu(dip->di_ctime.tv_sec);
+	ip->i_blksize = ip->i_sb->s_blocksize;
+	ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks));
+	ip->i_version = ++event;
+	ip->i_generation = le32_to_cpu(dip->di_gen);
+
+	/* JFS-private fields; mode2 keeps the full 32-bit mode */
+	jfs_ip->fileset = le32_to_cpu(dip->di_fileset);
+	jfs_ip->mode2 = le32_to_cpu(dip->di_mode);
+	jfs_ip->ixpxd = dip->di_ixpxd;	/* in-memory pxd's are little-endian */
+	jfs_ip->acl = dip->di_acl;	/* as are dxd's */
+	jfs_ip->ea = dip->di_ea;
+	jfs_ip->next_index = le32_to_cpu(dip->di_next_index);
+	jfs_ip->otime = le32_to_cpu(dip->di_otime.tv_sec);
+	jfs_ip->acltype = le32_to_cpu(dip->di_acltype);
+
+	/*
+	 * file-type specific data.
+	 * We may only need to do this for "special" inodes (dmap, imap)
+	 */
+	switch (ip->i_mode & S_IFMT) {
+	case S_IFCHR:
+	case S_IFBLK:
+		ip->i_rdev = to_kdev_t(le32_to_cpu(dip->di_rdev));
+		break;
+	case S_IFDIR:
+		memcpy(&jfs_ip->i_dirtable, &dip->di_dirtable, 384);
+		break;
+	case S_IFIFO:
+		/* nothing further to copy */
+		break;
+	default:
+		memcpy(&jfs_ip->i_xtroot, &dip->di_xtroot, 288);
+		break;
+	}
+
+	/* Zero the in-memory-only stuff */
+	jfs_ip->xtlid = 0;
+	jfs_ip->atltail = 0;
+	jfs_ip->atlhead = 0;
+	jfs_ip->blid = 0;
+	jfs_ip->bxflag = 0;
+	jfs_ip->btorder = 0;
+	jfs_ip->btindex = 0;
+	jfs_ip->cflag = 0;
+
+	return (0);
+}
+
+/*
+ * NAME: copy_to_dinode()
+ *
+ * FUNCTION: Writes the in-memory inode's state into a disk inode,
+ * converting cpu-order fields to little-endian. Nanosecond
+ * parts of all timestamps are stored as zero.
+ */
+static void copy_to_dinode(dinode_t * dip, struct inode *ip)
+{
+	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+
+	/* identity */
+	dip->di_fileset = cpu_to_le32(jfs_ip->fileset);
+	dip->di_inostamp = cpu_to_le32(JFS_SBI(ip->i_sb)->inostamp);
+	dip->di_number = cpu_to_le32(ip->i_ino);
+	dip->di_gen = cpu_to_le32(ip->i_generation);
+
+	/* size, links and ownership */
+	dip->di_size = cpu_to_le64(ip->i_size);
+	dip->di_nblocks = cpu_to_le64(PBLK2LBLK(ip->i_sb, ip->i_blocks));
+	dip->di_nlink = cpu_to_le32(ip->i_nlink);
+	dip->di_uid = cpu_to_le32(ip->i_uid);
+	dip->di_gid = cpu_to_le32(ip->i_gid);
+
+	/*
+	 * mode2 is only needed for storing the higher order bits.
+	 * Trust i_mode for the lower order ones
+	 */
+	dip->di_mode = cpu_to_le32((jfs_ip->mode2 & 0xffff0000) | ip->i_mode);
+
+	/* timestamps: seconds first, then the (always zero) nanoseconds */
+	dip->di_atime.tv_sec = cpu_to_le32(ip->i_atime);
+	dip->di_ctime.tv_sec = cpu_to_le32(ip->i_ctime);
+	dip->di_mtime.tv_sec = cpu_to_le32(ip->i_mtime);
+	dip->di_otime.tv_sec = cpu_to_le32(jfs_ip->otime);
+	dip->di_atime.tv_nsec = 0;
+	dip->di_ctime.tv_nsec = 0;
+	dip->di_mtime.tv_nsec = 0;
+	dip->di_otime.tv_nsec = 0;
+
+	/* descriptors are kept little-endian in memory, so copy as is */
+	dip->di_ixpxd = jfs_ip->ixpxd;
+	dip->di_acl = jfs_ip->acl;
+	dip->di_ea = jfs_ip->ea;
+	dip->di_next_index = cpu_to_le32(jfs_ip->next_index);
+	dip->di_acltype = cpu_to_le32(jfs_ip->acltype);
+
+	/* device number only applies to character and block devices */
+	if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode))
+		dip->di_rdev = cpu_to_le32(kdev_t_to_nr(ip->i_rdev));
+}
+
+#ifdef _JFS_DEBUG_IMAP
+/*
+ * DBGdiInit()
+ *
+ * debug aid: allocate and zero a 64K shadow bitmap (one bit per inode)
+ * and attach it to the inode map as im_DBGdimap.
+ *
+ * returns the bitmap that was attached.
+ */
+static void *DBGdiInit(imap_t * imap)
+{
+	u32 *dimap;
+	int size;
+	size = 64 * 1024;
+	if ((dimap = (u32 *) xmalloc(size, L2PSIZE, kernel_heap)) == NULL)
+		assert(0);
+	bzero((void *) dimap, size);
+	imap->im_DBGdimap = dimap;
+
+	/* the function is declared to return a pointer; hand back the
+	 * bitmap instead of falling off the end without a value.
+	 */
+	return dimap;
+}
+
+/*
+ * DBGdiAlloc()
+ *
+ * debug aid: set the bit for ino in the shadow bitmap, reporting an
+ * inode whose bit is already set (allocated twice).
+ */
+static void DBGdiAlloc(imap_t * imap, ino_t ino)
+{
+	u32 *dimap = imap->im_DBGdimap;
+	int w, b;
+	u32 m;
+	w = ino >> 5;		/* 32 inodes per bitmap word */
+	b = ino & 31;
+	m = 0x80000000 >> b;	/* bit 0 is the high-order bit */
+	assert(w < 64 * 256);	/* 64K bitmap = 64 * 256 words */
+	if (dimap[w] & m) {
+		/* widen explicitly so the argument matches the format
+		 * whatever the width of ino_t is.
+		 */
+		printk("DEBUG diAlloc: duplicate alloc ino:0x%lx\n",
+		       (unsigned long) ino);
+	}
+	dimap[w] |= m;
+}
+
+/*
+ * DBGdiFree()
+ *
+ * debug aid: clear the bit for ino in the shadow bitmap, reporting an
+ * inode whose bit is already clear (freed twice).
+ */
+static void DBGdiFree(imap_t * imap, ino_t ino)
+{
+	u32 *dimap = imap->im_DBGdimap;
+	int w, b;
+	u32 m;
+	w = ino >> 5;		/* 32 inodes per bitmap word */
+	b = ino & 31;
+	m = 0x80000000 >> b;	/* bit 0 is the high-order bit */
+	assert(w < 64 * 256);	/* 64K bitmap = 64 * 256 words */
+	if ((dimap[w] & m) == 0) {
+		/* widen explicitly so the argument matches the format
+		 * whatever the width of ino_t is.
+		 */
+		printk("DEBUG diFree: duplicate free ino:0x%lx\n",
+		       (unsigned long) ino);
+	}
+	dimap[w] &= ~m;
+}
+
+/*
+ * dump_cp()
+ *
+ * debug aid: print the inode map control information (free/next iag,
+ * map-wide inode counts, and the control entry of ag 0), tagged with
+ * the calling function's name and line.
+ */
+static void dump_cp(imap_t * ipimap, char *function, int line)
+{
+	int numinos, numfree;
+
+	printk("\n* ********* *\nControl Page %s %d\n", function, line);
+	printk("FreeIAG %d\tNextIAG %d\n", ipimap->im_freeiag,
+	       ipimap->im_nextiag);
+
+	/* sample both map-wide counters right before printing them */
+	numinos = atomic_read(&ipimap->im_numinos);
+	numfree = atomic_read(&ipimap->im_numfree);
+	printk("NumInos %d\tNumFree %d\n", numinos, numfree);
+
+	/* only allocation group 0 is reported */
+	printk("AG InoFree %d\tAG ExtFree %d\n",
+	       ipimap->im_agctl[0].inofree, ipimap->im_agctl[0].extfree);
+	printk("AG NumInos %d\tAG NumFree %d\n",
+	       ipimap->im_agctl[0].numinos, ipimap->im_agctl[0].numfree);
+}
+
+/*
+ * dump_iag()
+ *
+ * debug aid: print an iag's number, list links and free counts in cpu
+ * byte order, tagged with the calling function's name and line.
+ */
+static void dump_iag(iag_t * iag, char *function, int line)
+{
+	int num = le32_to_cpu(iag->iagnum);
+	int nextfree = le32_to_cpu(iag->iagfree);
+	int ino_fwd = le32_to_cpu(iag->inofreefwd);
+	int ino_back = le32_to_cpu(iag->inofreeback);
+	int ext_fwd = le32_to_cpu(iag->extfreefwd);
+	int ext_back = le32_to_cpu(iag->extfreeback);
+	int free_inos = le32_to_cpu(iag->nfreeinos);
+	int free_exts = le32_to_cpu(iag->nfreeexts);
+
+	printk("\n* ********* *\nIAG %s %d\n", function, line);
+	printk("IagNum %d\tIAG Free %d\n", num, nextfree);
+	printk("InoFreeFwd %d\tInoFreeBack %d\n", ino_fwd, ino_back);
+	printk("ExtFreeFwd %d\tExtFreeBack %d\n", ext_fwd, ext_back);
+	printk("NFreeInos %d\tNFreeExts %d\n", free_inos, free_exts);
+}
+#endif /* _JFS_DEBUG_IMAP */
diff --git a/fs/jfs/jfs_imap.h b/fs/jfs/jfs_imap.h
new file mode 100644
index 000000000000..58cc5e195b4c
--- /dev/null
+++ b/fs/jfs/jfs_imap.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_IMAP
+#define _H_JFS_IMAP
+
+#include "jfs_txnmgr.h"
+
+/*
+ * jfs_imap.h: disk inode manager
+ */
+
+#define EXTSPERIAG 128 /* number of disk inode extents per iag */
+#define IMAPBLKNO 0 /* lblkno of dinomap within inode map */
+#define SMAPSZ 4 /* number of words per summary map */
+#define EXTSPERSUM 32 /* number of extents per summary map entry */
+#define L2EXTSPERSUM 5 /* l2 number of extents per summary map entry (log2 of EXTSPERSUM) */
+#define PGSPERIEXT 4 /* number of 4K pages per dinode extent */
+#define MAXIAGS ((1<<20)-1) /* maximum number of iags */
+#define MAXAG 128 /* maximum number of allocation groups */
+
+#define AMAPSIZE 512 /* bytes in the IAG allocation maps (EXTSPERIAG 32-bit words) */
+#define SMAPSIZE 16 /* bytes in the IAG summary maps (SMAPSZ 32-bit words) */
+
+/* convert inode number to iag number */
+#define INOTOIAG(ino) ((ino) >> L2INOSPERIAG)
+
+/* convert iag number to logical block number of the iag page;
+ * l2nbperpg is log2 of the number of blocks per page.
+ * NOTE(review): the "+ 1" presumably skips the control page that sits at
+ * IMAPBLKNO (page 0) of the inode map -- confirm.
+ */
+#define IAGTOLBLK(iagno,l2nbperpg) (((iagno) + 1) << (l2nbperpg))
+
+/* get the starting block number of the 4K page of an inode extent
+ * that contains ino.
+ *
+ * pxd describes the extent: (ino & (INOSPEREXT-1)) is the index of the
+ * inode within its extent, >> L2INOSPERPAGE turns that into a page index
+ * and << l2nbperpg into a block offset from the extent's start address.
+ */
+#define INOPBLK(pxd,ino,l2nbperpg) (addressPXD((pxd)) + \
+ ((((ino) & (INOSPEREXT-1)) >> L2INOSPERPAGE) << (l2nbperpg)))
+
+/*
+ * inode allocation map:
+ *
+ * inode allocation map consists of
+ * . the inode map control page and
+ * . inode allocation group pages (per 4096 inodes)
+ * which are addressed by standard JFS xtree.
+ */
+/*
+ * inode allocation group page (per 4096 inodes of an AG)
+ *
+ * The number at the start of each field comment is the field size in
+ * bytes; the parenthesised numbers are running totals. The header
+ * fields take 72 bytes and are padded to 2048; the two allocation maps
+ * and the extent address array fill the remaining 2048 bytes.
+ *
+ * The 32-bit header fields are accessed through le32_to_cpu() (see
+ * dump_iag() in jfs_imap.c), i.e. they are kept in little-endian order.
+ */
+typedef struct {
+ s64 agstart; /* 8: starting block of ag */
+ s32 iagnum; /* 4: inode allocation group number */
+ s32 inofreefwd; /* 4: ag inode free list forward */
+ s32 inofreeback; /* 4: ag inode free list back */
+ s32 extfreefwd; /* 4: ag inode extent free list forward */
+ s32 extfreeback; /* 4: ag inode extent free list back */
+ s32 iagfree; /* 4: iag free list */
+
+ /* summary map: 1 bit per inode extent */
+ s32 inosmap[SMAPSZ]; /* 16: sum map of mapwords w/ free inodes;
+ * note: this indicates free and backed
+ * inodes, if the extent is not backed the
+ * value will be 1. if the extent is
+ * backed but all inodes are being used the
+ * value will be 1. if the extent is
+ * backed but at least one of the inodes is
+ * free the value will be 0.
+ */
+ s32 extsmap[SMAPSZ]; /* 16: sum map of mapwords w/ free extents */
+ s32 nfreeinos; /* 4: number of free inodes */
+ s32 nfreeexts; /* 4: number of free extents */
+ /* (72) */
+ u8 pad[1976]; /* 1976: pad to 2048 bytes */
+ /* allocation bit map: 1 bit per inode (0 - free, 1 - allocated) */
+ u32 wmap[EXTSPERIAG]; /* 512: working allocation map */
+ u32 pmap[EXTSPERIAG]; /* 512: persistent allocation map */
+ pxd_t inoext[EXTSPERIAG]; /* 1024: inode extent addresses */
+} iag_t; /* (4096) */
+
+/*
+ * per AG control information (in inode map control page)
+ *
+ * One entry per allocation group; dinomap_t.in_agctl[] holds MAXAG of
+ * them. The leading number in each field comment is the field size in
+ * bytes.
+ */
+typedef struct {
+ s32 inofree; /* 4: free inode list anchor */
+ s32 extfree; /* 4: free extent list anchor */
+ s32 numinos; /* 4: number of backed inodes */
+ s32 numfree; /* 4: number of free inodes */
+} iagctl_t; /* (16) */
+
+/*
+ * per fileset/aggregate inode map control page
+ *
+ * 32 bytes of counters padded to 2048, followed by 2048 bytes of per-AG
+ * control entries (MAXAG entries of 16 bytes each), 4096 bytes in all.
+ */
+typedef struct {
+ s32 in_freeiag; /* 4: free iag list anchor */
+ s32 in_nextiag; /* 4: next free iag number */
+ s32 in_numinos; /* 4: num of backed inodes */
+ s32 in_numfree; /* 4: num of free backed inodes */
+ s32 in_nbperiext; /* 4: num of blocks per inode extent */
+ s32 in_l2nbperiext; /* 4: l2 of in_nbperiext */
+ s32 in_diskblock; /* 4: for standalone test driver */
+ s32 in_maxag; /* 4: for standalone test driver */
+ u8 pad[2016]; /* 2016: pad to 2048 */
+ iagctl_t in_agctl[MAXAG]; /* 2048: AG control information */
+} dinomap_t; /* (4096) */
+
+
+/*
+ * In-core inode map control page
+ *
+ * Embeds a dinomap_t (im_imap) and adds the run-time state around it:
+ * the map's inode, the locks and atomic inode counters. The im_*
+ * macros below give direct access to the embedded dinomap_t fields.
+ *
+ * NOTE(review): the byte counts in the field comments (4 for a
+ * semaphore, 512 for MAXAG semaphores) look inherited from another
+ * platform and are unlikely to match struct semaphore here -- confirm
+ * before relying on them.
+ */
+typedef struct inomap {
+ dinomap_t im_imap; /* 4096: inode allocation control */
+ struct inode *im_ipimap; /* 4: ptr to inode for imap */
+ struct semaphore im_freelock; /* 4: iag free list lock */
+ struct semaphore im_aglock[MAXAG]; /* 512: per AG locks */
+ u32 *im_DBGdimap; /* debug-only inode bitmap used by DBGdiAlloc()/DBGdiFree() */
+ atomic_t im_numinos; /* num of backed inodes */
+ atomic_t im_numfree; /* num of free backed inodes */
+} imap_t;
+
+#define im_freeiag im_imap.in_freeiag /* shorthand for fields of the embedded dinomap_t */
+#define im_nextiag im_imap.in_nextiag
+#define im_agctl im_imap.in_agctl
+#define im_nbperiext im_imap.in_nbperiext
+#define im_l2nbperiext im_imap.in_l2nbperiext
+
+/* for standalone testdriver
+ */
+#define im_diskblock im_imap.in_diskblock
+#define im_maxag im_imap.in_maxag
+
+extern int diFree(struct inode *);
+extern int diAlloc(struct inode *, boolean_t, struct inode *);
+extern int diSync(struct inode *);
+/* external references */
+extern int diUpdatePMap(struct inode *ipimap, unsigned long inum,
+ boolean_t is_free, tblock_t * tblk);
+#ifdef _STILL_TO_PORT
+extern int diExtendFS(inode_t * ipimap, inode_t * ipbmap);
+#endif /* _STILL_TO_PORT */
+
+extern int diMount(struct inode *);
+extern int diUnmount(struct inode *, int);
+extern int diRead(struct inode *);
+extern void diClearExtension(struct inode *);
+extern struct inode *diReadSpecial(struct super_block *, ino_t);
+extern void diWriteSpecial(struct inode *);
+extern void diFreeSpecial(struct inode *);
+extern int diWrite(tid_t tid, struct inode *);
+#endif /* _H_JFS_IMAP */
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
new file mode 100644
index 000000000000..53debd12bd11
--- /dev/null
+++ b/fs/jfs/jfs_incore.h
@@ -0,0 +1,149 @@
+/*
+ *
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+*/
+#ifndef _H_JFS_INCORE
+#define _H_JFS_INCORE
+
+#include <linux/slab.h>
+#include <asm/bitops.h>
+#include "jfs_types.h"
+#include "jfs_xtree.h"
+#include "jfs_dtree.h"
+
+/*
+ * JFS magic number
+ *
+ * The bytes 0x4a 0x46 0x53 0x31 ('J' 'F' 'S' '1') read as a
+ * little-endian 32-bit value.
+ */
+#define JFS_SUPER_MAGIC 0x3153464a /* "JFS1" */
+
+/*
+ * Due to header ordering problems this can't be in jfs_lock.h
+ *
+ * A rw_semaphore plus a counter; the lock/unlock/trylock wrappers that
+ * operate on it (READ_LOCK, WRITE_LOCK, WRITE_TRYLOCK, ...) live in
+ * jfs_lock.h.
+ */
+typedef struct jfs_rwlock {
+ struct rw_semaphore rw_sem;
+ atomic_t in_use; /* for hacked implementation of trylock */
+} jfs_rwlock_t;
+
+/*
+ * JFS-private inode information
+ *
+ * The VFS inode is embedded as the last member (vfs_inode); JFS_IP()
+ * converts a struct inode pointer back to the enclosing
+ * jfs_inode_info.
+ *
+ * The union 'u' overlays three alternative tails (file, dir, link); the
+ * i_* macros after the structure are shorthands for its members. The
+ * leading number in those member comments is a size in bytes.
+ */
+struct jfs_inode_info {
+ int fileset; /* fileset number (always 16)*/
+ uint mode2; /* jfs-specific mode */
+ pxd_t ixpxd; /* inode extent descriptor */
+ dxd_t acl; /* dxd describing acl */
+ dxd_t ea; /* dxd describing ea */
+ time_t otime; /* time created */
+ uint next_index; /* next available directory entry index */
+ int acltype; /* Type of ACL */
+ short btorder; /* access order */
+ short btindex; /* btpage entry index*/
+ struct inode *ipimap; /* inode map */
+ long cflag; /* commit flags (bit numbers from enum cflags) */
+ u16 bxflag; /* xflag of pseudo buffer? */
+ unchar agno; /* ag number */
+ unchar pad; /* pad */
+ lid_t blid; /* lid of pseudo buffer? */
+ lid_t atlhead; /* anonymous tlock list head */
+ lid_t atltail; /* anonymous tlock list tail */
+ struct list_head anon_inode_list; /* inodes having anonymous txns */
+ struct list_head mp_list; /* metapages in inode's address space */
+ jfs_rwlock_t rdwrlock; /* read/write lock */
+ lid_t xtlid; /* lid of xtree lock on directory */
+ union {
+ struct {
+ xtpage_t _xtroot; /* 288: xtree root */
+ struct inomap *_imap; /* 4: inode map header */
+ } file;
+ struct {
+ dir_table_slot_t _table[12]; /* 96: directory index */
+ dtroot_t _dtroot; /* 288: dtree root */
+ } dir;
+ struct {
+ unchar _unused[16]; /* 16: */
+ dxd_t _dxd; /* 16: */
+ unchar _inline[128]; /* 128: inline symlink */
+ } link;
+ } u;
+ struct inode vfs_inode;
+};
+#define i_xtroot u.file._xtroot /* shorthands for members of union u */
+#define i_imap u.file._imap
+#define i_dirtable u.dir._table
+#define i_dtroot u.dir._dtroot
+#define i_inline u.link._inline
+
+/*
+ * cflag
+ *
+ * Each enumerator is a bit number (not a mask) within
+ * jfs_inode_info.cflag; the set_cflag()/clear_cflag()/test_cflag()
+ * macros below pass it to the kernel bit operations.
+ */
+enum cflags {
+ COMMIT_New, /* never committed inode */
+ COMMIT_Nolink, /* inode committed with zero link count */
+ COMMIT_Inlineea, /* commit inode inline EA */
+ COMMIT_Freewmap, /* free WMAP at iClose() */
+ COMMIT_Dirty, /* Inode is really dirty */
+ COMMIT_Holdlock, /* Hold the IWRITE_LOCK until commit is done */
+ COMMIT_Dirtable, /* commit changes to di_dirtable */
+ COMMIT_Stale, /* data extent is no longer valid */
+ COMMIT_Synclist, /* metadata pages on group commit synclist */
+};
+
+#define set_cflag(flag, ip) set_bit(flag, &(JFS_IP(ip)->cflag)) /* ip is a struct inode *; flag is an enum cflags value */
+#define clear_cflag(flag, ip) clear_bit(flag, &(JFS_IP(ip)->cflag))
+#define test_cflag(flag, ip) test_bit(flag, &(JFS_IP(ip)->cflag))
+#define test_and_clear_cflag(flag, ip) \
+ test_and_clear_bit(flag, &(JFS_IP(ip)->cflag))
+/*
+ * JFS-private superblock information.
+ *
+ * Reached from a VFS super_block through JFS_SBI(). The leading number
+ * in each field comment is a size in bytes.
+ */
+struct jfs_sb_info {
+ unsigned long mntflag; /* 4: aggregate attributes */
+ struct inode *ipbmap; /* 4: block map inode */
+ struct inode *ipaimap; /* 4: aggregate inode map inode */
+ struct inode *ipaimap2; /* 4: secondary aimap inode */
+ struct inode *ipimap; /* 4: fileset inode map inode -- NOTE(review): confirm */
+ struct jfs_log *log; /* 4: log; NULL means read-only (see isReadOnly()) */
+ short bsize; /* 2: logical block size */
+ short l2bsize; /* 2: log2 logical block size */
+ short nbperpage; /* 2: blocks per page */
+ short l2nbperpage; /* 2: log2 blocks per page */
+ short l2niperblk; /* 2: log2 inodes per page */
+ short reserved; /* 2: reserved */
+ pxd_t logpxd; /* 8: pxd describing log */
+ pxd_t ait2; /* 8: pxd describing AIT copy */
+ /* Formerly in ipimap */
+ uint gengen; /* 4: inode generation generator*/
+ uint inostamp; /* 4: shows inode belongs to fileset*/
+
+ /* Formerly in ipbmap */
+ struct bmap *bmap; /* 4: incore bmap descriptor */
+ struct nls_table *nls_tab; /* 4: current codepage */
+ struct inode *direct_inode; /* 4: inode for physical I/O */
+ struct address_space *direct_mapping; /* 4: mapping for physical I/O */
+ uint state; /* 4: mount/recovery state */
+};
+
+static inline struct jfs_inode_info *JFS_IP(struct inode *inode)
+{
+ return list_entry(inode, struct jfs_inode_info, vfs_inode); /* inode is the vfs_inode member embedded in a jfs_inode_info: step back to the enclosing structure */
+}
+#define JFS_SBI(sb) ((struct jfs_sb_info *)(sb)->u.generic_sbp) /* JFS-private data of a VFS super_block */
+
+#define isReadOnly(ip) ((JFS_SBI((ip)->i_sb)->log) ? 0 : 1) /* 1 when the inode's superblock has no log attached, else 0 */
+
+#endif /* _H_JFS_INCORE */
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
new file mode 100644
index 000000000000..fb4384c71277
--- /dev/null
+++ b/fs/jfs/jfs_inode.c
@@ -0,0 +1,132 @@
+/*
+ *
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_imap.h"
+#include "jfs_dinode.h"
+#include "jfs_debug.h"
+
+/*
+ * NAME:	ialloc()
+ *
+ * FUNCTION:	Allocate a new inode and initialise its VFS and
+ *		JFS-private fields
+ *
+ * PARAMETER:	parent	- inode whose superblock, and (when its setgid
+ *			  bit is set) group, the new inode takes over
+ *		mode	- mode for the new inode
+ *
+ * RETURN:	the new inode, or NULL if new_inode() or diAlloc() failed
+ */
+struct inode *ialloc(struct inode *parent, umode_t mode)
+{
+	struct super_block *sb = parent->i_sb;
+	struct inode *ip = new_inode(sb);
+	struct jfs_inode_info *ji;
+	int err;
+
+	if (ip == NULL) {
+		jERROR(1, ("ialloc: new_inode returned NULL!\n"));
+		return ip;
+	}
+	ji = JFS_IP(ip);
+
+	err = diAlloc(parent, S_ISDIR(mode), ip);
+	if (err != 0) {
+		/* hand the half-built inode back to the VFS as a bad inode */
+		jERROR(1, ("ialloc: diAlloc returned %d!\n", err));
+		make_bad_inode(ip);
+		iput(ip);
+		return NULL;
+	}
+
+	/* ownership: a setgid parent passes on its group (and, to
+	 * directories, the setgid bit itself)
+	 */
+	ip->i_uid = current->fsuid;
+	if (!(parent->i_mode & S_ISGID)) {
+		ip->i_gid = current->fsgid;
+	} else {
+		ip->i_gid = parent->i_gid;
+		if (S_ISDIR(mode))
+			mode |= S_ISGID;
+	}
+
+	ip->i_mode = mode;
+	ji->mode2 = (S_ISDIR(mode) ? IDIRECTORY : (INLINEEA | ISPARSE)) | mode;
+
+	ip->i_blksize = sb->s_blocksize;
+	ip->i_blocks = 0;
+	ip->i_mtime = ip->i_atime = ip->i_ctime = CURRENT_TIME;
+	ji->otime = ip->i_ctime;
+	ip->i_version = ++event;
+	ip->i_generation = JFS_SBI(sb)->gengen++;
+
+	/* a fresh inode has never been committed */
+	ji->cflag = 0;
+	set_cflag(COMMIT_New, ip);
+
+	/* Zero remaining fields */
+	memset(&ji->acl, 0, sizeof(dxd_t));
+	memset(&ji->ea, 0, sizeof(dxd_t));
+	ji->acltype = 0;
+	ji->next_index = 0;
+	ji->btorder = 0;
+	ji->btindex = 0;
+	ji->bxflag = 0;
+	ji->blid = 0;
+	ji->atlhead = 0;
+	ji->atltail = 0;
+	ji->xtlid = 0;
+
+	jFYI(1, ("ialloc returns inode = 0x%p\n", ip));
+
+	return ip;
+}
+
+/*
+ * NAME:	iwritelocklist()
+ *
+ * FUNCTION:	Lock multiple inodes in sorted order to avoid deadlock
+ *
+ * PARAMETER:	n	- number of inode pointers that follow (at most 4,
+ *			  the size of the local sort array)
+ *		...	- n arguments of type struct inode *
+ *
+ * The inodes are write-locked with IWRITE_LOCK() in descending i_ino
+ * order, so every caller takes a given set of inodes in the same
+ * sequence. A count larger than the sort array is a caller bug and
+ * trips BUG() instead of overrunning the stack.
+ */
+void iwritelocklist(int n, ...)
+{
+	va_list ilist;
+	struct inode *sort[4];
+	struct inode *ip;
+	int k, m;
+
+	/* sort[] is fixed size: refuse a count that would overrun it */
+	if (n > (int) (sizeof(sort) / sizeof(sort[0]))) {
+		jERROR(1, ("iwritelocklist: too many inodes (%d)\n", n));
+		BUG();
+	}
+
+	va_start(ilist, n);
+	for (k = 0; k < n; k++)
+		sort[k] = va_arg(ilist, struct inode *);
+	va_end(ilist);
+
+	/* Bubble sort in descending order; m counts swaps per pass */
+	do {
+		m = 0;
+		for (k = 0; k + 1 < n; k++)
+			if (sort[k + 1]->i_ino > sort[k]->i_ino) {
+				ip = sort[k];
+				sort[k] = sort[k + 1];
+				sort[k + 1] = ip;
+				m++;
+			}
+	} while (m);
+
+	/* Lock them */
+	for (k = 0; k < n; k++) {
+		IWRITE_LOCK(sort[k]);
+	}
+}
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
new file mode 100644
index 000000000000..ba1c14982c70
--- /dev/null
+++ b/fs/jfs/jfs_inode.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_INODE
+#define _H_JFS_INODE
+
+extern struct inode *ialloc(struct inode *, umode_t);
+
+#endif /* _H_JFS_INODE */
diff --git a/fs/jfs/jfs_lock.h b/fs/jfs/jfs_lock.h
new file mode 100644
index 000000000000..c30a633e26d1
--- /dev/null
+++ b/fs/jfs/jfs_lock.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_LOCK
+#define _H_JFS_LOCK
+
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+
+/*
+ * jfs_lock.h
+ *
+ * JFS lock definition for globally referenced locks
+ */
+
+/* readers/writer lock: thread-thread */
+
+/*
+ * RW semaphores do not currently have a trylock function. Since the
+ * implementation varies by platform, I have implemented a platform-independent
+ * wrapper around the rw_semaphore routines. If this turns out to be the best
+ * way of avoiding our locking problems, I will push to get a trylock
+ * implemented in the kernel, but I'd rather find a way to avoid having to
+ * use it.
+ *
+ * How the wrapper works: in_use counts the tasks that hold or are
+ * waiting for the lock. Every lock routine increments it before taking
+ * rw_sem and every unlock routine decrements it after releasing rw_sem.
+ * WRITE_TRYLOCK() returns 0 without locking when in_use is non-zero;
+ * otherwise it takes the write lock and returns 1. The check and the
+ * lock are two separate steps, so another task can get in between them
+ * and WRITE_TRYLOCK() may then still sleep in down_write().
+ */
+#define RDWRLOCK_T jfs_rwlock_t
+static inline void RDWRLOCK_INIT(jfs_rwlock_t * Lock)
+{
+ init_rwsem(&Lock->rw_sem);
+ atomic_set(&Lock->in_use, 0);
+}
+static inline void READ_LOCK(jfs_rwlock_t * Lock)
+{
+ atomic_inc(&Lock->in_use); /* announce interest first so a concurrent WRITE_TRYLOCK backs off */
+ down_read(&Lock->rw_sem);
+}
+static inline void READ_UNLOCK(jfs_rwlock_t * Lock)
+{
+ up_read(&Lock->rw_sem);
+ atomic_dec(&Lock->in_use);
+}
+static inline void WRITE_LOCK(jfs_rwlock_t * Lock)
+{
+ atomic_inc(&Lock->in_use);
+ down_write(&Lock->rw_sem);
+}
+
+static inline int WRITE_TRYLOCK(jfs_rwlock_t * Lock)
+{
+ if (atomic_read(&Lock->in_use))
+ return 0; /* somebody holds or waits for the lock: do not try */
+ WRITE_LOCK(Lock); /* not atomic with the test above; may block */
+ return 1;
+}
+static inline void WRITE_UNLOCK(jfs_rwlock_t * Lock)
+{
+ up_write(&Lock->rw_sem);
+ atomic_dec(&Lock->in_use);
+}
+
+#define IREAD_LOCK(ip) READ_LOCK(&JFS_IP(ip)->rdwrlock) /* inode-level wrappers: ip is a struct inode *, the lock is jfs_inode_info.rdwrlock */
+#define IREAD_UNLOCK(ip) READ_UNLOCK(&JFS_IP(ip)->rdwrlock)
+#define IWRITE_LOCK(ip) WRITE_LOCK(&JFS_IP(ip)->rdwrlock)
+#define IWRITE_TRYLOCK(ip) WRITE_TRYLOCK(&JFS_IP(ip)->rdwrlock)
+#define IWRITE_UNLOCK(ip) WRITE_UNLOCK(&JFS_IP(ip)->rdwrlock)
+#define IWRITE_LOCK_LIST iwritelocklist
+
+extern void iwritelocklist(int, ...);
+
+/*
+ * Conditional sleep where condition is protected by spinlock
+ *
+ * lock_cmd and unlock_cmd take and release the spinlock
+ *
+ * wq is a wait queue head (passed as an lvalue, the macro takes its
+ * address); cond is re-evaluated on every pass of the loop. The macro
+ * is entered with the lock held: the task state is set to
+ * TASK_UNINTERRUPTIBLE before cond is tested, the lock is dropped only
+ * around schedule() and re-taken before the next test, so the macro also
+ * exits with the lock held.
+ */
+#define __SLEEP_COND(wq, cond, lock_cmd, unlock_cmd) \
+do { \
+ DECLARE_WAITQUEUE(__wait, current); \
+ \
+ add_wait_queue(&wq, &__wait); \
+ for (;;) { \
+ set_current_state(TASK_UNINTERRUPTIBLE);\
+ if (cond) \
+ break; \
+ unlock_cmd; \
+ schedule(); \
+ lock_cmd; \
+ } \
+ current->state = TASK_RUNNING; \
+ remove_wait_queue(&wq, &__wait); \
+} while (0)
+
+#endif /* _H_JFS_LOCK */
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
new file mode 100644
index 000000000000..0d243fba0f82
--- /dev/null
+++ b/fs/jfs/jfs_logmgr.c
@@ -0,0 +1,2490 @@
+/*
+ *
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+*/
+
+/*
+ * jfs_logmgr.c: log manager
+ *
+ * for related information, see transaction manager (jfs_txnmgr.c), and
+ * recovery manager (jfs_logredo.c).
+ *
+ * note: for detail, RTFS.
+ *
+ * log buffer manager:
+ * special purpose buffer manager supporting log i/o requirements.
+ * per log serial pageout of logpage
+ * queuing i/o requests and redrive i/o at iodone
+ * maintain current logpage buffer
+ * no caching since append only
+ * appropriate jfs buffer cache buffers as needed
+ *
+ * group commit:
+ * transactions which wrote COMMIT records in the same in-memory
+ * log page during the pageout of previous/current log page(s) are
+ * committed together by the pageout of the page.
+ *
+ * TBD lazy commit:
+ * transactions are committed asynchronously when the log page
+ * containing their COMMIT records is paged out once it becomes full;
+ *
+ * serialization:
+ * . a per log lock serialize log write.
+ * . a per log lock serialize group commit.
+ * . a per log lock serialize log open/close;
+ *
+ * TBD log integrity:
+ * careful-write (ping-pong) of last logpage to recover from crash
+ * in overwrite.
+ * detection of split (out-of-order) write of physical sectors
+ * of last logpage via timestamp at end of each sector
+ * (with its mirror data array at trailer).
+ *
+ * alternatives:
+ * lsn - 64-bit monotonically increasing integer vs
+ * 32-bit lspn and page eor.
+ */
+
+#include <linux/fs.h>
+#include <linux/locks.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/interrupt.h>
+#include <linux/smp_lock.h>
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_metapage.h"
+#include "jfs_txnmgr.h"
+#include "jfs_debug.h"
+
+
+/*
+ * lbuf's ready to be redriven. Protected by log_redrive_lock (jfsIOtask)
+ */
+static lbuf_t *log_redrive_list;
+static spinlock_t log_redrive_lock = SPIN_LOCK_UNLOCKED;
+
+
+/*
+ * log read/write serialization (per log)
+ *
+ * log->loglock is a semaphore used as a mutex (init_MUTEX/down/up), so
+ * LOG_LOCK() may sleep.
+ */
+#define LOG_LOCK_INIT(log) init_MUTEX(&(log)->loglock)
+#define LOG_LOCK(log) down(&((log)->loglock))
+#define LOG_UNLOCK(log) up(&((log)->loglock))
+
+
+/*
+ * log group commit serialization (per log)
+ *
+ * log->gclock is a spinlock taken with spin_lock_irq(), i.e. local
+ * interrupts are disabled while it is held and unconditionally
+ * re-enabled by LOGGC_UNLOCK(). LOGGC_WAKEUP() wakes the tasks waiting
+ * on a transaction block's gcwait queue.
+ */
+
+#define LOGGC_LOCK_INIT(log) spin_lock_init(&(log)->gclock)
+#define LOGGC_LOCK(log) spin_lock_irq(&(log)->gclock)
+#define LOGGC_UNLOCK(log) spin_unlock_irq(&(log)->gclock)
+#define LOGGC_WAKEUP(tblk) wake_up(&(tblk)->gcwait)
+
+/*
+ * log sync serialization (per log)
+ *
+ * Thresholds derived from the log size:
+ * LOGSYNC_DELTA - the smaller of 1/8 of the log and 128 log pages
+ * LOGSYNC_BARRIER - 1/4 of the log
+ * NOTE(review): logsize is presumably in bytes, since it is compared
+ * with a multiple of LOGPSIZE -- confirm against the callers.
+ */
+#define LOGSYNC_DELTA(logsize) min((logsize)/8, 128*LOGPSIZE)
+#define LOGSYNC_BARRIER(logsize) ((logsize)/4)
+/*
+ * disabled alternative with thresholds twice as large:
+#define LOGSYNC_DELTA(logsize) min((logsize)/4, 256*LOGPSIZE)
+#define LOGSYNC_BARRIER(logsize) ((logsize)/2)
+*/
+
+
+/*
+ * log buffer cache synchronization
+ *
+ * One global spinlock, taken with the irqsave/irqrestore variants;
+ * 'flags' is the caller's variable that holds the saved interrupt state.
+ */
+static spinlock_t jfsLCacheLock = SPIN_LOCK_UNLOCKED;
+
+#define LCACHE_LOCK(flags) spin_lock_irqsave(&jfsLCacheLock, flags)
+#define LCACHE_UNLOCK(flags) spin_unlock_irqrestore(&jfsLCacheLock, flags)
+
+/*
+ * See __SLEEP_COND in jfs_lock.h
+ *
+ * Called with jfsLCacheLock held. Returns at once if cond already
+ * holds; otherwise sleeps on wq, dropping the lock around schedule() and
+ * re-taking it before cond is tested again.
+ */
+#define LCACHE_SLEEP_COND(wq, cond, flags) \
+do { \
+ if (cond) \
+ break; \
+ __SLEEP_COND(wq, cond, LCACHE_LOCK(flags), LCACHE_UNLOCK(flags)); \
+} while (0)
+
+#define LCACHE_WAKEUP(event) wake_up(event)
+
+
+/*
+ * lbuf buffer cache (lCache) control
+ */
+/* log buffer manager pageout control (cumulative, inclusive) */
+#define lbmREAD 0x0001
+#define lbmWRITE 0x0002 /* enqueue at tail of write queue;
+ * init pageout if at head of queue;
+ */
+#define lbmRELEASE 0x0004 /* remove from write queue
+ * at completion of pageout;
+ * do not free/recycle it yet:
+ * caller will free it;
+ */
+#define lbmSYNC 0x0008 /* do not return to freelist
+ * when removed from write queue;
+ */
+#define lbmFREE 0x0010 /* return to freelist
+ * at completion of pageout;
+ * the buffer may be recycled;
+ */
+#define lbmDONE 0x0020
+#define lbmERROR 0x0040
+#define lbmGC 0x0080 /* lbmIODone to perform post-GC processing
+ * of log page
+ */
+#define lbmDIRECT 0x0100
+
+/*
+ * external references
+ */
+extern void vPut(struct inode *ip);
+extern void txLazyUnlock(tblock_t * tblk);
+extern int jfs_thread_stopped(void);
+extern struct task_struct *jfsIOtask;
+extern struct completion jfsIOwait;
+
+/*
+ * forward references
+ */
+static int lmWriteRecord(log_t * log, tblock_t * tblk, lrd_t * lrd,
+ tlock_t * tlck);
+
+static int lmNextPage(log_t * log);
+static int lmLogInit(log_t * log);
+static int lmLogShutdown(log_t * log);
+
+static int lbmLogInit(log_t * log);
+static void lbmLogShutdown(log_t * log);
+static lbuf_t *lbmAllocate(log_t * log, int);
+static void lbmFree(lbuf_t * bp);
+static void lbmfree(lbuf_t * bp);
+static int lbmRead(log_t * log, int pn, lbuf_t ** bpp);
+static void lbmWrite(log_t * log, lbuf_t * bp, int flag, int cant_block);
+static void lbmDirectWrite(log_t * log, lbuf_t * bp, int flag);
+static int lbmIOWait(lbuf_t * bp, int flag);
+static bio_end_io_t lbmIODone;
+#ifdef _STILL_TO_PORT
+static void lbmDirectIODone(iobuf_t * ddbp);
+#endif /* _STILL_TO_PORT */
+void lbmStartIO(lbuf_t * bp);
+void lmGCwrite(log_t * log, int cant_block);
+
+
+/*
+ * statistics
+ */
+#ifdef CONFIG_JFS_STATISTICS
+struct lmStat {
+ uint commit; /* # of commit */
+ uint pagedone; /* # of page written */
+ uint submitted; /* # of pages submitted */
+} lmStat;
+#endif
+
+
+/*
+ * NAME: lmLog()
+ *
+ * FUNCTION: write a log record;
+ *
+ * Under LOG_LOCK: when the record belongs to a transaction and to a
+ * metapage, first record the current end-of-log lsn in the page (on its
+ * first log write) and in the transaction block and keep both on the
+ * log's synclist; then append the record with lmWriteRecord(), call
+ * lmLogSync() when the next sync trigger has been reached, and store the
+ * new end-of-log lsn in log->lsn.
+ *
+ * PARAMETER:
+ * log - log to write to
+ * tblk - transaction block; NULL for a record written outside
+ * of a transaction (the lsn bookkeeping is then skipped)
+ * lrd - log record descriptor to write
+ * tlck - transaction lock describing the logged data; the lsn
+ * bookkeeping is also skipped when it is NULL, is of type
+ * tlckBTROOT or has no metapage (tlck->mp == NULL)
+ *
+ * RETURN: lsn - offset to the next log record to write (end-of-log);
+ * -1 - error;
+ * NOTE(review): no statement in this function produces -1 itself;
+ * confirm whether lmWriteRecord()/lmLogSync() can.
+ *
+ * note: todo: log error handler
+ */
+int lmLog(log_t * log, tblock_t * tblk, lrd_t * lrd, tlock_t * tlck)
+{
+ int lsn;
+ int diffp, difft;
+ metapage_t *mp = NULL;
+
+ jFYI(1, ("lmLog: log:0x%p tblk:0x%p, lrd:0x%p tlck:0x%p\n",
+ log, tblk, lrd, tlck));
+
+ LOG_LOCK(log);
+
+ /* log by (out-of-transaction) JFS ? */
+ if (tblk == NULL)
+ goto writeRecord;
+
+ /* log from page ? */
+ if (tlck == NULL ||
+ tlck->type & tlckBTROOT || (mp = tlck->mp) == NULL)
+ goto writeRecord;
+
+ /*
+ * initialize/update page/transaction recovery lsn
+ */
+ lsn = log->lsn;
+
+ LOGSYNC_LOCK(log);
+
+ /*
+ * initialize page lsn if first log write of the page
+ */
+ if (mp->lsn == 0) {
+ mp->log = log;
+ mp->lsn = lsn;
+ log->count++;
+
+ /* insert page at tail of logsynclist */
+ list_add_tail(&mp->synclist, &log->synclist);
+ }
+
+ /*
+ * initialize/update lsn of tblock of the page
+ *
+ * transaction inherits oldest lsn of pages associated
+ * with allocation/deallocation of resources (their
+ * log records are used to reconstruct allocation map
+ * at recovery time: inode for inode allocation map,
+ * B+-tree index of extent descriptors for block
+ * allocation map);
+ * allocation map pages inherit transaction lsn at
+ * commit time to allow forwarding log syncpt past log
+ * records associated with allocation/deallocation of
+ * resources only after persistent map of these map pages
+ * have been updated and propagated to home.
+ */
+ /*
+ * initialize transaction lsn:
+ */
+ if (tblk->lsn == 0) {
+ /* inherit lsn of its first page logged */
+ tblk->lsn = mp->lsn;
+ log->count++;
+
+ /* insert tblock after the page on logsynclist */
+ list_add(&tblk->synclist, &mp->synclist);
+ }
+ /*
+ * update transaction lsn:
+ */
+ else {
+ /* inherit oldest/smallest lsn of page */
+ logdiff(diffp, mp->lsn, log);
+ logdiff(difft, tblk->lsn, log);
+ if (diffp < difft) {
+ /* update tblock lsn with page lsn */
+ tblk->lsn = mp->lsn;
+
+ /* move tblock after page on logsynclist */
+ list_del(&tblk->synclist);
+ list_add(&tblk->synclist, &mp->synclist);
+ }
+ }
+
+ LOGSYNC_UNLOCK(log);
+
+ /*
+ * write the log record
+ */
+ writeRecord:
+ lsn = lmWriteRecord(log, tblk, lrd, tlck);
+
+ /*
+ * forward log syncpt if log reached next syncpt trigger
+ */
+ logdiff(diffp, lsn, log);
+ if (diffp >= log->nextsync)
+ lsn = lmLogSync(log, 0);
+
+ /* update end-of-log lsn */
+ log->lsn = lsn;
+
+ LOG_UNLOCK(log);
+
+ /* return end-of-log address */
+ return lsn;
+}
+
+
+/*
+ * NAME: lmWriteRecord()
+ *
+ * FUNCTION: move the log record to current log page
+ *
+ * The data described by tlck's linelock(s) is copied first, one log
+ * vector at a time, each followed by a 4-byte descriptor (offset,
+ * length); the log record descriptor itself is copied last. Whenever
+ * the current log page fills up, lmNextPage() is called and copying
+ * continues after the header of the next page. For a record whose type
+ * has LOG_COMMIT set, the transaction block is also appended to the
+ * log's group commit queue.
+ *
+ * PARAMETER: log - log to write to
+ * tblk - transaction block; dereferenced only for
+ * records whose type has LOG_COMMIT set
+ * lrd - log record descriptor; its length field is
+ * filled in here
+ * tlck - transaction lock describing the data to log,
+ * or NULL when the record carries no data
+ *
+ * RETURN: end-of-log address;
+ * 0 if tlck has none of the recognized lock flags
+ *
+ * serialization: LOG_LOCK() held on entry/exit
+ */
+static int
+lmWriteRecord(log_t * log, tblock_t * tblk, lrd_t * lrd, tlock_t * tlck)
+{
+ int lsn = 0; /* end-of-log address */
+ lbuf_t *bp; /* dst log page buffer */
+ logpage_t *lp; /* dst log page */
+ caddr_t dst; /* destination address in log page */
+ int dstoffset; /* end-of-log offset in log page */
+ int freespace; /* free space in log page */
+ caddr_t p; /* src meta-data page */
+ caddr_t src;
+ int srclen;
+ int nbytes; /* number of bytes to move */
+ int i;
+ int len;
+ linelock_t *linelock;
+ lv_t *lv;
+ lvd_t *lvd;
+ int l2linesize;
+
+ len = 0;
+
+ /* retrieve destination log page to write */
+ bp = (lbuf_t *) log->bp;
+ lp = (logpage_t *) bp->l_ldata;
+ dstoffset = log->eor;
+
+ /* any log data to write ? */
+ if (tlck == NULL)
+ goto moveLrd;
+
+ /*
+ * move log record data
+ */
+ /* retrieve source meta-data page to log */
+ if (tlck->flag & tlckPAGELOCK) {
+ p = (caddr_t) (tlck->mp->data);
+ linelock = (linelock_t *) & tlck->lock;
+ }
+ /* retrieve source in-memory inode to log */
+ else if (tlck->flag & tlckINODELOCK) {
+ if (tlck->type & tlckDTREE)
+ p = (caddr_t) &JFS_IP(tlck->ip)->i_dtroot;
+ else
+ p = (caddr_t) &JFS_IP(tlck->ip)->i_xtroot;
+ linelock = (linelock_t *) & tlck->lock;
+ }
+#ifdef _JFS_WIP
+ else if (tlck->flag & tlckINLINELOCK) {
+
+ inlinelock = (inlinelock_t *) & tlck;
+ p = (caddr_t) & inlinelock->pxd;
+ linelock = (linelock_t *) & tlck;
+ }
+#endif /* _JFS_WIP */
+ else {
+ jERROR(2, ("lmWriteRecord: UFO tlck:0x%p\n", tlck));
+ return 0; /* Probably should trap */
+ }
+ l2linesize = linelock->l2linesize;
+
+ moveData:
+ ASSERT(linelock->index <= linelock->maxcnt);
+
+ lv = (lv_t *) & linelock->lv;
+ for (i = 0; i < linelock->index; i++, lv++) {
+ if (lv->length == 0)
+ continue;
+
+ /* is page full ? */
+ if (dstoffset >= LOGPSIZE - LOGPTLRSIZE) {
+ /* page become full: move on to next page */
+ lmNextPage(log);
+
+ bp = log->bp;
+ lp = (logpage_t *) bp->l_ldata;
+ dstoffset = LOGPHDRSIZE;
+ }
+
+ /*
+ * move log vector data
+ */
+ src = (u8 *) p + (lv->offset << l2linesize);
+ srclen = lv->length << l2linesize;
+ len += srclen;
+ while (srclen > 0) {
+ freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset;
+ nbytes = min(freespace, srclen);
+ dst = (caddr_t) lp + dstoffset;
+ memcpy(dst, src, nbytes);
+ dstoffset += nbytes;
+
+ /* is page not full ? */
+ if (dstoffset < LOGPSIZE - LOGPTLRSIZE)
+ break;
+
+ /* page become full: move on to next page */
+ lmNextPage(log);
+
+ bp = (lbuf_t *) log->bp;
+ lp = (logpage_t *) bp->l_ldata;
+ dstoffset = LOGPHDRSIZE;
+
+ srclen -= nbytes;
+ src += nbytes;
+ }
+
+ /*
+ * move log vector descriptor
+ */
+ len += 4;
+ lvd = (lvd_t *) ((caddr_t) lp + dstoffset);
+ lvd->offset = cpu_to_le16(lv->offset);
+ lvd->length = cpu_to_le16(lv->length);
+ dstoffset += 4;
+ jFYI(1,
+ ("lmWriteRecord: lv offset:%d length:%d\n",
+ lv->offset, lv->length));
+ }
+
+ if ((i = linelock->next)) { /* more linelocks chained to this one: log them too */
+ linelock = (linelock_t *) lid_to_tlock(i);
+ goto moveData;
+ }
+
+ /*
+ * move log record descriptor
+ */
+ moveLrd:
+ lrd->length = cpu_to_le16(len);
+
+ src = (caddr_t) lrd;
+ srclen = LOGRDSIZE;
+
+ while (srclen > 0) {
+ freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset;
+ nbytes = min(freespace, srclen);
+ dst = (caddr_t) lp + dstoffset;
+ memcpy(dst, src, nbytes);
+
+ dstoffset += nbytes;
+ srclen -= nbytes;
+
+ /* are there more to move than freespace of page ? */
+ if (srclen)
+ goto pageFull;
+
+ /*
+ * end of log record descriptor
+ */
+
+ /* update last log record eor */
+ log->eor = dstoffset;
+ bp->l_eor = dstoffset;
+ lsn = (log->page << L2LOGPSIZE) + dstoffset;
+
+ if (lrd->type & cpu_to_le16(LOG_COMMIT)) {
+ tblk->clsn = lsn;
+ jFYI(1,
+ ("wr: tclsn:0x%x, beor:0x%x\n", tblk->clsn,
+ bp->l_eor));
+
+ INCREMENT(lmStat.commit); /* # of commit */
+
+ /*
+ * enqueue tblock for group commit:
+ *
+ * enqueue tblock of non-trivial/synchronous COMMIT
+ * at tail of group commit queue
+ * (trivial/asynchronous COMMITs are ignored by
+ * group commit.)
+ */
+ LOGGC_LOCK(log);
+
+ /* init tblock gc state */
+ tblk->flag = tblkGC_QUEUE;
+ tblk->bp = log->bp;
+ tblk->pn = log->page;
+ tblk->eor = log->eor;
+ init_waitqueue_head(&tblk->gcwait);
+
+ /* enqueue transaction to commit queue */
+ tblk->cqnext = NULL;
+ if (log->cqueue.head) {
+ log->cqueue.tail->cqnext = tblk;
+ log->cqueue.tail = tblk;
+ } else
+ log->cqueue.head = log->cqueue.tail = tblk;
+
+ LOGGC_UNLOCK(log);
+ }
+
+ jFYI(1,
+ ("lmWriteRecord: lrd:0x%04x bp:0x%p pn:%d eor:0x%x\n",
+ le16_to_cpu(lrd->type), log->bp, log->page,
+ dstoffset));
+
+ /* page not full ? */
+ if (dstoffset < LOGPSIZE - LOGPTLRSIZE)
+ return lsn;
+
+ pageFull:
+ /* page become full: move on to next page */
+ lmNextPage(log);
+
+ bp = (lbuf_t *) log->bp;
+ lp = (logpage_t *) bp->l_ldata;
+ dstoffset = LOGPHDRSIZE;
+ src += nbytes;
+ }
+
+ return lsn;
+}
+
+
+/*
+ * NAME: lmNextPage()
+ *
+ * FUNCTION: write current page and allocate next page.
+ *
+ * the current page (log->bp) is handed to lbmWrite():
+ * - if the tail tblock of the commit queue has its COMMIT record
+ * on this page, the tblock is marked tblkGC_EOP and the page is
+ * only enqueued (flag 0), and only if it is not yet on the
+ * write queue;
+ * - otherwise the page is finalized (eor stored in the page
+ * header and trailer) and written with
+ * lbmWRITE | lbmRELEASE | lbmFREE.
+ * log->page, log->eor and log->bp are then switched to a newly
+ * allocated buffer whose header/trailer carry lspn + 1.
+ *
+ * PARAMETER: log
+ *
+ * RETURN: 0
+ *
+ * serialization: LOG_LOCK() held on entry/exit;
+ * LOGGC_LOCK() is acquired and released inside.
+ */
+static int lmNextPage(log_t * log)
+{
+ logpage_t *lp;
+ int lspn; /* log sequence page number */
+ int pn; /* current page number */
+ lbuf_t *bp;
+ lbuf_t *nextbp;
+ tblock_t *tblk;
+
+ jFYI(1, ("lmNextPage\n"));
+
+ /* get current log page number and log sequence page number */
+ pn = log->page;
+ bp = log->bp;
+ lp = (logpage_t *) bp->l_ldata;
+ lspn = le32_to_cpu(lp->h.page);
+
+ LOGGC_LOCK(log);
+
+ /*
+ * write or queue the full page at the tail of write queue
+ */
+ /* get the tail tblk on commit queue */
+ tblk = log->cqueue.tail;
+
+ /* every tblk who has COMMIT record on the current page,
+ * and has not been committed, must be on commit queue
+ * since tblk is queued at commit queue at the time
+ * of writing its COMMIT record on the page before
+ * page becomes full (even though the tblk thread
+ * who wrote COMMIT record may have been suspended
+ * currently);
+ */
+
+ /* is page bound with outstanding tail tblk ? */
+ if (tblk && tblk->pn == pn) {
+ /* mark tblk for end-of-page */
+ tblk->flag |= tblkGC_EOP;
+
+ /* if page is not already on write queue,
+ * just enqueue (no lbmWRITE to prevent redrive)
+ * buffer to wqueue to ensure correct serial order
+ * of the pages since log pages will be added
+ * continuously (tblk bound with the page hasn't
+ * got around to init write of the page, either
+ * preempted or the page got filled by its COMMIT
+ * record);
+ * pages with COMMIT are paged out explicitly by
+ * tblk in lmGroupCommit();
+ */
+ if (bp->l_wqnext == NULL) {
+ /* bp->l_ceor = bp->l_eor; */
+ /* lp->h.eor = lp->t.eor = bp->l_ceor; */
+ lbmWrite(log, bp, 0, 0);
+ }
+ }
+ /* page is not bound with outstanding tblk:
+ * init write or mark it to be redriven (lbmWRITE)
+ */
+ else {
+ /* finalize the page */
+ bp->l_ceor = bp->l_eor;
+ lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor);
+ lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE, 0);
+ }
+ LOGGC_UNLOCK(log);
+
+ /*
+ * allocate/initialize next page
+ */
+ /* if log wraps, the first data page of log is 2
+ * (0 never used, 1 is superblock).
+ */
+ log->page = (pn == log->size - 1) ? 2 : pn + 1;
+ log->eor = LOGPHDRSIZE; /* ? valid page empty/full at logRedo() */
+
+ /* allocate/initialize next log page buffer */
+ nextbp = lbmAllocate(log, log->page);
+ nextbp->l_eor = log->eor;
+ log->bp = nextbp;
+
+ /* initialize next log page: the log sequence page number keeps
+ * increasing (lspn + 1) even when the page number wraps, and
+ * the page starts out empty (eor just past the page header)
+ */
+ lp = (logpage_t *) nextbp->l_ldata;
+ lp->h.page = lp->t.page = cpu_to_le32(lspn + 1);
+ lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE);
+
+ jFYI(1, ("lmNextPage done\n"));
+ return 0;
+}
+
+
+/*
+ * NAME: lmGroupCommit()
+ *
+ * FUNCTION: group commit
+ * initiate pageout of the pages with COMMIT in the order of
+ * page number - redrive pageout of the page at the head of
+ * pageout queue until full page has been written.
+ *
+ * PARAMETER: log - log structure
+ * tblk - transaction block whose COMMIT record has been
+ * written to the log
+ *
+ * RETURN: 0 - tblk is committed, or tblk is COMMIT_LAZY (in which
+ * case the function returns without waiting)
+ * EIO - tblk was flagged tblkGC_ERROR
+ *
+ * NOTE:
+ * LOGGC_LOCK serializes log group commit queue, and
+ * transaction blocks on the commit queue.
+ * N.B. LOG_LOCK is NOT held during lmGroupCommit().
+ */
+int lmGroupCommit(log_t * log, tblock_t * tblk)
+{
+ int rc = 0;
+
+ LOGGC_LOCK(log);
+
+ /* group committed already ? */
+ if (tblk->flag & tblkGC_COMMITTED) {
+ if (tblk->flag & tblkGC_ERROR)
+ rc = EIO;
+
+ LOGGC_UNLOCK(log);
+ return rc;
+ }
+ jFYI(1,
+ ("lmGroup Commit: tblk = 0x%p, gcrtc = %d\n", tblk,
+ log->gcrtc));
+
+ /*
+ * no group commit pageout in progress and the commit queue
+ * is not empty: start the pageout ourselves
+ */
+ if ((!(log->cflag & logGC_PAGEOUT)) && log->cqueue.head) {
+ /*
+ * only transaction in the commit queue:
+ *
+ * start one-transaction group commit as
+ * its group leader.
+ */
+ log->cflag |= logGC_PAGEOUT;
+
+ lmGCwrite(log, 0);
+ }
+ /* lmGCwrite gives up LOGGC_LOCK, check again */
+
+ if (tblk->flag & tblkGC_COMMITTED) {
+ if (tblk->flag & tblkGC_ERROR)
+ rc = EIO;
+
+ LOGGC_UNLOCK(log);
+ return rc;
+ }
+
+ /* upcount transaction waiting for completion
+ * (done before the COMMIT_LAZY test below, so lazy
+ * transactions are counted as well)
+ */
+ log->gcrtc++;
+
+ if (tblk->xflag & COMMIT_LAZY) {
+ tblk->flag |= tblkGC_LAZY;
+ LOGGC_UNLOCK(log);
+ return 0;
+ }
+ tblk->flag |= tblkGC_READY;
+
+ /* (SLEEP 1): wait on tblk->gcwait until tblkGC_COMMITTED is set;
+ * NOTE(review): macro is defined elsewhere - presumably it drops
+ * and retakes LOGGC_LOCK around the sleep, confirm
+ */
+ __SLEEP_COND(tblk->gcwait, (tblk->flag & tblkGC_COMMITTED),
+ LOGGC_LOCK(log), LOGGC_UNLOCK(log));
+
+ /* removed from commit queue */
+ if (tblk->flag & tblkGC_ERROR)
+ rc = EIO;
+
+ LOGGC_UNLOCK(log);
+ return rc;
+}
+
+/*
+ * NAME: lmGCwrite()
+ *
+ * FUNCTION: group commit write
+ * initiate write of log page, building a group of all transactions
+ * with commit records on that page.
+ *
+ * PARAMETER: log - log structure; log->cqueue.head must not be
+ * NULL (it is dereferenced without a check)
+ * cant_write - passed through to lbmWrite() as its
+ * cant_block argument
+ *
+ * RETURN: None
+ *
+ * NOTE:
+ * LOGGC_LOCK must be held by caller.
+ * N.B. LOG_LOCK is NOT held during lmGroupCommit().
+ */
+void lmGCwrite(log_t * log, int cant_write)
+{
+ lbuf_t *bp;
+ logpage_t *lp;
+ int gcpn; /* group commit page number */
+ tblock_t *tblk;
+ tblock_t *xtblk;
+
+ /*
+ * build the commit group of a log page
+ *
+ * scan commit queue and make a commit group of all
+ * transactions with COMMIT records on the same log page.
+ */
+ /* get the head tblk on the commit queue */
+ tblk = xtblk = log->cqueue.head;
+ gcpn = tblk->pn;
+
+ while (tblk && tblk->pn == gcpn) {
+ xtblk = tblk;
+
+ /* state transition: (QUEUE, READY) -> COMMIT */
+ tblk->flag |= tblkGC_COMMIT;
+ tblk = tblk->cqnext;
+ }
+ tblk = xtblk; /* last tblk of the page */
+
+ /*
+ * pageout to commit transactions on the log page.
+ */
+ bp = (lbuf_t *) tblk->bp;
+ lp = (logpage_t *) bp->l_ldata;
+ /* is page already full ? */
+ if (tblk->flag & tblkGC_EOP) {
+ /* mark page to free at end of group commit of the page */
+ tblk->flag &= ~tblkGC_EOP;
+ tblk->flag |= tblkGC_FREE;
+ bp->l_ceor = bp->l_eor;
+ lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor);
+ jEVENT(0,
+ ("gc: tclsn:0x%x, bceor:0x%x\n", tblk->clsn,
+ bp->l_ceor));
+ lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmGC,
+ cant_write);
+ }
+ /* page is not yet full: commit only up to the end of the last
+ * tblk's COMMIT record and keep the buffer (no lbmRELEASE)
+ */
+ else {
+ bp->l_ceor = tblk->eor; /* ? bp->l_ceor = bp->l_eor; */
+ lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor);
+ jEVENT(0,
+ ("gc: tclsn:0x%x, bceor:0x%x\n", tblk->clsn,
+ bp->l_ceor));
+ lbmWrite(log, bp, lbmWRITE | lbmGC, cant_write);
+ }
+}
+
+/*
+ * NAME: lmPostGC()
+ *
+ * FUNCTION: group commit post-processing
+ * Processes transactions after their commit records have been written
+ * to disk, redriving log I/O if necessary.
+ *
+ * PARAMETER: bp - log buffer whose pageout has completed;
+ * the log is taken from bp->l_log
+ *
+ * RETURN: None
+ *
+ * NOTE:
+ * This routine is called at interrupt time by lbmIODone;
+ * log->gclock is taken with spin_lock_irqsave() for the
+ * whole duration of the routine.
+ */
+void lmPostGC(lbuf_t * bp)
+{
+ unsigned long flags;
+ log_t *log = bp->l_log;
+ logpage_t *lp;
+ tblock_t *tblk;
+
+ //LOGGC_LOCK(log);
+ spin_lock_irqsave(&log->gclock, flags);
+ /*
+ * current pageout of group commit completed.
+ *
+ * remove/wakeup transactions from commit queue who were
+ * group committed with the current log page
+ */
+ while ((tblk = log->cqueue.head) && (tblk->flag & tblkGC_COMMIT)) {
+ /* if transaction was marked GC_COMMIT then
+ * it has been shipped in the current pageout
+ * and made it to disk - it is committed.
+ */
+
+ if (bp->l_flag & lbmERROR)
+ tblk->flag |= tblkGC_ERROR;
+
+ /* remove it from the commit queue */
+ log->cqueue.head = tblk->cqnext;
+ if (log->cqueue.head == NULL)
+ log->cqueue.tail = NULL;
+ tblk->flag &= ~tblkGC_QUEUE;
+ tblk->cqnext = 0;
+
+ jEVENT(0,
+ ("lmPostGC: tblk = 0x%p, flag = 0x%x\n", tblk,
+ tblk->flag));
+
+ if (!(tblk->xflag & COMMIT_FORCE))
+ /*
+ * Hand tblk over to lazy commit thread
+ */
+ txLazyUnlock(tblk);
+ else {
+ /* state transition: COMMIT -> COMMITTED */
+ tblk->flag |= tblkGC_COMMITTED;
+
+ if (tblk->flag & tblkGC_READY) {
+ log->gcrtc--;
+ LOGGC_WAKEUP(tblk);
+ }
+ }
+
+ /* was page full before pageout ?
+ * (and this is the last tblk bound with the page)
+ */
+ if (tblk->flag & tblkGC_FREE)
+ lbmFree(bp);
+ /* did page become full after pageout ?
+ * (and this is the last tblk bound with the page)
+ */
+ else if (tblk->flag & tblkGC_EOP) {
+ /* finalize the page */
+ lp = (logpage_t *) bp->l_ldata;
+ bp->l_ceor = bp->l_eor;
+ lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor);
+ jEVENT(0, ("lmPostGC: calling lbmWrite\n"));
+ lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE,
+ 1);
+ }
+
+ }
+
+ /* are there any transactions who have entered lmGroupCommit()
+ * (whose COMMITs are after that of the last log page written)?
+ * They are waiting for new group commit (at (SLEEP 1) in
+ * lmGroupCommit()): start the pageout of the next commit group
+ * on their behalf.
+ */
+ if ((log->gcrtc > 0) && log->cqueue.head)
+ /*
+ * Call lmGCwrite with new group leader
+ * (cant_write = 1: we must not block here)
+ */
+ lmGCwrite(log, 1);
+
+ /* no transactions are ready yet (transactions are only just
+ * queued (GC_QUEUE) and have not entered group commit yet).
+ * the first transaction entering group commit
+ * will elect herself as new group leader.
+ */
+ else
+ log->cflag &= ~logGC_PAGEOUT;
+
+ //LOGGC_UNLOCK(log);
+ spin_unlock_irqrestore(&log->gclock, flags);
+ return;
+}
+
+/*
+ * NAME: lmLogSync()
+ *
+ * FUNCTION: write log SYNCPT record for specified log
+ * if new sync address is available
+ * (normally the case if sync() is executed by back-ground
+ * process).
+ * if not, explicitly run jfs_blogsync() to initiate
+ * getting of new sync address.
+ * calculate new value of log->nextsync which determines when
+ * this code is called again.
+ *
+ * this is called only from lmLog().
+ *
+ * PARAMETER: log - log structure
+ * nosyncwait - non-zero when called from outside of a
+ * transaction (e.g., sync()); the syncbarrier
+ * check at the end is then skipped
+ *
+ * RETURN: lsn - value returned by lmWriteRecord() for the SYNCPT
+ * record written here, or log->lsn when no SYNCPT
+ * record was written
+ *
+ * serialization: LOG_LOCK() held on entry/exit
+ */
+int lmLogSync(log_t * log, int nosyncwait)
+{
+ int logsize;
+ int written; /* written since last syncpt */
+ int free; /* free space left available */
+ int delta; /* additional delta to write normally */
+ int more; /* additional write granted */
+ lrd_t lrd;
+ int lsn;
+ struct logsyncblk *lp;
+
+ /*
+ * forward syncpt
+ */
+ /* if last sync is same as last syncpt,
+ * invoke sync point forward processing to update sync.
+ */
+
+ if (log->sync == log->syncpt) {
+ LOGSYNC_LOCK(log);
+ /* ToDo: push dirty metapages out to disk */
+// bmLogSync(log);
+
+ if (list_empty(&log->synclist))
+ log->sync = log->lsn;
+ else {
+ lp = list_entry(log->synclist.next,
+ struct logsyncblk, synclist);
+ log->sync = lp->lsn;
+ }
+ LOGSYNC_UNLOCK(log);
+
+ }
+
+ /* if sync is different from last syncpt,
+ * write a SYNCPT record with syncpt = sync.
+ * reset syncpt = sync
+ */
+ if (log->sync != log->syncpt) {
+ struct jfs_sb_info *sbi = JFS_SBI(log->sb);
+ /*
+ * We need to make sure all of the "written" metapages
+ * actually make it to disk
+ */
+ fsync_inode_data_buffers(sbi->ipbmap);
+ fsync_inode_data_buffers(sbi->ipimap);
+ fsync_inode_data_buffers(sbi->direct_inode);
+
+ lrd.logtid = 0;
+ lrd.backchain = 0;
+ lrd.type = cpu_to_le16(LOG_SYNCPT);
+ lrd.length = 0;
+ lrd.log.syncpt.sync = cpu_to_le32(log->sync);
+ lsn = lmWriteRecord(log, NULL, &lrd, NULL);
+
+ log->syncpt = log->sync;
+ } else
+ lsn = log->lsn;
+
+ /*
+ * setup next syncpt trigger (SWAG)
+ */
+ logsize = log->logsize;
+
+ logdiff(written, lsn, log); /* macro: stores its result in written */
+ free = logsize - written;
+ delta = LOGSYNC_DELTA(logsize);
+ more = min(free / 2, delta);
+ if (more < 2 * LOGPSIZE) {
+ jEVENT(1,
+ ("\n ... Log Wrap ... Log Wrap ... Log Wrap ...\n\n"));
+ /*
+ * log wrapping
+ *
+ * option 1 - panic ? No.!
+ * option 2 - shutdown file systems
+ * associated with log ?
+ * option 3 - extend log ?
+ */
+ /*
+ * option 4 - second chance
+ *
+ * mark log wrapped, and continue.
+ * when all active transactions are completed,
+ * mark log valid for recovery.
+ * if crashed during invalid state, log state
+ * implies invalid log, forcing fsck().
+ */
+ /* mark log state log wrap in log superblock */
+ /* log->state = LOGWRAP; */
+
+ /* reset sync point computation */
+ log->syncpt = log->sync = lsn;
+ log->nextsync = delta;
+ } else
+ /* next syncpt trigger = written + more */
+ log->nextsync = written + more;
+
+ /* return if lmLogSync() from outside of transaction, e.g., sync() */
+ if (nosyncwait)
+ return lsn;
+
+ /* if number of bytes written from last sync point is more
+ * than LOGSYNC_BARRIER(logsize) and the log is larger than
+ * 32 log pages, stop new transactions from
+ * starting until all current transactions are completed
+ * by setting syncbarrier flag.
+ */
+ if (written > LOGSYNC_BARRIER(logsize) && logsize > 32 * LOGPSIZE) {
+ log->syncbarrier = 1;
+ jFYI(1, ("log barrier on: lsn=0x%x syncpt=0x%x\n", lsn,
+ log->syncpt));
+ }
+
+ return lsn;
+}
+
+
+/*
+ * NAME: lmLogOpen()
+ *
+ * FUNCTION: open the log on first open;
+ * insert filesystem in the active list of the log.
+ *
+ * only the in-line log case is compiled; the external log
+ * code is kept under _STILL_TO_PORT.
+ *
+ * PARAMETER: sb - super block of the file system being mounted
+ * logptr - newly allocated log structure (out);
+ * set to NULL when the open fails
+ *
+ * RETURN: 0 - success
+ * ENOMEM - log structure could not be allocated
+ * errors returned by lmLogInit()
+ *
+ * serialization:
+ */
+int lmLogOpen(struct super_block *sb, log_t ** logptr)
+{
+ int rc;
+ kdev_t logdev; /* dev_t of log device */
+ log_t *log;
+
+ logdev = sb->s_dev;
+
+#ifdef _STILL_TO_PORT
+ /*
+ * open the inode representing the log device (aka log inode)
+ */
+ if (logdev != fsdev)
+ goto externalLog;
+#endif /* _STILL_TO_PORT */
+
+ /*
+ * in-line log in host file system
+ *
+ * file system to log have 1-to-1 relationship;
+ */
+// inlineLog:
+
+ *logptr = log = kmalloc(sizeof(log_t), GFP_KERNEL);
+ if (log == NULL)
+ return ENOMEM;
+
+ memset(log, 0, sizeof(log_t));
+ log->sb = sb; /* This should be a list */
+ log->flag = JFS_INLINELOG;
+ log->dev = logdev;
+ log->base = addressPXD(&JFS_SBI(sb)->logpxd);
+ /* convert the extent length from file system blocks to log pages */
+ log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >>
+ (L2LOGPSIZE - sb->s_blocksize_bits);
+ log->l2bsize = sb->s_blocksize_bits;
+ ASSERT(L2LOGPSIZE >= sb->s_blocksize_bits);
+ /*
+ * initialize log.
+ */
+ if ((rc = lmLogInit(log)))
+ goto errout10;
+
+#ifdef _STILL_TO_PORT
+ goto out;
+
+ /*
+ * external log as separate logical volume
+ *
+ * file systems to log may have n-to-1 relationship;
+ */
+ externalLog:
+ /*
+ * open log inode
+ *
+ * log inode is reserved inode of (dev_t = log device,
+ * fileset number = 0, i_number = 0), which acquire
+ * one i_count for each open by file system.
+ *
+ * hand craft dummy vfs to force iget() the special case of
+ * an in-memory inode allocation without on-disk inode
+ */
+ memset(&dummyvfs, 0, sizeof(struct vfs));
+ dummyvfs.filesetvfs.vfs_data = NULL;
+ dummyvfs.dummyvfs.dev = logdev;
+ dummyvfs.dummyvfs.ipmnt = NULL;
+ ICACHE_LOCK();
+ rc = iget((struct vfs *) &dummyvfs, 0, (inode_t **) & log, 0);
+ ICACHE_UNLOCK();
+ if (rc)
+ return rc;
+
+ log->flag = 0;
+ log->dev = logdev;
+ log->base = 0;
+ log->size = 0;
+
+ /*
+ * serialize open/close between multiple file systems
+ * bound with the log;
+ */
+ ip = (inode_t *) log;
+ IWRITE_LOCK(ip);
+
+ /*
+ * subsequent open: add file system to log active file system list
+ */
+#ifdef _JFS_OS2
+ if (log->strat2p)
+#endif /* _JFS_OS2 */
+ {
+ if (rc = lmLogFileSystem(log, fsdev, 1))
+ goto errout10;
+
+ IWRITE_UNLOCK(ip);
+
+ *iplog = ip;
+ jFYI(1, ("lmLogOpen: exit(0)\n"));
+ return 0;
+ }
+
+ /* decouple log inode from dummy vfs */
+ vPut(ip);
+
+ /*
+ * first open:
+ */
+#ifdef _JFS_OS2
+ /*
+ * establish access to the single/shared (already open) log device
+ */
+ logdevfp = (void *) logStrat2;
+ log->strat2p = logStrat2;
+ log->strat3p = logStrat3;
+
+ log->l2pbsize = 9; /* todo: when OS/2 have multiple external log */
+#endif /* _JFS_OS2 */
+
+ /*
+ * initialize log:
+ */
+ if (rc = lmLogInit(log))
+ goto errout20;
+
+ /*
+ * add file system to log active file system list
+ */
+ if (rc = lmLogFileSystem(log, fsdev, 1))
+ goto errout30;
+
+ /*
+ * insert log device into log device list
+ */
+ out:
+#endif /* _STILL_TO_PORT */
+ jFYI(1, ("lmLogOpen: exit(0)\n"));
+ return 0;
+
+ /*
+ * unwind on error
+ */
+#ifdef _STILL_TO_PORT
+ errout30: /* unwind lbmLogInit() */
+ lbmLogShutdown(log);
+
+ errout20: /* close external log device */
+
+#endif /* _STILL_TO_PORT */
+ errout10: /* free log inode */
+ kfree(log);
+ /* do not leave the caller holding a pointer to freed memory */
+ *logptr = NULL;
+
+ jFYI(1, ("lmLogOpen: exit(%d)\n", rc));
+ return rc;
+}
+
+
+/*
+ * NAME: lmLogInit()
+ *
+ * FUNCTION: log initialization at first log open.
+ *
+ * logredo() (or logformat()) should have been run previously.
+ * initialize the log inode from log superblock.
+ * set the log state in the superblock to LOGMOUNT and
+ * write SYNCPT log record.
+ *
+ * PARAMETER: log - log structure
+ *
+ * RETURN: 0 - if ok
+ * EINVAL - bad log magic number, superblock dirty, or
+ * (in-line log) size mismatch with log superblock
+ * errors returned by lbmLogInit(), lbmRead(), lbmIOWait()
+ *
+ * serialization: single first open thread
+ *
+ * NOTE(review): when the final lbmIOWait(bpsuper, lbmFREE) fails the
+ * unwind falls through errout30 -> errout20 and calls lbmFree() on
+ * bpsuper; if lbmIOWait() already released the buffer because of
+ * lbmFREE (not visible here) that is a second release - confirm.
+ */
+static int lmLogInit(log_t * log)
+{
+ int rc = 0;
+ lrd_t lrd;
+ logsuper_t *logsuper;
+ lbuf_t *bpsuper;
+ lbuf_t *bp;
+ logpage_t *lp;
+ int lsn;
+
+ jFYI(1, ("lmLogInit: log:0x%p\n", log));
+
+ /*
+ * log inode is overlaid on generic inode where
+ * dinode have been zeroed out by iRead();
+ */
+
+ /*
+ * initialize log i/o
+ */
+ if ((rc = lbmLogInit(log)))
+ return rc;
+
+ /*
+ * validate log superblock (log page 1)
+ */
+ if ((rc = lbmRead(log, 1, &bpsuper)))
+ goto errout10;
+
+ logsuper = (logsuper_t *) bpsuper->l_ldata;
+
+ if (logsuper->magic != cpu_to_le32(LOGMAGIC)) {
+ jERROR(1, ("*** Log Format Error ! ***\n"));
+ rc = EINVAL;
+ goto errout20;
+ }
+
+ /* logredo() should have been run successfully. */
+ if (logsuper->state != cpu_to_le32(LOGREDONE)) {
+ jERROR(1, ("*** Log Is Dirty ! ***\n"));
+ rc = EINVAL;
+ goto errout20;
+ }
+
+ /* initialize log inode from log superblock */
+ if (log->flag & JFS_INLINELOG) {
+ if (log->size != le32_to_cpu(logsuper->size)) {
+ rc = EINVAL;
+ goto errout20;
+ }
+ jFYI(0,
+ ("lmLogInit: inline log:0x%p base:0x%Lx size:0x%x\n",
+ log, (unsigned long long) log->base, log->size));
+ } else {
+ log->size = le32_to_cpu(logsuper->size);
+ jFYI(0,
+ ("lmLogInit: external log:0x%p base:0x%Lx size:0x%x\n",
+ log, (unsigned long long) log->base, log->size));
+ }
+
+ log->flag |= JFS_GROUPCOMMIT;
+/*
+ log->flag |= JFS_LAZYCOMMIT;
+*/
+ /* split the end-of-log address into page number and offset */
+ log->page = le32_to_cpu(logsuper->end) / LOGPSIZE;
+ log->eor = le32_to_cpu(logsuper->end) - (LOGPSIZE * log->page);
+
+ /*
+ * initialize for log append write mode
+ */
+ /* establish current/end-of-log page/buffer */
+ if ((rc = lbmRead(log, log->page, &bp)))
+ goto errout20;
+
+ lp = (logpage_t *) bp->l_ldata;
+
+ jFYI(1, ("lmLogInit: lsn:0x%x page:%d eor:%d:%d\n",
+ le32_to_cpu(logsuper->end), log->page, log->eor,
+ le16_to_cpu(lp->h.eor)));
+
+// ASSERT(log->eor == lp->h.eor);
+
+ log->bp = bp;
+ bp->l_pn = log->page;
+ bp->l_eor = log->eor;
+
+ /* initialize the group commit serialization lock */
+ LOGGC_LOCK_INIT(log);
+
+ /* if current page is full, move on to next page */
+ if (log->eor >= LOGPSIZE - LOGPTLRSIZE)
+ lmNextPage(log);
+
+ /* allocate/initialize the log write serialization lock */
+ LOG_LOCK_INIT(log);
+
+ /*
+ * initialize log syncpoint
+ */
+ /*
+ * write the first SYNCPT record with syncpoint = 0
+ * (i.e., log redo up to HERE !);
+ * remove current page from lbm write queue at end of pageout
+ * (to write log superblock update), but do not release to freelist;
+ */
+ lrd.logtid = 0;
+ lrd.backchain = 0;
+ lrd.type = cpu_to_le16(LOG_SYNCPT);
+ lrd.length = 0;
+ lrd.log.syncpt.sync = 0;
+ lsn = lmWriteRecord(log, NULL, &lrd, NULL);
+ bp = log->bp; /* lmWriteRecord() may have moved to a new page */
+ bp->l_ceor = bp->l_eor;
+ lp = (logpage_t *) bp->l_ldata;
+ lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor);
+ lbmWrite(log, bp, lbmWRITE | lbmSYNC, 0);
+ if ((rc = lbmIOWait(bp, 0)))
+ goto errout30;
+
+ /* initialize logsync parameters
+ * (logsize excludes two pages; lmNextPage() notes that page 0
+ * is never used and page 1 is the superblock)
+ */
+ log->logsize = (log->size - 2) << L2LOGPSIZE;
+ log->lsn = lsn;
+ log->syncpt = lsn;
+ log->sync = log->syncpt;
+ log->nextsync = LOGSYNC_DELTA(log->logsize);
+ init_waitqueue_head(&log->syncwait);
+
+ jFYI(1, ("lmLogInit: lsn:0x%x syncpt:0x%x sync:0x%x\n",
+ log->lsn, log->syncpt, log->sync));
+
+ LOGSYNC_LOCK_INIT(log);
+
+ INIT_LIST_HEAD(&log->synclist);
+
+ log->cqueue.head = log->cqueue.tail = 0;
+
+ log->count = 0;
+
+ /*
+ * initialize for lazy/group commit
+ */
+ log->clsn = lsn;
+
+ /*
+ * update/write superblock: state LOGMOUNT, serial number + 1
+ */
+ logsuper->state = cpu_to_le32(LOGMOUNT);
+ log->serial = le32_to_cpu(logsuper->serial) + 1;
+ logsuper->serial = cpu_to_le32(log->serial);
+ lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC);
+ if ((rc = lbmIOWait(bpsuper, lbmFREE)))
+ goto errout30;
+
+ jFYI(1, ("lmLogInit: exit(%d)\n", rc));
+ return 0;
+
+ /*
+ * unwind on error
+ */
+ errout30: /* release log page */
+ lbmFree(bp);
+
+ errout20: /* release log superblock */
+ lbmFree(bpsuper);
+
+ errout10: /* unwind lbmLogInit() */
+ lbmLogShutdown(log);
+
+ jFYI(1, ("lmLogInit: exit(%d)\n", rc));
+ return rc;
+}
+
+
+/*
+ * NAME: lmLogClose()
+ *
+ * FUNCTION: remove file system <ipmnt> from active list of log <iplog>
+ * and close it on last close.
+ *
+ * as compiled (without _STILL_TO_PORT) this only calls
+ * lmLogShutdown(log); sb is not used on that path.
+ *
+ * PARAMETER: sb - superblock
+ * log - log inode
+ *
+ * RETURN: errors from subroutines
+ *
+ * serialization:
+ */
+int lmLogClose(struct super_block *sb, log_t * log)
+{
+ int rc;
+
+ jFYI(1, ("lmLogClose: log:0x%p\n", log));
+
+ /*
+ * in-line log in host file system
+ */
+// inlineLog:
+#ifdef _STILL_TO_PORT
+ if (log->flag & JFS_INLINELOG) {
+ rc = lmLogShutdown(log);
+
+ goto out1;
+ }
+
+ /*
+ * external log as separate logical volume
+ */
+ externalLog:
+
+ /* serialize open/close between multiple file systems
+ * associated with the log
+ */
+ IWRITE_LOCK(iplog);
+
+ /*
+ * remove file system from log active file system list
+ */
+ rc = lmLogFileSystem(log, fsdev, 0);
+
+ if (iplog->i_count > 1)
+ goto out2;
+
+ /*
+ * last close: shut down log
+ * (keep the first error: rc1 is reported only if rc is still 0)
+ */
+ rc = ((rc1 = lmLogShutdown(log)) && rc == 0) ? rc1 : rc;
+
+ out1:
+#else /* _STILL_TO_PORT */
+ rc = lmLogShutdown(log);
+#endif /* _STILL_TO_PORT */
+
+// out2:
+
+ jFYI(0, ("lmLogClose: exit(%d)\n", rc));
+ return rc;
+}
+
+
+/*
+ * NAME: lmLogShutdown()
+ *
+ * FUNCTION: log shutdown at last LogClose().
+ *
+ * wait for the commit queue and the logsync list to drain,
+ * write log syncpt record,
+ * update log superblock: state = LOGREDONE, end = lsn of the
+ * last SYNCPT record,
+ * then release the per log i/o buffers.
+ *
+ * PARAMETER: log - log inode
+ *
+ * RETURN: 0 - success
+ * errors returned by lbmRead() or by lbmIOWait() on the
+ * log superblock write
+ *
+ * serialization: single last close thread
+ */
+static int lmLogShutdown(log_t * log)
+{
+ int rc;
+ lrd_t lrd;
+ int lsn;
+ logsuper_t *logsuper;
+ lbuf_t *bpsuper;
+ lbuf_t *bp;
+ logpage_t *lp;
+
+ jFYI(1, ("lmLogShutdown: log:0x%p\n", log));
+
+ if (log->cqueue.head || !list_empty(&log->synclist)) {
+ /*
+ * If there was very recent activity, we may need to wait
+ * for the lazycommit thread to catch up:
+ * poll up to 800 times, sleeping HZ / 4 ticks per round
+ */
+ int i;
+
+ for (i = 0; i < 800; i++) { /* Too much? */
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(HZ / 4);
+ if ((log->cqueue.head == NULL) &&
+ list_empty(&log->synclist))
+ break;
+ }
+ }
+ assert(log->cqueue.head == NULL);
+ assert(list_empty(&log->synclist));
+
+ /*
+ * We need to make sure all of the "written" metapages
+ * actually make it to disk
+ */
+ fsync_no_super(log->sb->s_bdev);
+
+ /*
+ * write the last SYNCPT record with syncpoint = 0
+ * (i.e., log redo up to HERE !)
+ */
+ lrd.logtid = 0;
+ lrd.backchain = 0;
+ lrd.type = cpu_to_le16(LOG_SYNCPT);
+ lrd.length = 0;
+ lrd.log.syncpt.sync = 0;
+ lsn = lmWriteRecord(log, NULL, &lrd, NULL);
+ bp = log->bp;
+ lp = (logpage_t *) bp->l_ldata;
+ lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor);
+ lbmWrite(log, log->bp, lbmWRITE | lbmRELEASE | lbmSYNC, 0);
+ lbmIOWait(log->bp, lbmFREE); /* result of this wait is ignored */
+
+ /*
+ * synchronous update log superblock
+ * mark log state as shutdown cleanly
+ * (i.e., Log does not need to be replayed).
+ */
+ if ((rc = lbmRead(log, 1, &bpsuper)))
+ goto out;
+
+ logsuper = (logsuper_t *) bpsuper->l_ldata;
+ logsuper->state = cpu_to_le32(LOGREDONE);
+ logsuper->end = cpu_to_le32(lsn);
+ lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC);
+ rc = lbmIOWait(bpsuper, lbmFREE);
+
+ jFYI(1, ("lmLogShutdown: lsn:0x%x page:%d eor:%d\n",
+ lsn, log->page, log->eor));
+
+ out:
+ /*
+ * shutdown per log i/o
+ */
+ lbmLogShutdown(log);
+
+ if (rc) {
+ jFYI(1, ("lmLogShutdown: exit(%d)\n", rc));
+ }
+ return rc;
+}
+
+
+#ifdef _STILL_TO_PORT
+/*
+ * NAME: lmLogFileSystem()
+ *
+ * FUNCTION: insert (<activate> = true)/remove (<activate> = false)
+ * file system into/from log active file system list.
+ *
+ * the list is the bitmap logsuper->active[]; the bit is
+ * selected by MINOR(fsdev): word = minor / 32, and within
+ * the word the bit is LEFTMOSTONE >> (minor % 32).
+ *
+ * PARAMETER: log - pointer to logs inode.
+ * fsdev - dev_t of filesystem.
+ * activate - insert/remove device from active list.
+ *
+ * RETURN: 0 - success
+ * errors returned by lbmRead() or lbmIOWait().
+ *
+ * serialization: IWRITE_LOCK(log inode) held on entry/exit
+ *
+ * NOTE: only compiled under _STILL_TO_PORT.
+ */
+static int lmLogFileSystem(log_t * log, dev_t fsdev, int activate)
+{
+ int rc = 0;
+ int bit, word;
+ logsuper_t *logsuper;
+ lbuf_t *bpsuper;
+
+ /*
+ * insert/remove file system device to log active file system list.
+ */
+ if ((rc = lbmRead(log, 1, &bpsuper)))
+ return rc;
+
+ logsuper = (logsuper_t *) bpsuper->l_ldata;
+ bit = MINOR(fsdev);
+ word = bit / 32;
+ bit -= 32 * word;
+ if (activate)
+ logsuper->active[word] |=
+ cpu_to_le32((LEFTMOSTONE >> bit));
+ else
+ logsuper->active[word] &=
+ cpu_to_le32((~(LEFTMOSTONE >> bit)));
+
+ /*
+ * synchronous write log superblock:
+ *
+ * write sidestream bypassing write queue:
+ * at file system mount, log super block is updated for
+ * activation of the file system before any log record
+ * (MOUNT record) of the file system, and at file system
+ * unmount, all meta data for the file system has been
+ * flushed before log super block is updated for deactivation
+ * of the file system.
+ */
+ lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC);
+ rc = lbmIOWait(bpsuper, lbmFREE);
+
+ return rc;
+}
+#endif /* _STILL_TO_PORT */
+
+
+/*
+ * lmLogQuiesce()
+ *
+ * quiesce the log by running the regular log shutdown sequence.
+ *
+ * RETURN: value returned by lmLogShutdown()
+ */
+int lmLogQuiesce(log_t * log)
+{
+	return lmLogShutdown(log);
+}
+
+
+/*
+ * lmLogResume()
+ *
+ * reload the log extent (base address and size in log pages) from
+ * the logpxd of the file system's in-memory superblock info, then
+ * run log initialization again.
+ *
+ * RETURN: value returned by lmLogInit()
+ */
+int lmLogResume(log_t * log, struct super_block *sb)
+{
+	struct jfs_sb_info *sbinfo = JFS_SBI(sb);
+
+	/* extent start */
+	log->base = addressPXD(&sbinfo->logpxd);
+
+	/* extent length: file system blocks -> bytes -> log pages */
+	log->size =
+	    (lengthPXD(&sbinfo->logpxd) << sb->s_blocksize_bits) >> L2LOGPSIZE;
+
+	return lmLogInit(log);
+}
+
+
+/*
+ * log buffer manager (lbm)
+ * ------------------------
+ *
+ * special purpose buffer manager supporting log i/o requirements.
+ *
+ * per log write queue:
+ * log pageout occurs in serial order by fifo write queue,
+ * restricted to a single i/o in progress at any one time.
+ * the queue is a circular singly-linked list
+ * (log->wqueue points to the tail, and buffers are linked via
+ * the bp->l_wqnext field); it
+ * holds the log pages in pageout or waiting for pageout, in serial order.
+ */
+
+/*
+ * lbmLogInit()
+ *
+ * initialize per log I/O setup at lmLogInit():
+ * reset the current buffer and the write queue, then build a private
+ * freelist of LOGPAGES log buffers, each with its own page of data.
+ *
+ * RETURN: 0 - success
+ * ENOMEM - a buffer or its page could not be allocated;
+ * buffers already on the freelist are released
+ * through lbmLogShutdown()
+ */
+static int lbmLogInit(log_t * log)
+{
+	int remaining;
+	lbuf_t *buf;
+
+	jFYI(1, ("lbmLogInit: log:0x%p\n", log));
+
+	/* no current buffer, empty device write queue */
+	log->bp = NULL;
+	log->wqueue = NULL;
+
+	/*
+	 * The log owns its buffer pages; the page cache does not manage
+	 * them.  A transaction writing to the log therefore never blocks
+	 * on a page cache allocation, which could wait on kswapd while
+	 * kswapd itself is committing inodes and generating log activity
+	 * (i.e. this avoids a deadlock).
+	 */
+	init_waitqueue_head(&log->free_wait);
+
+	log->lbuf_free = NULL;
+
+	for (remaining = LOGPAGES; remaining > 0; remaining--) {
+		buf = kmalloc(sizeof(lbuf_t), GFP_KERNEL);
+		if (!buf)
+			goto error;
+
+		buf->l_ldata = (char *) __get_free_page(GFP_KERNEL);
+		if (!buf->l_ldata) {
+			/* not on the freelist yet: release it here */
+			kfree(buf);
+			goto error;
+		}
+
+		buf->l_log = log;
+		init_waitqueue_head(&buf->l_ioevent);
+
+		/* push onto the head of the freelist */
+		buf->l_freelist = log->lbuf_free;
+		log->lbuf_free = buf;
+	}
+
+	return 0;
+
+      error:
+	lbmLogShutdown(log);
+	return ENOMEM;
+}
+
+
+/*
+ * lbmLogShutdown()
+ *
+ * finalize per log I/O setup at lmLogShutdown():
+ * release every log buffer on the freelist (data page and
+ * descriptor) and clear the current buffer pointer.
+ *
+ * the freelist head is detached before the buffers are released so
+ * that log->lbuf_free never refers to freed memory and a repeated
+ * call is a no-op instead of a double free.
+ *
+ * NOTE: only buffers that are on the freelist are released here.
+ */
+static void lbmLogShutdown(log_t * log)
+{
+	lbuf_t *lbuf;
+
+	jFYI(1, ("lbmLogShutdown: log:0x%p\n", log));
+
+	/* take the whole list away from the log first */
+	lbuf = log->lbuf_free;
+	log->lbuf_free = NULL;
+
+	while (lbuf) {
+		/* fetch the link before the descriptor is freed */
+		lbuf_t *next = lbuf->l_freelist;
+		free_page((unsigned long) lbuf->l_ldata);
+		kfree(lbuf);
+		lbuf = next;
+	}
+
+	log->bp = NULL;
+}
+
+
+/*
+ * lbmAllocate()
+ *
+ * allocate an empty log buffer
+ *
+ * takes the buffer at the head of log->lbuf_free and resets its
+ * flag, write queue link, freelist link and committed eor;
+ * the buffer is bound to log page <pn> and its block address is
+ * computed from log->base.
+ *
+ * NOTE(review): LCACHE_SLEEP_COND is defined elsewhere - presumably
+ * it sleeps on log->free_wait until the freelist is non-empty, so
+ * this never returns NULL; confirm.
+ */
+static lbuf_t *lbmAllocate(log_t * log, int pn)
+{
+ lbuf_t *bp;
+ unsigned long flags;
+
+ /*
+ * recycle from log buffer freelist if any
+ */
+ LCACHE_LOCK(flags);
+ LCACHE_SLEEP_COND(log->free_wait, (bp = log->lbuf_free), flags);
+ log->lbuf_free = bp->l_freelist;
+ LCACHE_UNLOCK(flags);
+
+ bp->l_flag = 0;
+
+ bp->l_wqnext = NULL;
+ bp->l_freelist = NULL;
+
+ bp->l_pn = pn;
+ bp->l_blkno = log->base + (pn << (L2LOGPSIZE - log->l2bsize)); /* log page number -> file system block */
+ bp->l_ceor = 0;
+
+ return bp;
+}
+
+
+/*
+ * lbmFree()
+ *
+ * release a log buffer to freelist;
+ * locking wrapper: runs lbmfree() under LCACHE_LOCK().
+ */
+static void lbmFree(lbuf_t * bp)
+{
+	unsigned long irqflags;
+
+	LCACHE_LOCK(irqflags);
+	lbmfree(bp);
+	LCACHE_UNLOCK(irqflags);
+}
+
+/*
+ * lbmfree()
+ *
+ * put a log buffer back at the head of its log's freelist and wake
+ * up sleepers on log->free_wait; the buffer must not be on the
+ * write queue (asserted).  takes no lock itself.
+ */
+static void lbmfree(lbuf_t * bp)
+{
+	log_t *owner = bp->l_log;
+
+	assert(bp->l_wqnext == NULL);
+
+	/* push the buffer onto the head of the freelist */
+	bp->l_freelist = owner->lbuf_free;
+	owner->lbuf_free = bp;
+
+	/* a buffer is available again */
+	wake_up(&owner->free_wait);
+}
+
+
+#ifdef _THIS_IS_NOT_USED
+/*
+ * lbmRelease()
+ *
+ * remove the log buffer from log device write queue;
+ *
+ * the buffer removed is log->bp; with lbmFREE set in <flag> it is
+ * also returned to the freelist.
+ *
+ * NOTE: compiled out (_THIS_IS_NOT_USED).
+ * NOTE(review): the multi element case unlinks bp by pointing the
+ * tail at bp->l_wqnext, which is only correct if bp is the head of
+ * the queue - confirm before reviving this code.
+ */
+static void lbmRelease(log_t * log, uint flag)
+{
+ lbuf_t *bp, *tail;
+ unsigned long flags;
+
+ bp = log->bp;
+
+ LCACHE_LOCK(flags);
+
+ tail = log->wqueue;
+
+ /* single element queue */
+ if (bp == tail) {
+ log->wqueue = NULL;
+ bp->l_wqnext = NULL;
+ }
+ /* multi element queue */
+ else {
+ tail->l_wqnext = bp->l_wqnext;
+ bp->l_wqnext = NULL;
+ }
+
+ if (flag & lbmFREE)
+ lbmfree(bp);
+
+ LCACHE_UNLOCK(flags);
+}
+#endif /* _THIS_IS_NOT_USED */
+
+
+/*
+ * NAME: lbmRedrive
+ *
+ * FUNCTION: add a log buffer to the log redrive list and wake up
+ * the jfsIOtask thread
+ *
+ * PARAMETER:
+ * bp - log buffer
+ *
+ * NOTES:
+ * Takes log_redrive_lock (irq-safe) around the list update;
+ * the wakeup is issued after the lock has been dropped.
+ */
+static inline void lbmRedrive(lbuf_t *bp)
+{
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&log_redrive_lock, irqflags);
+
+	/* push bp onto the head of the redrive list */
+	bp->l_redrive_next = log_redrive_list;
+	log_redrive_list = bp;
+
+	spin_unlock_irqrestore(&log_redrive_lock, irqflags);
+
+	wake_up_process(jfsIOtask);
+}
+
+
+/*
+ * lbmRead()
+ *
+ * allocate a log buffer for log page <pn>, submit a single-segment
+ * read of LOGPSIZE bytes from log->dev into it and wait until the
+ * buffer flag is no longer exactly lbmREAD.
+ *
+ * PARAMETER: log - log structure
+ * pn - log page number to read
+ * bpp - allocated log buffer (out)
+ *
+ * RETURN: 0 (always)
+ *
+ * NOTE(review): the result of bio_alloc() is not checked and no i/o
+ * error is reported to the caller; the flag change that ends the
+ * wait is presumably made by lbmIODone() - confirm.
+ */
+static int lbmRead(log_t * log, int pn, lbuf_t ** bpp)
+{
+ struct bio *bio;
+ lbuf_t *bp;
+
+ /*
+ * allocate a log buffer
+ */
+ *bpp = bp = lbmAllocate(log, pn);
+ jFYI(1, ("lbmRead: bp:0x%p pn:0x%x\n", bp, pn));
+
+ bp->l_flag |= lbmREAD;
+
+ bio = bio_alloc(GFP_NOFS, 1);
+
+ bio->bi_sector = bp->l_blkno << (log->l2bsize - 9); /* file system block -> 512-byte sector */
+ bio->bi_dev = log->dev;
+ bio->bi_io_vec[0].bv_page = virt_to_page(bp->l_ldata);
+ bio->bi_io_vec[0].bv_len = LOGPSIZE;
+ bio->bi_io_vec[0].bv_offset = 0;
+
+ bio->bi_vcnt = 1;
+ bio->bi_idx = 0;
+ bio->bi_size = LOGPSIZE;
+
+ bio->bi_end_io = lbmIODone;
+ bio->bi_private = bp;
+ submit_bio(READ, bio);
+ run_task_queue(&tq_disk);
+
+ wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD));
+
+ return 0;
+}
+
+
+/*
+ * lbmWrite()
+ *
+ * buffer at head of pageout queue stays after completion of
+ * partial-page pageout and redriven by explicit initiation of
+ * pageout by caller until full-page pageout is completed and
+ * released.
+ *
+ * device driver i/o done redrives pageout of new buffer at
+ * head of pageout queue when current buffer at head of pageout
+ * queue is released at the completion of its full-page pageout.
+ *
+ * PARAMETER: log - log structure
+ * bp - log buffer to queue/write
+ * flag - lbm* flags; stored into bp->l_flag. without
+ * lbmWRITE the buffer is only enqueued.
+ * cant_block - non-zero: do not start the i/o here, hand
+ * the buffer to lbmRedrive() instead
+ *
+ * i/o is started only when bp is at the head of the write queue and
+ * lbmWRITE is set. unless cant_block or lbmSYNC is given,
+ * LOGGC_LOCK is dropped around lbmStartIO() and retaken afterwards,
+ * so the caller must hold it in that case.
+ *
+ * LOGGC_LOCK() serializes lbmWrite() by lmNextPage() and lmGroupCommit().
+ * LCACHE_LOCK() serializes xflag between lbmWrite() and lbmIODone()
+ */
+static void lbmWrite(log_t * log, lbuf_t * bp, int flag, int cant_block)
+{
+ lbuf_t *tail;
+ unsigned long flags;
+
+ jFYI(1, ("lbmWrite: bp:0x%p flag:0x%x pn:0x%x\n",
+ bp, flag, bp->l_pn));
+
+ /* map the logical block address to physical block address */
+ bp->l_blkno =
+ log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize));
+
+ LCACHE_LOCK(flags); /* disable+lock */
+
+ /*
+ * initialize buffer for device driver
+ */
+ bp->l_flag = flag;
+
+ /*
+ * insert bp at tail of write queue associated with log
+ *
+ * (request is either for bp already/currently at head of queue
+ * or new bp to be inserted at tail)
+ *
+ * the queue is circular: log->wqueue is the tail and
+ * tail->l_wqnext is the head.
+ */
+ tail = log->wqueue;
+
+ /* is buffer not already on write queue ? */
+ if (bp->l_wqnext == NULL) {
+ /* insert at tail of wqueue */
+ if (tail == NULL) {
+ log->wqueue = bp;
+ bp->l_wqnext = bp;
+ } else {
+ log->wqueue = bp;
+ bp->l_wqnext = tail->l_wqnext;
+ tail->l_wqnext = bp;
+ }
+
+ tail = bp;
+ }
+
+ /* nothing to start unless buffer is at head of wqueue and for write */
+ if ((bp != tail->l_wqnext) || !(flag & lbmWRITE)) {
+ LCACHE_UNLOCK(flags); /* unlock+enable */
+ return;
+ }
+
+ LCACHE_UNLOCK(flags); /* unlock+enable */
+
+ if (cant_block)
+ lbmRedrive(bp);
+ else if (flag & lbmSYNC)
+ lbmStartIO(bp);
+ else {
+ LOGGC_UNLOCK(log);
+ lbmStartIO(bp);
+ LOGGC_LOCK(log);
+ }
+}
+
+
+/*
+ * lbmDirectWrite()
+ *
+ * start the pageout of a log buffer without placing it on the log
+ * write queue (sidestream write, e.g., of the log superblock).
+ *
+ * PARAMETER:
+ *	log	- log the buffer is written to
+ *	bp	- log buffer
+ *	flag	- pageout control flags; lbmDIRECT is added to them
+ */
+static void lbmDirectWrite(log_t * log, lbuf_t * bp, int flag)
+{
+	jEVENT(0, ("lbmDirectWrite: bp:0x%p flag:0x%x pn:0x%x\n",
+		   bp, flag, bp->l_pn));
+
+	/* translate the log page number into a block address */
+	bp->l_blkno = log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize));
+
+	/* lbmDIRECT makes lbmIODone() bypass the write queue handling */
+	bp->l_flag = flag | lbmDIRECT;
+
+	/* hand the page to the device */
+	lbmStartIO(bp);
+}
+
+
+/*
+ * NAME:	lbmStartIO()
+ *
+ * FUNCTION:	build a single-segment bio covering the buffer's log page
+ *		and submit it as a WRITE; completion is reported to
+ *		lbmIODone() with bp as the bio's private data
+ *
+ * PARAMETER:
+ *	bp	- log buffer to write; l_blkno must already be set
+ *
+ * RETURN:	none
+ *
+ * serialization: LCACHE_LOCK() is NOT held during log i/o;
+ */
+void lbmStartIO(lbuf_t * bp)
+{
+	log_t *log = bp->l_log;
+	struct bio *bio;
+
+	jFYI(1, ("lbmStartIO\n"));
+
+	bio = bio_alloc(GFP_NOFS, 1);
+
+	/* where to write to */
+	bio->bi_dev = log->dev;
+	bio->bi_sector = bp->l_blkno << (log->l2bsize - 9);
+
+	/* one segment: the buffer's whole data page */
+	bio->bi_io_vec[0].bv_page = virt_to_page(bp->l_ldata);
+	bio->bi_io_vec[0].bv_offset = 0;
+	bio->bi_io_vec[0].bv_len = LOGPSIZE;
+	bio->bi_size = LOGPSIZE;
+	bio->bi_idx = 0;
+	bio->bi_vcnt = 1;
+
+	/* completion callback and its argument */
+	bio->bi_private = bp;
+	bio->bi_end_io = lbmIODone;
+
+	submit_bio(WRITE, bio);
+
+	INCREMENT(lmStat.submitted);
+	run_task_queue(&tq_disk);
+
+	jFYI(1, ("lbmStartIO done\n"));
+}
+
+
+/*
+ * lbmIOWait()
+ *
+ * sleep until lbmDONE is set in the buffer's flags, then report the
+ * outcome of the i/o.
+ *
+ * PARAMETER:
+ *	bp	- log buffer whose i/o is awaited
+ *	flag	- if lbmFREE is set the buffer is passed to lbmfree()
+ *		  before returning
+ *
+ * RETURN:	0	- i/o completed without lbmERROR
+ *		EIO	- lbmERROR was set on the buffer
+ */
+static int lbmIOWait(lbuf_t * bp, int flag)
+{
+	unsigned long irq_state;
+	int status;
+
+	jFYI(1,
+	     ("lbmIOWait1: bp:0x%p flag:0x%x:0x%x\n", bp, bp->l_flag,
+	      flag));
+
+	LCACHE_LOCK(irq_state);	/* disable+lock */
+
+	LCACHE_SLEEP_COND(bp->l_ioevent, (bp->l_flag & lbmDONE), irq_state);
+
+	if (bp->l_flag & lbmERROR)
+		status = EIO;
+	else
+		status = 0;
+
+	if (flag & lbmFREE)
+		lbmfree(bp);
+
+	LCACHE_UNLOCK(irq_state);	/* unlock+enable */
+
+	jFYI(1,
+	     ("lbmIOWait2: bp:0x%p flag:0x%x:0x%x\n", bp, bp->l_flag,
+	      flag));
+	return status;
+}
+
+/*
+ * lbmIODone()
+ *
+ * completion handler (installed as bi_end_io by lbmRead() and
+ * lbmStartIO()) for log page reads and writes; the log buffer is
+ * recovered from bio->bi_private.
+ *
+ * under LCACHE_LOCK the buffer is marked lbmDONE (plus lbmERROR if the
+ * bio is not BIO_UPTODATE) and the bio is released, then:
+ * - read: lbmREAD is cleared and the waiter on l_ioevent is woken;
+ * - write: lbmWRITE is cleared and log->clsn is updated;
+ * - lbmDIRECT write: the waiter is woken, the write queue is not touched;
+ * - queued write: if lbmRELEASE is set the buffer is removed from the
+ * head of the write queue and, in a multi element queue, the new head
+ * is handed to lbmRedrive() when it has lbmWRITE set; the buffer is
+ * then disposed of according to lbmSYNC, lbmGC or (asynchronous)
+ * lbmFREE.
+ *
+ * executed at INTIODONE level
+ */
+static void lbmIODone(struct bio *bio)
+{
+ lbuf_t *bp = bio->bi_private;
+ lbuf_t *nextbp, *tail;
+ log_t *log;
+ unsigned long flags;
+
+ /*
+ * get back jfs buffer bound to the i/o buffer
+ */
+ jEVENT(0, ("lbmIODone: bp:0x%p flag:0x%x\n", bp, bp->l_flag));
+
+ LCACHE_LOCK(flags); /* disable+lock */
+
+ bp->l_flag |= lbmDONE;
+
+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+ bp->l_flag |= lbmERROR;
+
+ jERROR(1, ("lbmIODone: I/O error in JFS log\n"));
+ }
+ bio_put(bio); /* bio is not referenced past this point */
+
+ /*
+ * pagein completion
+ */
+ if (bp->l_flag & lbmREAD) {
+ bp->l_flag &= ~lbmREAD;
+
+ LCACHE_UNLOCK(flags); /* unlock+enable */
+
+ /* wakeup I/O initiator */
+ LCACHE_WAKEUP(&bp->l_ioevent);
+
+ return;
+ }
+
+ /*
+ * pageout completion
+ *
+ * the bp at the head of write queue has completed pageout.
+ *
+ * if single-commit/full-page pageout, remove the current buffer
+ * from head of pageout queue, and redrive pageout with
+ * the new buffer at head of pageout queue;
+ * otherwise, the partial-page pageout buffer stays at
+ * the head of pageout queue to be redriven for pageout
+ * by lmGroupCommit() until full-page pageout is completed.
+ */
+ bp->l_flag &= ~lbmWRITE;
+ INCREMENT(lmStat.pagedone);
+
+ /* update committed lsn */
+ log = bp->l_log;
+ log->clsn = (bp->l_pn << L2LOGPSIZE) + bp->l_ceor;
+
+ if (bp->l_flag & lbmDIRECT) {
+ /* sidestream write: unlike the other paths, woken before the unlock */
+ LCACHE_WAKEUP(&bp->l_ioevent);
+ LCACHE_UNLOCK(flags);
+ return;
+ }
+
+ tail = log->wqueue; /* wqueue points at the tail of the circular queue */
+
+ /* single element queue */
+ if (bp == tail) {
+ /* remove head buffer of full-page pageout
+ * from log device write queue
+ */
+ if (bp->l_flag & lbmRELEASE) {
+ log->wqueue = NULL;
+ bp->l_wqnext = NULL;
+ }
+ }
+ /* multi element queue */
+ else {
+ /* remove head buffer of full-page pageout
+ * from log device write queue
+ */
+ if (bp->l_flag & lbmRELEASE) {
+ nextbp = tail->l_wqnext = bp->l_wqnext; /* successor becomes head */
+ bp->l_wqnext = NULL;
+
+ /*
+ * redrive pageout of next page at head of write queue:
+ * redrive next page without any bound tblk
+ * (i.e., page w/o any COMMIT records), or
+ * first page of new group commit which has been
+ * queued after current page (subsequent pageout
+ * is performed synchronously, except page without
+ * any COMMITs) by lmGroupCommit() as indicated
+ * by lbmWRITE flag;
+ */
+ if (nextbp->l_flag & lbmWRITE) {
+ /*
+ * We can't do the I/O at interrupt time.
+ * The jfsIO thread can do it
+ */
+ lbmRedrive(nextbp);
+ }
+ }
+ }
+
+ /*
+ * synchronous pageout:
+ *
+ * buffer has not necessarily been removed from write queue
+ * (e.g., synchronous write of partial-page with COMMIT):
+ * leave buffer for i/o initiator to dispose
+ */
+ if (bp->l_flag & lbmSYNC) {
+ LCACHE_UNLOCK(flags); /* unlock+enable */
+
+ /* wakeup I/O initiator */
+ LCACHE_WAKEUP(&bp->l_ioevent);
+ }
+
+ /*
+ * Group Commit pageout:
+ */
+ else if (bp->l_flag & lbmGC) {
+ LCACHE_UNLOCK(flags);
+ lmPostGC(bp); /* called without LCACHE_LOCK held */
+ }
+
+ /*
+ * asynchronous pageout:
+ *
+ * buffer must have been removed from write queue:
+ * insert buffer at head of freelist where it can be recycled
+ */
+ else {
+ assert(bp->l_flag & lbmRELEASE);
+ assert(bp->l_flag & lbmFREE);
+ lbmfree(bp); /* called with LCACHE_LOCK still held */
+
+ LCACHE_UNLOCK(flags); /* unlock+enable */
+ }
+ return;
+}
+
+/*
+ * NAME:	jfsIOWait()
+ *
+ * FUNCTION:	body of the "jfsIO" kernel thread: start the i/o for every
+ *		log buffer placed on log_redrive_list by lbmRedrive(), then
+ *		sleep until woken again
+ *
+ * PARAMETER:
+ *	arg	- not used
+ *
+ * RETURN:	0, once jfs_thread_stopped() reports true
+ *
+ * NOTES:
+ *	jfsIOwait is completed once after the thread has set itself up and
+ *	once more just before it returns.
+ *
+ *	The task state is changed to TASK_INTERRUPTIBLE while
+ *	log_redrive_lock is still held.  lbmRedrive() queues a buffer under
+ *	that lock and calls wake_up_process() afterwards, so any buffer
+ *	queued after the list was seen empty finds this thread already
+ *	marked as sleeping and its wakeup cannot be lost.  (Setting the
+ *	state only after dropping the lock left a window in which the
+ *	wakeup hit a still-running task and the thread then went to sleep
+ *	with work pending.)
+ */
+int jfsIOWait(void *arg)
+{
+	lbuf_t *bp;
+
+	jFYI(1, ("jfsIOWait is here!\n"));
+
+	lock_kernel();
+
+	daemonize();
+	current->tty = NULL;
+	strcpy(current->comm, "jfsIO");
+
+	unlock_kernel();
+
+	jfsIOtask = current;
+
+	/* block every signal except SIGHUP, SIGKILL, SIGSTOP and SIGCONT */
+	spin_lock_irq(&current->sigmask_lock);
+	siginitsetinv(&current->blocked,
+		      sigmask(SIGHUP) | sigmask(SIGKILL) | sigmask(SIGSTOP)
+		      | sigmask(SIGCONT));
+	spin_unlock_irq(&current->sigmask_lock);
+
+	complete(&jfsIOwait);
+
+	do {
+		spin_lock_irq(&log_redrive_lock);
+		while ((bp = log_redrive_list)) {
+			log_redrive_list = bp->l_redrive_next;
+			bp->l_redrive_next = NULL;
+
+			/* the i/o is started without the list lock held */
+			spin_unlock_irq(&log_redrive_lock);
+			lbmStartIO(bp);
+			spin_lock_irq(&log_redrive_lock);
+		}
+
+		/*
+		 * list is empty: declare the intent to sleep BEFORE
+		 * releasing the lock so that a concurrent lbmRedrive()
+		 * wakeup is not lost
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+		spin_unlock_irq(&log_redrive_lock);
+
+		schedule();
+	} while (!jfs_thread_stopped());
+
+	jFYI(1,("jfsIOWait being killed!\n"));
+	complete(&jfsIOwait);
+	return 0;
+}
+
+
+#ifdef _STILL_TO_PORT
+/*
+ * lbmDirectIODone()
+ *
+ * iodone() for lbmDirectWrite() to bypass write queue;
+ * executed at INTIODONE level;
+ *
+ * only compiled when _STILL_TO_PORT is defined; it still uses the
+ * iobuf_t interface (b_jfsbp, b_flags/B_ERROR).  lbmStartIO() installs
+ * lbmIODone() as the completion handler, and that routine handles
+ * lbmDIRECT buffers itself.
+ *
+ * marks the buffer lbmDONE (and lbmERROR on B_ERROR), clears lbmWRITE,
+ * then either wakes the initiator (lbmSYNC) or frees the buffer.
+ */
+static void lbmDirectIODone(iobuf_t * iobp)
+{
+ lbuf_t *bp;
+ unsigned long flags;
+
+ /*
+ * get back jfs buffer bound to the io buffer
+ */
+ bp = (lbuf_t *) iobp->b_jfsbp;
+ jEVENT(0,
+ ("lbmDirectIODone: bp:0x%p flag:0x%x\n", bp, bp->l_flag));
+
+ LCACHE_LOCK(flags); /* disable+lock */
+
+ bp->l_flag |= lbmDONE;
+
+ if (iobp->b_flags & B_ERROR) {
+ bp->l_flag |= lbmERROR;
+#ifdef _JFS_OS2
+ SysLogError();
+#endif
+ }
+
+ /*
+ * pageout completion
+ */
+ bp->l_flag &= ~lbmWRITE;
+
+ /*
+ * synchronous pageout:
+ */
+ if (bp->l_flag & lbmSYNC) {
+ LCACHE_UNLOCK(flags); /* unlock+enable */
+
+ /* wakeup I/O initiator */
+ LCACHE_WAKEUP(&bp->l_ioevent);
+ }
+ /*
+ * asynchronous pageout:
+ */
+ else {
+ assert(bp->l_flag & lbmRELEASE);
+ assert(bp->l_flag & lbmFREE);
+ lbmfree(bp); /* called with LCACHE_LOCK still held */
+
+ LCACHE_UNLOCK(flags); /* unlock+enable */
+ }
+}
+#endif /* _STILL_TO_PORT */
+
+#ifdef _STILL_TO_PORT
+/*
+ * NAME: lmLogFormat()/jfs_logform()
+ *
+ * FUNCTION: format file system log (ref. jfs_logform()).
+ *
+ * PARAMETERS:
+ * ipmnt - log inode (with common mount inode base);
+ * logAddress - start address of log space in FS block;
+ * logSize - length of log space in FS block;
+ *
+ * RETURN: 0 - success
+ * -1 - i/o error
+ *
+ * NOTES:
+ * only compiled when _STILL_TO_PORT is defined; the prototype in
+ * jfs_logmgr.h takes a struct super_block * where this definition
+ * takes an inode_t *.
+ * as written, rc is never changed after its initialization and the
+ * results of the rawWrite() calls are not examined, so 0 is always
+ * returned.
+ * the same buffer is reused for the superblock and for every log
+ * page; only the page headers/trailers and the SYNCPT record are
+ * rewritten in between.
+ */
+int lmLogFormat(inode_t * ipmnt, s64 logAddress, int logSize)
+{
+ int rc = 0;
+ cbuf_t *bp;
+ logsuper_t *logsuper;
+ logpage_t *lp;
+ int lspn; /* log sequence page number */
+ struct lrd *lrd_ptr;
+ int npbperpage, npages;
+
+ jFYI(0, ("lmLogFormat: logAddress:%Ld logSize:%d\n",
+ logAddress, logSize));
+
+ /* allocate a JFS buffer */
+ bp = rawAllocate();
+
+ /* map the logical block address to physical block address */
+ bp->cm_blkno = logAddress << ipmnt->i_l2bfactor;
+
+ npbperpage = LOGPSIZE >> ipmnt->i_l2pbsize; /* blocks to advance cm_blkno per log page */
+ npages = logSize / (LOGPSIZE >> ipmnt->i_l2bsize); /* log size in log pages */
+
+ /*
+ * log space:
+ *
+ * page 0 - reserved;
+ * page 1 - log superblock;
+ * page 2 - log data page: A SYNC log record is written
+ * into this page at logform time;
+ * pages 3-N - log data page: set to empty log data pages;
+ */
+ /*
+ * init log superblock: log page 1
+ */
+ logsuper = (logsuper_t *) bp->cm_cdata;
+
+ logsuper->magic = cpu_to_le32(LOGMAGIC);
+ logsuper->version = cpu_to_le32(LOGVERSION);
+ logsuper->state = cpu_to_le32(LOGREDONE);
+ logsuper->flag = cpu_to_le32(ipmnt->i_mntflag); /* ? */
+ logsuper->size = cpu_to_le32(npages);
+ logsuper->bsize = cpu_to_le32(ipmnt->i_bsize);
+ logsuper->l2bsize = cpu_to_le32(ipmnt->i_l2bsize);
+ /* end of log: just past the SYNCPT record written into page 2 below */
+ logsuper->end =
+ cpu_to_le32(2 * LOGPSIZE + LOGPHDRSIZE + LOGRDSIZE);
+
+ bp->cm_blkno += npbperpage; /* skip reserved page 0 */
+ rawWrite(ipmnt, bp, 0);
+
+ /*
+ * init pages 2 to npages-1 as log data pages:
+ *
+ * log sequence page number (lspn) initialization:
+ *
+ *   pn:    0     1     2     3                 n-1
+ *       +-----+-----+=====+=====+===.....===+=====+
+ *   lspn:            N-1    0     1           N-2
+ *                   <--- N page circular file ---->
+ *
+ * the N (= npages-2) data pages of the log is maintained as
+ * a circular file for the log records;
+ * lspn grows by 1 monotonically as each log page is written
+ * to the circular file of the log;
+ * Since the AIX DUMMY log record is dropped for this XJFS,
+ * and setLogpage() will not reset the page number even if
+ * the eor is equal to LOGPHDRSIZE. In order for binary search
+ * still work in find log end process, we have to simulate the
+ * log wrap situation at the log format time.
+ * The 1st log page written will have the highest lspn. Then
+ * the succeeding log pages will have ascending order of
+ * the lspn starting from 0, ... (N-2)
+ */
+ lp = (logpage_t *) bp->cm_cdata;
+
+ /*
+ * initialize 1st log page to be written: lspn = N - 1;
+ * a SYNCPT log record is written to this page
+ */
+ lp->h.page = lp->t.page = cpu_to_le32(npages - 3); /* N - 1 with N = npages - 2 */
+ lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE + LOGRDSIZE);
+
+ lrd_ptr = (struct lrd *) &lp->data;
+ lrd_ptr->logtid = 0;
+ lrd_ptr->backchain = 0;
+ lrd_ptr->type = cpu_to_le16(LOG_SYNCPT);
+ lrd_ptr->length = 0;
+ lrd_ptr->log.syncpt.sync = 0;
+
+ bp->cm_blkno += npbperpage;
+ rawWrite(ipmnt, bp, 0);
+
+ /*
+ * initialize succeeding log pages: lspn = 0, 1, ..., (N-2)
+ */
+ for (lspn = 0; lspn < npages - 3; lspn++) {
+ lp->h.page = lp->t.page = cpu_to_le32(lspn);
+ lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE); /* empty page */
+
+ bp->cm_blkno += npbperpage;
+ rawWrite(ipmnt, bp, 0);
+ }
+
+ /*
+ * finalize log
+ */
+ /* release the buffer */
+ rawRelease(bp);
+
+ return rc;
+}
+#endif /* _STILL_TO_PORT */
+
+
+#ifdef CONFIG_JFS_STATISTICS
+/*
+ * jfs_lmstats_read()
+ *
+ * format the log manager counters into buffer and report the part of
+ * that text which starts at offset.
+ *
+ * PARAMETER:
+ *	buffer	- receives the complete text
+ *	start	- out: set to buffer + offset
+ *	offset	- position in the text the reader is at
+ *	length	- maximum number of bytes to report
+ *	eof	- out: set to 1 when the remaining text fits in length
+ *	data	- not used
+ *
+ * RETURN:	number of bytes reported from *start (never negative)
+ */
+int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length,
+		     int *eof, void *data)
+{
+	int avail;
+
+	avail = sprintf(buffer,
+			"JFS Logmgr stats\n"
+			"================\n"
+			"commits = %d\n"
+			"writes submitted = %d\n"
+			"writes completed = %d\n",
+			lmStat.commit,
+			lmStat.submitted,
+			lmStat.pagedone);
+
+	/* discard what the reader has already consumed */
+	*start = buffer + offset;
+	avail -= offset;
+
+	if (avail <= length)
+		*eof = 1;
+	else
+		avail = length;
+
+	return (avail < 0) ? 0 : avail;
+}
+#endif /* CONFIG_JFS_STATISTICS */
diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h
new file mode 100644
index 000000000000..33cb5b5d2a04
--- /dev/null
+++ b/fs/jfs/jfs_logmgr.h
@@ -0,0 +1,499 @@
+/*
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _H_JFS_LOGMGR
+#define _H_JFS_LOGMGR
+
+
+#include "jfs_filsys.h"
+#include "jfs_lock.h"
+
+/*
+ * log manager configuration parameters
+ */
+
+/* log page size */
+#define LOGPSIZE 4096
+#define L2LOGPSIZE 12
+
+#define LOGPAGES 16 /* Log pages per mounted file system */
+
+/*
+ * log logical volume
+ *
+ * a log is used to make the commit operation on journalled
+ * files within the same logical volume group atomic.
+ * a log is implemented with a logical volume.
+ * there is one log per logical volume group.
+ *
+ * block 0 of the log logical volume is not used (ipl etc).
+ * block 1 contains a log "superblock" and is used by logFormat(),
+ * lmLogInit(), lmLogShutdown(), and logRedo() to record status
+ * of the log but is not otherwise used during normal processing.
+ * blocks 2 - (N-1) are used to contain log records.
+ *
+ * when a volume group is varied-on-line, logRedo() must have
+ * been executed before the file systems (logical volumes) in
+ * the volume group can be mounted.
+ */
+/*
+ * log superblock (block 1 of logical volume)
+ */
+#define LOGSUPER_B 1
+#define LOGSTART_B 2
+
+#define LOGMAGIC 0x87654321
+#define LOGVERSION 1
+
+typedef struct { /* log superblock; lmLogFormat() stores the fields little-endian */
+ u32 magic; /* 4: log lv identifier */
+ s32 version; /* 4: version number */
+ s32 serial; /* 4: log open/mount counter */
+ s32 size; /* 4: size in number of LOGPSIZE blocks */
+ s32 bsize; /* 4: logical block size in byte */
+ s32 l2bsize; /* 4: log2 of bsize */
+
+ u32 flag; /* 4: option */
+ u32 state; /* 4: state - see below */
+
+ s32 end; /* 4: addr of last log record set by logredo */
+ u32 active[8]; /* 32: active file systems bit vector */
+ s32 rsrvd[LOGPSIZE / 4 - 17]; /* pad: the fields above occupy 17 4-byte words */
+} logsuper_t;
+
+/* log flag: commit option (see jfs_filsys.h) */
+
+/* log state */
+#define LOGMOUNT 0 /* log mounted by lmLogInit() */
+#define LOGREDONE 1 /* log shutdown by lmLogShutdown().
+ * log redo completed by logredo().
+ */
+#define LOGWRAP 2 /* log wrapped */
+#define LOGREADERR 3 /* log read error detected in logredo() */
+
+
+/*
+ * log logical page
+ *
+ * (this comment should be rewritten !)
+ * the header and trailer structures (h,t) will normally have
+ * the same page and eor value.
+ * An exception to this occurs when a complete page write is not
+ * accomplished on a power failure. Since the hardware may "split write"
+ * sectors in the page, any out of order sequence may occur during powerfail
+ * and needs to be recognized during log replay. The xor value is
+ * an "exclusive or" of all log words in the page up to eor. This
+ * 32 bit xor is stored with the top 16 bits in the header and the
+ * bottom 16 bits in the trailer. logredo can easily recognize pages
+ * that were not completed by reconstructing this xor and checking
+ * the log page.
+ *
+ * Previous versions of the operating system did not allow split
+ * writes and detected partially written records in logredo by
+ * ordering the updates to the header, trailer, and the move of data
+ * into the logdata area. The order: (1) data is moved (2) header
+ * is updated (3) trailer is updated. In logredo, when the header
+ * differed from the trailer, the header and trailer were reconciled
+ * as follows: if h.page != t.page they were set to the smaller of
+ * the two and h.eor and t.eor set to 8 (i.e. empty page). if (only)
+ * h.eor != t.eor they were set to the smaller of their two values.
+ */
+typedef struct { /* one LOGPSIZE log page: header, record area, trailer */
+ struct { /* header */
+ s32 page; /* 4: log sequence page number */
+ s16 rsrvd; /* 2: */
+ s16 eor; /* 2: end-of-log offset of last record write */
+ } h;
+
+ s32 data[LOGPSIZE / 4 - 4]; /* log record area: page minus 8-byte header and 8-byte trailer */
+
+ struct { /* trailer */
+ s32 page; /* 4: normally the same as h.page */
+ s16 rsrvd; /* 2: */
+ s16 eor; /* 2: normally the same as h.eor */
+ } t;
+} logpage_t;
+
+#define LOGPHDRSIZE 8 /* log page header size */
+#define LOGPTLRSIZE 8 /* log page trailer size */
+
+
+/*
+ * log record
+ *
+ * (this comment should be rewritten !)
+ * jfs uses only "after" log records (only a single writer is allowed
+ * in a page, pages are written to temporary paging space
+ * if they must be written to disk before commit, and i/o is
+ * scheduled for modified pages to their home location after
+ * the log records containing the after values and the commit
+ * record is written to the log on disk, undo discards the copy
+ * in main-memory.)
+ *
+ * a log record consists of a data area of variable length followed by
+ * a descriptor of fixed size LOGRDSIZE bytes.
+ * the data area is rounded up to an integral number of 4-bytes and
+ * must be no longer than LOGPSIZE.
+ * the descriptor is of size of multiple of 4-bytes and aligned on a
+ * 4-byte boundary.
+ * records are packed one after the other in the data area of log pages.
+ * (sometimes a DUMMY record is inserted so that at least one record ends
+ * on every page or the longest record is placed on at most two pages).
+ * the field eor in page header/trailer points to the byte following
+ * the last record on a page.
+ */
+
+/* log record types */
+#define LOG_COMMIT 0x8000
+#define LOG_SYNCPT 0x4000
+#define LOG_MOUNT 0x2000
+#define LOG_REDOPAGE 0x0800
+#define LOG_NOREDOPAGE 0x0080
+#define LOG_NOREDOINOEXT 0x0040
+#define LOG_UPDATEMAP 0x0008
+#define LOG_NOREDOFILE 0x0001
+
+/* REDOPAGE/NOREDOPAGE log record data type */
+#define LOG_INODE 0x0001
+#define LOG_XTREE 0x0002
+#define LOG_DTREE 0x0004
+#define LOG_BTROOT 0x0010
+#define LOG_EA 0x0020
+#define LOG_ACL 0x0040
+#define LOG_DATA 0x0080
+#define LOG_NEW 0x0100
+#define LOG_EXTEND 0x0200
+#define LOG_RELOCATE 0x0400
+#define LOG_DIR_XTREE 0x0800 /* Xtree is in directory inode */
+
+/* UPDATEMAP log record descriptor type */
+#define LOG_ALLOCXADLIST 0x0080
+#define LOG_ALLOCPXDLIST 0x0040
+#define LOG_ALLOCXAD 0x0020
+#define LOG_ALLOCPXD 0x0010
+#define LOG_FREEXADLIST 0x0008
+#define LOG_FREEPXDLIST 0x0004
+#define LOG_FREEXAD 0x0002
+#define LOG_FREEPXD 0x0001
+
+
+typedef struct lrd { /* log record descriptor; LOGRDSIZE is its size */
+ /*
+ * type independent area
+ */
+ s32 logtid; /* 4: log transaction identifier */
+ s32 backchain; /* 4: ptr to prev record of same transaction */
+ u16 type; /* 2: record type */
+ s16 length; /* 2: length of data in record (in byte) */
+ s32 aggregate; /* 4: file system lv/aggregate */
+ /* (16) */
+
+ /*
+ * type dependent area (20)
+ */
+ union {
+
+ /*
+ * COMMIT: commit
+ *
+ * transaction commit: no type-dependent information;
+ */
+
+ /*
+ * REDOPAGE: after-image
+ *
+ * apply after-image;
+ *
+ * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format;
+ */
+ struct {
+ u32 fileset; /* 4: fileset number */
+ u32 inode; /* 4: inode number */
+ u16 type; /* 2: REDOPAGE record type */
+ s16 l2linesize; /* 2: log2 of line size */
+ pxd_t pxd; /* 8: on-disk page pxd */
+ } redopage; /* (20) */
+
+ /*
+ * NOREDOPAGE: the page is freed
+ *
+ * do not apply after-image records which precede this record
+ * in the log with the same page block number to this page.
+ *
+ * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format;
+ */
+ struct {
+ s32 fileset; /* 4: fileset number */
+ u32 inode; /* 4: inode number */
+ u16 type; /* 2: NOREDOPAGE record type */
+ s16 rsrvd; /* 2: reserved */
+ pxd_t pxd; /* 8: on-disk page pxd */
+ } noredopage; /* (20) */
+
+ /*
+ * UPDATEMAP: update block allocation map
+ *
+ * either in-line PXD,
+ * or out-of-line XADLIST;
+ *
+ * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format;
+ */
+ struct {
+ u32 fileset; /* 4: fileset number */
+ u32 inode; /* 4: inode number */
+ u16 type; /* 2: UPDATEMAP record type */
+ s16 nxd; /* 2: number of extents */
+ pxd_t pxd; /* 8: pxd */
+ } updatemap; /* (20) */
+
+ /*
+ * NOREDOINOEXT: the inode extent is freed
+ *
+ * do not apply after-image records which precede this
+ * record in the log with any of the 4 page block
+ * numbers in this inode extent.
+ *
+ * NOTE: The fileset and pxd fields MUST remain in
+ * the same fields in the REDOPAGE record format.
+ *
+ */
+ struct {
+ s32 fileset; /* 4: fileset number */
+ s32 iagnum; /* 4: IAG number */
+ s32 inoext_idx; /* 4: inode extent index */
+ pxd_t pxd; /* 8: on-disk page pxd */
+ } noredoinoext; /* (20) */
+
+ /*
+ * SYNCPT: log sync point
+ *
+ * replay log up to syncpt address specified;
+ */
+ struct {
+ s32 sync; /* 4: syncpt address (0 = here) */
+ } syncpt;
+
+ /*
+ * MOUNT: file system mount
+ *
+ * file system mount: no type-dependent information;
+ */
+
+ /*
+ * ? FREEXTENT: free specified extent(s)
+ *
+ * free specified extent(s) from block allocation map
+ * N.B.: nextents should be length of data/sizeof(xad_t)
+ */
+ struct {
+ s32 type; /* 4: FREEXTENT record type */
+ s32 nextent; /* 4: number of extents */
+
+ /* data: PXD or XAD list */
+ } freextent;
+
+ /*
+ * ? NOREDOFILE: this file is freed
+ *
+ * do not apply records which precede this record in the log
+ * with the same inode number.
+ *
+ * NOREDOFILE must be the first to be written at commit
+ * (last to be read in logredo()) - it prevents
+ * replay of preceding updates of all preceding generations
+ * of the inumber esp. the on-disk inode itself,
+ * but does NOT prevent
+ * replay of the
+ * (NOTE(review): the sentence above is unfinished in the
+ * original source)
+ */
+ struct {
+ s32 fileset; /* 4: fileset number */
+ u32 inode; /* 4: inode number */
+ } noredofile;
+
+ /*
+ * ? NEWPAGE:
+ *
+ * metadata type dependent
+ */
+ struct {
+ s32 fileset; /* 4: fileset number */
+ u32 inode; /* 4: inode number */
+ s32 type; /* 4: NEWPAGE record type */
+ pxd_t pxd; /* 8: on-disk page pxd */
+ } newpage;
+
+ /*
+ * ? DUMMY: filler
+ *
+ * no type-dependent information
+ */
+ } log;
+} lrd_t; /* (36) */
+
+#define LOGRDSIZE (sizeof(struct lrd))
+
+/*
+ * line vector descriptor
+ */
+typedef struct {
+ s16 offset;
+ s16 length;
+} lvd_t;
+
+
+/*
+ * log logical volume
+ *
+ * in-memory descriptor of one log (log_t); the "N:" prefixes in the
+ * field comments are the original authors' size annotations.
+ */
+typedef struct jfs_log {
+
+ struct super_block *sb; /* 4: This is used to sync metadata
+ * before writing syncpt. Will
+ * need to be a list if we share
+ * the log between fs's
+ */
+ kdev_t dev; /* 4: log lv number */
+ struct file *devfp; /* 4: log device file */
+ s32 serial; /* 4: log mount serial number */
+
+ s64 base; /* @8: log extent address (inline log ) */
+ int size; /* 4: log size in log page (in page) */
+ int l2bsize; /* 4: log2 of bsize */
+
+ uint flag; /* 4: flag */
+ uint state; /* 4: state */
+
+ struct lbuf *lbuf_free; /* 4: free lbufs */
+ wait_queue_head_t free_wait; /* 4: */
+
+ /* log write */
+ int logtid; /* 4: log tid */
+ int page; /* 4: page number of eol page */
+ int eor; /* 4: eor of last record in eol page */
+ struct lbuf *bp; /* 4: current log page buffer */
+
+ struct semaphore loglock; /* 4: log write serialization lock */
+
+ /* syncpt */
+ int nextsync; /* 4: bytes to write before next syncpt */
+ int active; /* 4: */
+ int syncbarrier; /* 4: */
+ wait_queue_head_t syncwait; /* 4: */
+
+ /* commit */
+ uint cflag; /* 4: */
+ struct { /* 8: FIFO commit queue header */
+ struct tblock *head;
+ struct tblock *tail;
+ } cqueue;
+ int gcrtc; /* 4: GC_READY transaction count */
+ struct tblock *gclrt; /* 4: latest GC_READY transaction */
+ spinlock_t gclock; /* 4: group commit lock */
+ int logsize; /* 4: log data area size in byte */
+ int lsn; /* 4: end-of-log */
+ int clsn; /* 4: clsn (committed lsn; set by lbmIODone() on pageout completion) */
+ int syncpt; /* 4: addr of last syncpt record */
+ int sync; /* 4: addr from last logsync() */
+ struct list_head synclist; /* 8: logsynclist anchor */
+ spinlock_t synclock; /* 4: synclist lock */
+ struct lbuf *wqueue; /* 4: log pageout queue (points at the tail of the circular l_wqnext list) */
+ int count; /* 4: count */
+} log_t;
+
+/*
+ * group commit flag
+ */
+/* log_t */
+#define logGC_PAGEOUT 0x00000001
+
+/* tblock_t/lbuf_t */
+#define tblkGC_QUEUE 0x0001
+#define tblkGC_READY 0x0002
+#define tblkGC_COMMIT 0x0004
+#define tblkGC_COMMITTED 0x0008
+#define tblkGC_EOP 0x0010
+#define tblkGC_FREE 0x0020
+#define tblkGC_LEADER 0x0040
+#define tblkGC_ERROR 0x0080
+#define tblkGC_LAZY 0x0100 // D230860
+#define tblkGC_UNLOCKED 0x0200 // D230860
+
+/*
+ * log cache buffer header
+ *
+ * one lbuf_t describes one log page held in memory together with the
+ * state of its i/o; the "N:" prefixes in the field comments are the
+ * original authors' size annotations.
+ */
+typedef struct lbuf {
+ log_t *l_log; /* 4: log associated with buffer */
+
+ /*
+ * data buffer base area
+ */
+ uint l_flag; /* 4: pageout control flags */
+
+ struct lbuf *l_wqnext; /* 4: write queue link (NULL when not on the write queue) */
+ struct lbuf *l_freelist; /* 4: freelistlink */
+
+ int l_pn; /* 4: log page number */
+ int l_eor; /* 4: log record eor */
+ int l_ceor; /* 4: committed log record eor */
+
+ s64 l_blkno; /* 8: log page block number */
+ caddr_t l_ldata; /* 4: data page */
+
+ wait_queue_head_t l_ioevent; /* 4: i/o done event */
+ struct page *l_page; /* The page itself */
+} lbuf_t;
+
+/* Reuse l_freelist for redrive list (see lbmRedrive() and jfsIOWait()) */
+#define l_redrive_next l_freelist
+
+/*
+ * logsynclist block
+ *
+ * common logsyncblk prefix for jbuf_t and tblock_t
+ */
+typedef struct logsyncblk {
+ u16 xflag; /* flags */
+ u16 flag; /* only meaningful in tblock_t */
+ lid_t lid; /* lock id */
+ s32 lsn; /* log sequence number */
+ struct list_head synclist; /* log sync list link */
+} logsyncblk_t;
+
+/*
+ * logsynclist serialization (per log)
+ */
+
+#define LOGSYNC_LOCK_INIT(log) spin_lock_init(&(log)->synclock)
+#define LOGSYNC_LOCK(log) spin_lock(&(log)->synclock)
+#define LOGSYNC_UNLOCK(log) spin_unlock(&(log)->synclock)
+
+/*
+ * compute the difference in bytes of lsn from sync point
+ *
+ * diff receives (lsn) - (log)->syncpt, with (log)->logsize added when
+ * that difference is negative (wrap-around of the log), so diff must be
+ * a signed lvalue.  log is evaluated more than once.
+ *
+ * the macro expands to a plain { } block rather than do { } while (0):
+ * "logdiff(...);" used as the unbraced body of an if that has an else
+ * will not compile.
+ */
+#define logdiff(diff, lsn, log)\
+{\
+ diff = (lsn) - (log)->syncpt;\
+ if (diff < 0)\
+ diff += (log)->logsize;\
+}
+
+extern int lmLogOpen(struct super_block *sb, log_t ** log);
+extern int lmLogClose(struct super_block *sb, log_t * log);
+extern int lmLogSync(log_t * log, int nosyncwait);
+extern int lmLogQuiesce(log_t * log);
+extern int lmLogResume(log_t * log, struct super_block *sb);
+extern int lmLogFormat(struct super_block *sb, s64 logAddress, int logSize);
+
+#endif /* _H_JFS_LOGMGR */
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
new file mode 100644
index 000000000000..ee144b0fcad8
--- /dev/null
+++ b/fs/jfs/jfs_metapage.c
@@ -0,0 +1,686 @@
+/*
+ *
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Module: jfs/jfs_metapage.c
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_metapage.h"
+#include "jfs_txnmgr.h"
+#include "jfs_debug.h"
+
+extern struct task_struct *jfsCommitTask; /* compared with current in alloc_metapage() */
+static unsigned int metapages = 1024; /* ??? Need a better number; rounded up in metapage_init() */
+static unsigned int free_metapages; /* number of entries on meta_free_list */
+static metapage_t *metapage_buf; /* backing array of all metapage structures */
+static unsigned long meta_order; /* page order of the metapage_buf allocation */
+static metapage_t *meta_free_list = NULL; /* free metapages, chained through hash_next */
+static spinlock_t meta_lock = SPIN_LOCK_UNLOCKED; /* guards hash table, free list and counts */
+static wait_queue_head_t meta_wait; /* alloc_metapage() sleeps here for free metapages */
+
+#ifdef CONFIG_JFS_STATISTICS
+struct {
+ uint pagealloc; /* # of page allocations */
+ uint pagefree; /* # of page frees */
+ uint lockwait; /* # of sleeping lock_metapage() calls */
+ uint allocwait; /* # of sleeping alloc_metapage() calls */
+} mpStat;
+#endif
+
+
+#define HASH_BITS 10 /* 1024 buckets: one 4K page when pointers are 4 bytes */
+#define HASH_SIZE (1 << HASH_BITS)
+static metapage_t **hash_table = NULL; /* bucket heads; allocated in metapage_init() */
+static unsigned long hash_order; /* page order of the hash_table allocation */
+
+/* Non-zero if META_locked is currently set on mp (does not take the lock) */
+static inline int metapage_locked(struct metapage *mp)
+{
+ return test_bit(META_locked, &mp->flag);
+}
+/* Try to set META_locked; returns 0 if we got the lock, non-zero if it was already held */
+static inline int trylock_metapage(struct metapage *mp)
+{
+ return test_and_set_bit(META_locked, &mp->flag);
+}
+/* Clear META_locked and wake tasks sleeping on mp->wait */
+static inline void unlock_metapage(struct metapage *mp)
+{
+ clear_bit(META_locked, &mp->flag);
+ wake_up(&mp->wait);
+}
+/* Contended path of lock_metapage(): wait on mp->wait until META_locked is ours; meta_lock is held on entry and exit */
+static void __lock_metapage(struct metapage *mp)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ INCREMENT(mpStat.lockwait);
+
+ add_wait_queue_exclusive(&mp->wait, &wait);
+ do {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (metapage_locked(mp)) {
+ spin_unlock(&meta_lock); /* meta_lock is dropped around the sleep */
+ schedule();
+ spin_lock(&meta_lock);
+ }
+ } while (trylock_metapage(mp)); /* non-zero: someone else still holds the lock */
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&mp->wait, &wait);
+}
+
+/* needs meta_lock; may drop and re-take it while waiting for the metapage lock */
+static inline void lock_metapage(struct metapage *mp)
+{
+ if (trylock_metapage(mp)) /* non-zero: already locked by someone else */
+ __lock_metapage(mp);
+}
+
+/* We're currently re-evaluating the method we use to write metadata
+ * pages. Currently, we have to make sure there are no dirty buffer_heads
+ * hanging around after we free the metadata page, since the same
+ * physical disk blocks may be used in a different address space and we
+ * can't write old data over the good data.
+ *
+ * The best way to do this now is with block_invalidate_page. However, this
+ * is only available in the newer kernels and is not exported to modules.
+ * block_flushpage is the next best, but it too is not exported to modules.
+ *
+ * In a module, about the best we have is generic_buffer_fdatasync. This
+ * synchronously writes any dirty buffers. This is not optimal, but it will
+ * keep old dirty buffers from overwriting newer data. Its range is given in
+ * page-cache indices, hence mp->page->index (as in sync_metapage), not mp->index.
+ */
+static inline void invalidate_page(metapage_t *mp)
+{
+#ifdef MODULE
+ generic_buffer_fdatasync(mp->mapping->host, mp->page->index, mp->page->index + 1);
+#else
+ lock_page(mp->page);
+ block_flushpage(mp->page, 0);
+ UnlockPage(mp->page);
+#endif
+}
+
+int __init metapage_init(void)
+{
+ unsigned int i;
+ metapage_t *mp, *last = NULL;
+
+ /* Set up the metapage pool and hash table; returns 0, or -ENOMEM on failure */
+ init_waitqueue_head(&meta_wait);
+
+ /* Size the pool: smallest page order that holds at least 'metapages' entries */
+ for (meta_order = 0;
+ ((PAGE_SIZE << meta_order) / sizeof(metapage_t)) < metapages;
+ meta_order++);
+ metapages = (PAGE_SIZE << meta_order) / sizeof(metapage_t);
+
+ jFYI(1, ("metapage_init: metapage size = %Zd, metapages = %d\n",
+ sizeof(metapage_t), metapages));
+
+ metapage_buf =
+ (metapage_t *) __get_free_pages(GFP_KERNEL, meta_order);
+ if (!metapage_buf)
+ return -ENOMEM;
+ memset(metapage_buf, 0, PAGE_SIZE << meta_order);
+
+ /* Mark every entry free and chain it onto the free list through hash_next */
+ mp = metapage_buf;
+ for (i = 0; i < metapages; i++, mp++) {
+ mp->flag = 0;
+ set_bit(META_free, &mp->flag);
+ init_waitqueue_head(&mp->wait);
+ mp->hash_next = last;
+ last = mp;
+ }
+ meta_free_list = last;
+ free_metapages = metapages;
+
+ /* Now the hash list: HASH_SIZE bucket pointers, all initially NULL */
+ for (hash_order = 0;
+ ((PAGE_SIZE << hash_order) / sizeof(void *)) < HASH_SIZE;
+ hash_order++);
+ hash_table =
+ (metapage_t **) __get_free_pages(GFP_KERNEL, hash_order);
+ if (!hash_table) {
+ /* Give the pool back so a failed init leaks nothing */
+ free_pages((unsigned long) metapage_buf, meta_order);
+ metapage_buf = meta_free_list = NULL;
+ return -ENOMEM;
+ }
+ memset(hash_table, 0, PAGE_SIZE << hash_order);
+
+ return 0;
+}
+/* Release the metapage array and hash table allocated by metapage_init() */
+void metapage_exit(void)
+{
+ free_pages((unsigned long) metapage_buf, meta_order);
+ free_pages((unsigned long) hash_table, hash_order);
+ metapage_buf = 0; /* This is a signal to the jfsIOwait thread */
+}
+
+/*
+ * Get metapage structure from freelist
+ * *dropped_lock is set TRUE if the call had to wait for a free metapage,
+ * Caller holds meta_lock (NOTE(review): presumably released while waiting, see __SLEEP_COND — confirm)
+ */
+static metapage_t *alloc_metapage(int *dropped_lock)
+{
+ metapage_t *new;
+
+ *dropped_lock = FALSE;
+
+ /*
+ * Reserve two metapages for the lazy commit thread. Otherwise
+ * we may deadlock with holders of metapages waiting for tlocks
+ * that lazy thread should be freeing.
+ */
+ if ((free_metapages < 3) && (current != jfsCommitTask)) {
+ INCREMENT(mpStat.allocwait);
+ *dropped_lock = TRUE;
+ __SLEEP_COND(meta_wait, (free_metapages > 2),
+ spin_lock(&meta_lock), spin_unlock(&meta_lock));
+ }
+
+ assert(meta_free_list); /* the commit task never waits above, so the list must not be empty for it */
+
+ new = meta_free_list;
+ meta_free_list = new->hash_next; /* free list is chained through hash_next */
+ free_metapages--;
+
+ return new;
+}
+
+/*
+ * Put metapage on freelist (holding meta_lock) and wake alloc_metapage() sleepers
+ */
+static inline void __free_metapage(metapage_t * mp)
+{
+ mp->flag = 0; /* drops every flag, including META_locked */
+ set_bit(META_free, &mp->flag);
+ mp->hash_next = meta_free_list;
+ meta_free_list = mp;
+ free_metapages++;
+ wake_up(&meta_wait);
+}
+
+/*
+ * Put metapage on freelist (not holding meta_lock): locking wrapper for __free_metapage()
+ */
+static inline void free_metapage(metapage_t * mp)
+{
+ spin_lock(&meta_lock);
+ __free_metapage(mp);
+ spin_unlock(&meta_lock);
+}
+
+/* Basically same hash as in pagemap.h, but using our hash table: divide the
+ * mapping pointer by the lowest set bit of sizeof(struct inode), add index,
+ * add in the value shifted right by HASH_BITS, mask to a hash_table bucket. */
+static metapage_t **meta_hash(struct address_space *mapping,
+ unsigned long index)
+{
+#define i (((unsigned long)mapping)/ \
+ (sizeof(struct inode) & ~(sizeof(struct inode) -1 )))
+#define s(x) ((x) + ((x) >> HASH_BITS))
+ return hash_table + (s(i + index) & (HASH_SIZE - 1));
+#undef i
+#undef s
+}
+
+static metapage_t *search_hash(metapage_t ** hash_ptr,
+ struct address_space *mapping,
+ unsigned long index)
+{
+ metapage_t *cur = *hash_ptr;
+
+ /* Walk the bucket chain until an entry matches both mapping and index */
+ while (cur && (cur->mapping != mapping || cur->index != index))
+ cur = cur->hash_next;
+
+ /* cur is the match, or NULL once the chain is exhausted */
+ return cur;
+}
+
+static void add_to_hash(metapage_t * mp, metapage_t ** hash_ptr)
+{
+ metapage_t *old_head = *hash_ptr; /* NULL when the bucket is empty */
+ mp->hash_prev = NULL;
+ mp->hash_next = old_head;
+ if (old_head)
+ old_head->hash_prev = mp;
+ *hash_ptr = mp; /* mp becomes the new bucket head */
+ list_add(&mp->inode_list, &JFS_IP(mp->mapping->host)->mp_list);
+}
+
+static void remove_from_hash(metapage_t * mp, metapage_t ** hash_ptr)
+{
+ metapage_t *before = mp->hash_prev;
+ metapage_t *after = mp->hash_next;
+
+ list_del(&mp->inode_list); /* leave the per-inode list first */
+ if (!before) {
+ assert(*hash_ptr == mp); /* no predecessor: mp must be the bucket head */
+ *hash_ptr = after;
+ } else
+ before->hash_next = after;
+ if (after)
+ after->hash_prev = before;
+}
+
+/*
+ * Direct address space operations
+ */
+/* get_block callback: maps lblock to the same block number via map_bh(); sets BH_New when create is non-zero */
+static int direct_get_block(struct inode *ip, sector_t lblock,
+ struct buffer_head *bh_result, int create)
+{
+ if (create)
+ bh_result->b_state |= (1UL << BH_New);
+
+ map_bh(bh_result, ip->i_sb, lblock);
+
+ return 0;
+}
+/* writepage operation: block_write_full_page() with direct_get_block as the block mapper */
+static int direct_writepage(struct page *page)
+{
+ return block_write_full_page(page, direct_get_block);
+}
+/* readpage operation: block_read_full_page() with direct_get_block; fp is unused */
+static int direct_readpage(struct file *fp, struct page *page)
+{
+ return block_read_full_page(page, direct_get_block);
+}
+/* prepare_write operation: block_prepare_write() over [from, to) with direct_get_block; file is unused */
+static int direct_prepare_write(struct file *file, struct page *page,
+ unsigned from, unsigned to)
+{
+ return block_prepare_write(page, from, to, direct_get_block);
+}
+/* bmap operation: generic_block_bmap() with direct_get_block as the block mapper */
+static int direct_bmap(struct address_space *mapping, long block)
+{
+ return generic_block_bmap(mapping, block, direct_get_block);
+}
+/* address_space operations built on direct_get_block(); sync_page and commit_write use the generic helpers */
+struct address_space_operations direct_aops = {
+ readpage: direct_readpage,
+ writepage: direct_writepage,
+ sync_page: block_sync_page,
+ prepare_write: direct_prepare_write,
+ commit_write: generic_commit_write,
+ bmap: direct_bmap,
+};
+
+metapage_t *__get_metapage(struct inode *inode,
+ unsigned long lblock, unsigned int size,
+ int absolute, unsigned long new)
+{
+ int dropped_lock;
+ metapage_t **hash_ptr;
+ int l2BlocksPerPage;
+ int l2bsize;
+ struct address_space *mapping;
+ metapage_t *mp;
+ unsigned long page_index;
+ unsigned long page_offset;
+ /* Return the locked, referenced metapage for block lblock ('size' bytes), or NULL on failure */
+ jFYI(1, ("__get_metapage: inode = 0x%p, lblock = 0x%lx\n",
+ inode, lblock));
+ /* absolute selects the superblock-wide direct_mapping instead of the inode's own mapping */
+ if (absolute)
+ mapping = JFS_SBI(inode->i_sb)->direct_mapping;
+ else
+ mapping = inode->i_mapping;
+
+ spin_lock(&meta_lock);
+
+ hash_ptr = meta_hash(mapping, lblock);
+
+ mp = search_hash(hash_ptr, mapping, lblock);
+ if (mp) {
+ page_found:
+ if (test_bit(META_discard, &mp->flag)) {
+ assert(new); /* It's okay to reuse a discarded
+ * metapage only if the caller
+ * expects it to be empty (new) */
+ clear_bit(META_discard, &mp->flag);
+ }
+ mp->count++;
+ jFYI(1, ("__get_metapage: found 0x%p, in hash\n", mp));
+ assert(mp->logical_size == size);
+ lock_metapage(mp);
+ spin_unlock(&meta_lock);
+ } else {
+ l2bsize = inode->i_sb->s_blocksize_bits;
+ l2BlocksPerPage = PAGE_CACHE_SHIFT - l2bsize;
+ page_index = lblock >> l2BlocksPerPage;
+ page_offset = (lblock - (page_index << l2BlocksPerPage)) <<
+ l2bsize;
+ if ((page_offset + size) > PAGE_SIZE) {
+ spin_unlock(&meta_lock);
+ jERROR(1, ("MetaData crosses page boundary!!\n"));
+ return NULL;
+ }
+
+ mp = alloc_metapage(&dropped_lock);
+ if (dropped_lock) {
+ /* alloc_metapage blocked, we need to search the hash
+ * again. (The goto is ugly, maybe we'll clean this
+ * up in the future.)
+ */
+ metapage_t *mp2;
+ mp2 = search_hash(hash_ptr, mapping, lblock);
+ if (mp2) {
+ __free_metapage(mp);
+ mp = mp2;
+ goto page_found;
+ }
+ }
+ mp->flag = 0;
+ lock_metapage(mp);
+ if (absolute)
+ set_bit(META_absolute, &mp->flag);
+ mp->xflag = COMMIT_PAGE;
+ mp->count = 1;
+ atomic_set(&mp->nohomeok,0);
+ mp->mapping = mapping;
+ mp->index = lblock;
+ mp->page = 0;
+ mp->logical_size = size;
+ add_to_hash(mp, hash_ptr);
+ spin_unlock(&meta_lock);
+
+ if (new) {
+ jFYI(1,
+ ("__get_metapage: Calling grab_cache_page\n"));
+ mp->page = grab_cache_page(mapping, page_index);
+ if (!mp->page) {
+ jERROR(1, ("grab_cache_page failed!\n"));
+ spin_lock(&meta_lock);
+ remove_from_hash(mp, hash_ptr);
+ __free_metapage(mp);
+ spin_unlock(&meta_lock);
+ return NULL;
+ } else
+ INCREMENT(mpStat.pagealloc);
+ } else {
+ jFYI(1,
+ ("__get_metapage: Calling read_cache_page\n"));
+ mp->page = /* indexed by page, like grab_cache_page above; lblock is a block number */
+ read_cache_page(mapping, page_index,
+ (filler_t *) mapping->a_ops->
+ readpage, NULL);
+ if (IS_ERR(mp->page)) {
+ jERROR(1, ("read_cache_page failed!\n"));
+ spin_lock(&meta_lock);
+ remove_from_hash(mp, hash_ptr);
+ __free_metapage(mp);
+ spin_unlock(&meta_lock);
+ return NULL;
+ } else
+ INCREMENT(mpStat.pagealloc);
+ lock_page(mp->page);
+ }
+ mp->data = (void *) (kmap(mp->page) + page_offset);
+ }
+ jFYI(1, ("__get_metapage: returning = 0x%p\n", mp));
+ return mp;
+}
+/* Take another reference on mp and lock it; with force set, never wait for the lock */
+void hold_metapage(metapage_t * mp, int force)
+{
+ spin_lock(&meta_lock);
+
+ mp->count++;
+ /* force: if mp is already locked, flag it META_forced instead; release_metapage() then only drops the count */
+ if (force) {
+ ASSERT (!(test_bit(META_forced, &mp->flag)));
+ if (trylock_metapage(mp))
+ set_bit(META_forced, &mp->flag);
+ } else
+ lock_metapage(mp);
+
+ spin_unlock(&meta_lock);
+}
+/* Write mp's byte range of its page through the mapping's prepare_write/commit_write; always clears META_dirty */
+static void __write_metapage(metapage_t * mp)
+{
+ struct inode *ip = (struct inode *) mp->mapping->host;
+ unsigned long page_index;
+ unsigned long page_offset;
+ int rc;
+ int l2bsize = ip->i_sb->s_blocksize_bits;
+ int l2BlocksPerPage = PAGE_CACHE_SHIFT - l2bsize;
+
+ jFYI(1, ("__write_metapage: mp = 0x%p\n", mp));
+
+ if (test_bit(META_discard, &mp->flag)) {
+ /*
+ * This metadata is no longer valid
+ */
+ clear_bit(META_dirty, &mp->flag);
+ return;
+ }
+ /* byte offset of block mp->index inside its page (mp->index is a block number) */
+ page_index = mp->page->index;
+ page_offset =
+ (mp->index - (page_index << l2BlocksPerPage)) << l2bsize;
+
+ rc = mp->mapping->a_ops->prepare_write(NULL, mp->page, page_offset,
+ page_offset +
+ mp->logical_size);
+ if (rc) {
+ jERROR(1, ("prepare_write return %d!\n", rc));
+ ClearPageUptodate(mp->page);
+ kunmap(mp->page); /* NOTE(review): release_metapage() already kunmap'ed this page before calling us — confirm this is balanced */
+ clear_bit(META_dirty, &mp->flag);
+ return;
+ }
+ rc = mp->mapping->a_ops->commit_write(NULL, mp->page, page_offset,
+ page_offset +
+ mp->logical_size);
+ if (rc) {
+ jERROR(1, ("commit_write returned %d\n", rc));
+ }
+ /* a commit_write error is only logged; the dirty flag is cleared regardless */
+ clear_bit(META_dirty, &mp->flag);
+
+ jFYI(1, ("__write_metapage done\n"));
+}
+
+void release_metapage(metapage_t * mp)
+{
+ log_t *log;
+
+ /* Drop one reference to mp and unlock it; the final release writes it back if dirty and frees it */
+ jFYI(1,
+ ("release_metapage: mp = 0x%p, flag = 0x%lx\n", mp,
+ mp->flag));
+
+ spin_lock(&meta_lock);
+ if (test_bit(META_forced, &mp->flag)) { /* hold_metapage(force) never took the lock: drop its count only */
+ clear_bit(META_forced, &mp->flag);
+ mp->count--;
+ spin_unlock(&meta_lock);
+ return;
+ }
+
+ /* Keep mp cached while other references or nohomeok holds remain; */
+ /* otherwise unhash it, write it back if dirty, and free it. */
+ assert(mp->count);
+ if (--mp->count || atomic_read(&mp->nohomeok)) {
+ unlock_metapage(mp);
+ spin_unlock(&meta_lock);
+ } else {
+ remove_from_hash(mp, meta_hash(mp->mapping, mp->index));
+ spin_unlock(&meta_lock);
+
+ if (mp->page) {
+ kunmap(mp->page);
+ mp->data = 0;
+ if (test_bit(META_dirty, &mp->flag))
+ __write_metapage(mp);
+ UnlockPage(mp->page);
+ if (test_bit(META_sync, &mp->flag)) {
+ sync_metapage(mp);
+ clear_bit(META_sync, &mp->flag);
+ }
+
+ if (test_bit(META_discard, &mp->flag))
+ invalidate_page(mp);
+
+ page_cache_release(mp->page);
+ INCREMENT(mpStat.pagefree);
+ }
+
+ if (mp->lsn) {
+ /*
+ * Remove metapage from logsynclist.
+ */
+ log = mp->log;
+ LOGSYNC_LOCK(log);
+ mp->log = 0;
+ mp->lsn = 0;
+ mp->clsn = 0;
+ log->count--;
+ list_del(&mp->synclist);
+ LOGSYNC_UNLOCK(log);
+ }
+
+ free_metapage(mp);
+ }
+ jFYI(1, ("release_metapage: done\n"));
+}
+
+void invalidate_metapages(struct inode *ip, unsigned long addr,
+ unsigned long len)
+{
+ metapage_t **hash_ptr;
+ unsigned long lblock;
+ int l2BlocksPerPage = PAGE_CACHE_SHIFT - ip->i_sb->s_blocksize_bits;
+ struct address_space *mapping = ip->i_mapping;
+ metapage_t *mp;
+#ifndef MODULE
+ struct page *page;
+#endif
+ /* Discard cached metadata for blocks [addr, addr + len) of ip, one page-sized step at a time */
+ /*
+ * First, mark metapages to discard. They will eventually be
+ * released, but should not be written.
+ */
+ for (lblock = addr; lblock < addr + len;
+ lblock += 1 << l2BlocksPerPage) {
+ hash_ptr = meta_hash(mapping, lblock);
+ spin_lock(&meta_lock);
+ mp = search_hash(hash_ptr, mapping, lblock);
+ if (mp) {
+ set_bit(META_discard, &mp->flag);
+ spin_unlock(&meta_lock);
+ /* If in the metapage cache, we've got the page locked.
+ * generic_buffer_fdatasync takes page indices, not block numbers. */
+#ifdef MODULE
+ UnlockPage(mp->page);
+ generic_buffer_fdatasync(mp->mapping->host,
+ mp->page->index, mp->page->index + 1);
+ lock_page(mp->page);
+#else
+ block_flushpage(mp->page, 0);
+#endif
+ } else {
+ spin_unlock(&meta_lock);
+#ifdef MODULE
+ generic_buffer_fdatasync(ip, lblock >> l2BlocksPerPage,
+ (lblock >> l2BlocksPerPage) + 1);
+#else
+ page = find_lock_page(mapping,
+ lblock >> l2BlocksPerPage);
+ if (page) {
+ block_flushpage(page, 0);
+ UnlockPage(page);
+ page_cache_release(page); /* drop the reference find_lock_page took */
+ }
+#endif
+ }
+ }
+}
+/* Discard every metapage on inode's mp_list and drop its page, then truncate the inode's page cache */
+void invalidate_inode_metapages(struct inode *inode)
+{
+ struct list_head *ptr;
+ metapage_t *mp;
+
+ spin_lock(&meta_lock);
+ list_for_each(ptr, &JFS_IP(inode)->mp_list) {
+ mp = list_entry(ptr, metapage_t, inode_list);
+ clear_bit(META_dirty, &mp->flag);
+ set_bit(META_discard, &mp->flag);
+ kunmap(mp->page); /* NOTE(review): assumes mp->page is non-NULL for every listed metapage — confirm */
+ UnlockPage(mp->page);
+ page_cache_release(mp->page);
+ INCREMENT(mpStat.pagefree);
+ mp->data = 0;
+ mp->page = 0; /* release_metapage() skips its page teardown when page is 0 */
+ }
+ spin_unlock(&meta_lock);
+ truncate_inode_pages(inode->i_mapping, 0);
+}
+
+#ifdef CONFIG_JFS_STATISTICS
+int jfs_mpstat_read(char *buffer, char **start, off_t offset, int length,
+ int *eof, void *data)
+{
+ int remaining;
+
+ /* Format the complete report first; the read window is applied below */
+ remaining = sprintf(buffer,
+ "JFS Metapage statistics\n"
+ "=======================\n"
+ "metapages in use = %d\n"
+ "page allocations = %d\n"
+ "page frees = %d\n"
+ "lock waits = %d\n"
+ "allocation waits = %d\n",
+ metapages - free_metapages,
+ mpStat.pagealloc,
+ mpStat.pagefree,
+ mpStat.lockwait,
+ mpStat.allocwait);
+
+ /* Hand back only the part at and after 'offset' */
+ *start = buffer + offset;
+ remaining -= offset;
+
+ if (remaining <= length)
+ *eof = 1; /* the rest of the report fits in this read */
+ else
+ remaining = length;
+
+ /* an offset past the end of the report yields an empty read */
+ if (remaining < 0)
+ remaining = 0;
+ return remaining;
+}
+#endif
diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h
new file mode 100644
index 000000000000..334f77c4c705
--- /dev/null
+++ b/fs/jfs/jfs_metapage.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_METAPAGE
+#define _H_JFS_METAPAGE
+
+#include <linux/pagemap.h>
+
+typedef struct metapage {
+ /* Common logsyncblk prefix (see jfs_logmgr.h) */
+ u16 xflag; /* set to COMMIT_PAGE by __get_metapage() */
+ u16 unused; /* occupies the slot of logsyncblk.flag */
+ lid_t lid;
+ int lsn; /* non-zero while on a log's synclist (see release_metapage) */
+ struct list_head synclist;
+ /* End of logsyncblk prefix */
+
+ unsigned long flag; /* META_* bit numbers, See Below */
+ unsigned long count; /* Reference count */
+ void *data; /* Data pointer: kmap'ed page address + offset; 0 once released */
+
+ /* list management stuff */
+ struct metapage *hash_prev;
+ struct metapage *hash_next; /* Also used for free list */
+
+ struct list_head inode_list; /* per-inode metapage list */
+ /*
+ * mapping & index become redundant, but we need these here to
+ * add the metapage to the hash before we have the real page
+ */
+ struct address_space *mapping;
+ unsigned long index; /* logical block number (hash key), not a page index */
+ wait_queue_head_t wait; /* tasks waiting for META_locked */
+
+ /* implementation */
+ struct page *page; /* backing page-cache page; 0 when none is attached */
+ unsigned long logical_size; /* size in bytes passed to __get_metapage() */
+
+ /* Journal management */
+ int clsn; /* presumably the commit lsn — TODO confirm */
+ atomic_t nohomeok; /* while non-zero, release_metapage() keeps the metapage cached */
+ struct jfs_log *log; /* log whose synclist holds this metapage */
+} metapage_t;
+
+/*
+ * Direct-access address space operations
+ */
+extern struct address_space_operations direct_aops;
+
+/* metapage flag: bit numbers for set_bit()/test_bit() on metapage.flag */
+#define META_locked 0 /* held via lock_metapage()/unlock_metapage() */
+#define META_absolute 1 /* obtained with absolute set (direct_mapping) */
+#define META_free 2 /* on the free list */
+#define META_dirty 3 /* written back by the final release_metapage() */
+#define META_sync 4 /* sync_metapage() after the write at release */
+#define META_discard 5 /* contents invalid: do not write, invalidate the page */
+#define META_forced 6 /* hold_metapage(force) found it already locked */
+
+#define mark_metapage_dirty(mp) set_bit(META_dirty, &(mp)->flag)
+
+/* function prototypes */
+extern metapage_t *__get_metapage(struct inode *inode,
+ unsigned long lblock, unsigned int size,
+ int absolute, unsigned long new);
+/* read_metapage: new == FALSE, so the page contents are read in (read_cache_page) */
+#define read_metapage(inode, lblock, size, absolute)\
+ __get_metapage(inode, lblock, size, absolute, FALSE)
+/* get_metapage: new == TRUE, so the page is only grabbed, not read (grab_cache_page) */
+#define get_metapage(inode, lblock, size, absolute)\
+ __get_metapage(inode, lblock, size, absolute, TRUE)
+
+extern void release_metapage(metapage_t *);
+/* Mark mp dirty and to-be-synced, then release it; a statement: flush_metapage(mp); (mp is evaluated 3 times) */
+#define flush_metapage(mp) \
+do {\
+ set_bit(META_dirty, &(mp)->flag);\
+ set_bit(META_sync, &(mp)->flag);\
+ release_metapage(mp);\
+} while (0)
+/* Synchronously write the buffers of the page-cache page backing mp; mp->page must be set */
+#define sync_metapage(mp) \
+ generic_buffer_fdatasync((struct inode *)(mp)->mapping->host,\
+ (mp)->page->index, (mp)->page->index + 1)
+/* Mark mp dirty and release it; a statement: write_metapage(mp); (mp is evaluated twice) */
+#define write_metapage(mp) \
+do {\
+ set_bit(META_dirty, &(mp)->flag);\
+ release_metapage(mp);\
+} while (0)
+/* Drop mp without writing it: clear dirty, set discard, release; a statement (mp is evaluated 3 times) */
+#define discard_metapage(mp) \
+do {\
+ clear_bit(META_dirty, &(mp)->flag);\
+ set_bit(META_discard, &(mp)->flag);\
+ release_metapage(mp);\
+} while (0)
+
+extern void hold_metapage(metapage_t *, int);
+
+/*
+ * This routine uses hash to explicitly find small number of pages
+ */
+extern void invalidate_metapages(struct inode *, unsigned long, unsigned long);
+
+/*
+ * This one uses mp_list to invalidate all pages for an inode
+ */
+extern void invalidate_inode_metapages(struct inode *inode);
+#endif /* _H_JFS_METAPAGE */
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
new file mode 100644
index 000000000000..04559d312280
--- /dev/null
+++ b/fs/jfs/jfs_mount.c
@@ -0,0 +1,541 @@
+/*
+ * MODULE_NAME: jfs_mount.c
+ *
+ * COMPONENT_NAME: sysjfs
+ *
+ *
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/*
+ * Change History :
+ *
+ */
+
+/*
+ * Module: jfs_mount.c
+ *
+ * note: file system in transition to aggregate/fileset:
+ *
+ * file system mount is interpreted as the mount of aggregate,
+ * if not already mounted, and mount of the single/only fileset in
+ * the aggregate;
+ *
+ * a file system/aggregate is represented by an internal inode
+ * (aka mount inode) initialized with aggregate superblock;
+ * each vfs represents a fileset, and points to its "fileset inode
+ * allocation map inode" (aka fileset inode):
+ * (an aggregate itself is structured recursively as a fileset:
+ * an internal vfs is constructed and points to its "fileset inode
+ * allocation map inode" (aka aggregate inode) where each inode
+ * represents a fileset inode) so that inode number is mapped to
+ * on-disk inode in uniform way at both aggregate and fileset level;
+ *
+ * each vnode/inode of a fileset is linked to its vfs (to facilitate
+ * per fileset inode operations, e.g., unmount of a fileset, etc.);
+ * each inode points to the mount inode (to facilitate access to
+ * per aggregate information, e.g., block size, etc.) as well as
+ * its file set inode.
+ *
+ * aggregate
+ * ipmnt
+ * mntvfs -> fileset ipimap+ -> aggregate ipbmap -> aggregate ipaimap;
+ * fileset vfs -> vp(1) <-> ... <-> vp(n) <->vproot;
+ */
+
+#include <linux/fs.h>
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_superblock.h"
+#include "jfs_dmap.h"
+#include "jfs_imap.h"
+#include "jfs_metapage.h"
+#include "jfs_debug.h"
+
+
+/*
+ * forward references
+ */
+static int chkSuper(struct super_block *);
+static int logMOUNT(struct super_block *sb);
+
+/*
+ * NAME: jfs_mount(sb)
+ *
+ * FUNCTION: vfs_mount(): validate the superblock, then open and mount the
+ * aggregate inode map, block map, secondary inode map and fileset map
+ * PARAMETER: sb - super block
+ *
+ * RETURN: 0 - success (sbi->ipaimap, ipbmap, ipaimap2, ipimap are set)
+ * EIO - a special inode could not be read (diReadSpecial)
+ * other - non-zero code passed up from chkSuper, diMount or dbMount
+ * (codes set here are positive; everything opened so far is
+ * released again before an error return)
+ */
+int jfs_mount(struct super_block *sb)
+{
+ int rc = 0; /* Return code */
+ struct jfs_sb_info *sbi = JFS_SBI(sb);
+ struct inode *ipaimap = NULL;
+ struct inode *ipaimap2 = NULL;
+ struct inode *ipimap = NULL;
+ struct inode *ipbmap = NULL;
+
+ jFYI(1, ("\nMount JFS\n"));
+
+ /*
+ * read/validate superblock
+ * (initialize mount inode from the superblock)
+ */
+ if ((rc = chkSuper(sb))) {
+ goto errout20;
+ }
+
+ ipaimap = diReadSpecial(sb, AGGREGATE_I);
+ if (ipaimap == NULL) {
+ jERROR(1, ("jfs_mount: Faild to read AGGREGATE_I\n"));
+ rc = EIO;
+ goto errout20;
+ }
+ sbi->ipaimap = ipaimap;
+
+ jFYI(1, ("jfs_mount: ipaimap:0x%p\n", ipaimap));
+
+ /*
+ * initialize aggregate inode allocation map
+ */
+ if ((rc = diMount(ipaimap))) {
+ jERROR(1,
+ ("jfs_mount: diMount(ipaimap) failed w/rc = %d\n",
+ rc));
+ goto errout21;
+ }
+
+ /*
+ * open aggregate block allocation map
+ */
+ ipbmap = diReadSpecial(sb, BMAP_I);
+ if (ipbmap == NULL) {
+ rc = EIO;
+ goto errout22;
+ }
+
+ jFYI(1, ("jfs_mount: ipbmap:0x%p\n", ipbmap));
+
+ sbi->ipbmap = ipbmap;
+
+ /*
+ * initialize aggregate block allocation map
+ */
+ if ((rc = dbMount(ipbmap))) {
+ jERROR(1, ("jfs_mount: dbMount failed w/rc = %d\n", rc));
+ goto errout23;
+ }
+
+ /*
+ * open the secondary aggregate inode allocation map
+ *
+ * This is a duplicate of the aggregate inode allocation map.
+ *
+ * hand craft a vfs in the same fashion as we did to read ipaimap.
+ * By adding INOSPEREXT (32) to the inode number, we are telling
+ * diReadSpecial that we are reading from the secondary aggregate
+ * inode table. This also creates a unique entry in the inode hash
+ * table.
+ */
+ if ((sbi->mntflag & JFS_BAD_SAIT) == 0) {
+ ipaimap2 = diReadSpecial(sb, AGGREGATE_I + INOSPEREXT);
+ if (ipaimap2 == 0) {
+ jERROR(1,
+ ("jfs_mount: Faild to read AGGREGATE_I\n"));
+ rc = EIO;
+ goto errout35;
+ }
+ sbi->ipaimap2 = ipaimap2;
+
+ jFYI(1, ("jfs_mount: ipaimap2:0x%p\n", ipaimap2));
+
+ /*
+ * initialize secondary aggregate inode allocation map
+ */
+ if ((rc = diMount(ipaimap2))) {
+ jERROR(1,
+ ("jfs_mount: diMount(ipaimap2) failed, rc = %d\n",
+ rc));
+ goto errout36;
+ }
+ } else
+ /* Secondary aggregate inode table is not valid */
+ sbi->ipaimap2 = 0;
+
+ /*
+ * mount (the only/single) fileset
+ */
+ /*
+ * open fileset inode allocation map (aka fileset inode)
+ */
+ ipimap = diReadSpecial(sb, FILESYSTEM_I);
+ if (ipimap == NULL) {
+ jERROR(1, ("jfs_mount: Failed to read FILESYSTEM_I\n"));
+ /* open fileset secondary inode allocation map */
+ rc = EIO;
+ goto errout40;
+ }
+ jFYI(1, ("jfs_mount: ipimap:0x%p\n", ipimap));
+
+ /* map further access of per fileset inodes by the fileset inode */
+ sbi->ipimap = ipimap;
+
+ /* initialize fileset inode allocation map */
+ if ((rc = diMount(ipimap))) {
+ jERROR(1, ("jfs_mount: diMount failed w/rc = %d\n", rc));
+ goto errout41;
+ }
+
+ jFYI(1, ("Mount JFS Complete.\n"));
+ goto out;
+
+ /*
+ * unwind on error: each label releases one more step, newest first
+ */
+ errout41: /* close fileset inode allocation map inode */
+ diFreeSpecial(ipimap);
+
+ errout40: /* fileset closed */
+
+ /* close secondary aggregate inode allocation map */
+ if (ipaimap2)
+ diUnmount(ipaimap2, 1);
+
+ errout36: /* free secondary map inode (also reached when its diMount failed) */
+ if (ipaimap2)
+ diFreeSpecial(ipaimap2);
+
+ errout35:
+
+ /* close aggregate block allocation map */
+ dbUnmount(ipbmap, 1);
+
+ errout23: /* free block map inode (also reached when dbMount failed) */
+ diFreeSpecial(ipbmap);
+
+ errout22: /* close aggregate inode allocation map */
+ diUnmount(ipaimap, 1);
+
+ errout21: /* close aggregate inodes */
+ diFreeSpecial(ipaimap);
+ errout20: /* aggregate closed */
+
+ out:
+
+ if (rc) {
+ jERROR(1, ("Mount JFS Failure: %d\n", rc));
+ }
+ return rc;
+}
+
+/*
+ * NAME: jfs_mount_rw(sb, remount)
+ * RETURN: 0 on success, -EINVAL if a remount finds the volume not clean, else a callee's code
+ * FUNCTION: Completes read-write mount, or remounts read-only volume
+ * as read-write
+ */
+int jfs_mount_rw(struct super_block *sb, int remount)
+{
+ struct jfs_sb_info *sbi = JFS_SBI(sb);
+ log_t *log;
+ int rc;
+
+ /*
+ * If we are re-mounting a previously read-only volume, we want to
+ * re-read the inode and block maps, since fsck.jfs may have updated
+ * them.
+ */
+ if (remount) {
+ if (chkSuper(sb) || (sbi->state != FM_CLEAN))
+ return -EINVAL;
+
+ truncate_inode_pages(sbi->ipimap->i_mapping, 0);
+ truncate_inode_pages(sbi->ipbmap->i_mapping, 0);
+ diUnmount(sbi->ipimap, 1);
+ if ((rc = diMount(sbi->ipimap))) {
+ jERROR(1,("jfs_mount_rw: diMount failed!\n"));
+ return rc; /* NOTE(review): the old map was already unmounted above — confirm callers cope */
+ }
+
+ dbUnmount(sbi->ipbmap, 1);
+ if ((rc = dbMount(sbi->ipbmap))) {
+ jERROR(1,("jfs_mount_rw: dbMount failed!\n"));
+ return rc;
+ }
+ }
+#ifdef _STILL_TO_PORT
+ /*
+ * get log device associated with the fs being mounted;
+ */
+ if (ipmnt->i_mntflag & JFS_INLINELOG) {
+ vfsp->vfs_logVPB = vfsp->vfs_hVPB;
+ vfsp->vfs_logvpfs = vfsp->vfs_vpfsi;
+ } else if (vfsp->vfs_logvpfs == NULL) {
+ /*
+ * XXX: there's only one external log per system;
+ */
+ jERROR(1, ("jfs_mount: Mount Failure! No Log Device.\n"));
+ goto errout30;
+ }
+
+ logdev = vfsp->vfs_logvpfs->vpi_unit;
+ ipmnt->i_logdev = logdev;
+#endif /* _STILL_TO_PORT */
+
+ /*
+ * open/initialize log
+ */
+ if ((rc = lmLogOpen(sb, &log)))
+ return rc;
+
+ JFS_SBI(sb)->log = log;
+
+ /*
+ * update file system superblock;
+ */
+ if ((rc = updateSuper(sb, FM_MOUNT))) {
+ jERROR(1,
+ ("jfs_mount: updateSuper failed w/rc = %d\n", rc));
+ lmLogClose(sb, log);
+ JFS_SBI(sb)->log = 0;
+ return rc;
+ }
+
+ /*
+ * write MOUNT log record of the file system
+ */
+ logMOUNT(sb); /* result ignored; logMOUNT() always returns 0 */
+
+ return rc; /* rc is 0 here: every failure above returned early */
+}
+
+/*
+ * chkSuper()
+ *
+ * validate the superblock of the file system to be mounted and
+ * get the file system parameters.
+ *
+ * returns
+ * 0 with sbi state, mntflag, bsize, l2bsize, nbperpage etc. set if check successful
+ * error code (positive EINVAL, or readSuper's code) if not successful
+ */
+static int chkSuper(struct super_block *sb)
+{
+ int rc = 0;
+ metapage_t *mp;
+ struct jfs_sb_info *sbi = JFS_SBI(sb);
+ struct jfs_superblock *j_sb;
+ int AIM_bytesize, AIT_bytesize;
+ int expected_AIM_bytesize, expected_AIT_bytesize;
+ s64 AIM_byte_addr, AIT_byte_addr, fsckwsp_addr;
+ s64 byte_addr_diff0, byte_addr_diff1;
+ s32 bsize;
+
+ if ((rc = readSuper(sb, &mp)))
+ return rc;
+ j_sb = (struct jfs_superblock *) (mp->data);
+
+ /*
+ * validate superblock
+ */
+ /* validate fs signature */
+ if (strncmp(j_sb->s_magic, JFS_MAGIC, 4) ||
+ j_sb->s_version != cpu_to_le32(JFS_VERSION)) {
+ //rc = EFORMAT;
+ rc = EINVAL;
+ goto out;
+ }
+
+ bsize = le32_to_cpu(j_sb->s_bsize);
+#ifdef _JFS_4K
+ if (bsize != PSIZE) {
+ jERROR(1, ("Currently only 4K block size supported!\n"));
+ rc = EINVAL;
+ goto out;
+ }
+#endif /* _JFS_4K */
+
+ jFYI(1, ("superblock: flag:0x%08x state:0x%08x size:0x%Lx\n",
+ le32_to_cpu(j_sb->s_flag), le32_to_cpu(j_sb->s_state),
+ (unsigned long long) le64_to_cpu(j_sb->s_size)));
+
+ /* validate the descriptors for Secondary AIM and AIT */
+ if ((j_sb->s_flag & cpu_to_le32(JFS_BAD_SAIT)) !=
+ cpu_to_le32(JFS_BAD_SAIT)) {
+ expected_AIM_bytesize = 2 * PSIZE;
+ AIM_bytesize = lengthPXD(&(j_sb->s_aim2)) * bsize;
+ expected_AIT_bytesize = 4 * PSIZE;
+ AIT_bytesize = lengthPXD(&(j_sb->s_ait2)) * bsize;
+ AIM_byte_addr = addressPXD(&(j_sb->s_aim2)) * bsize;
+ AIT_byte_addr = addressPXD(&(j_sb->s_ait2)) * bsize;
+ byte_addr_diff0 = AIT_byte_addr - AIM_byte_addr;
+ fsckwsp_addr = addressPXD(&(j_sb->s_fsckpxd)) * bsize;
+ byte_addr_diff1 = fsckwsp_addr - AIT_byte_addr;
+ if ((AIM_bytesize != expected_AIM_bytesize) ||
+ (AIT_bytesize != expected_AIT_bytesize) ||
+ (byte_addr_diff0 != AIM_bytesize) ||
+ (byte_addr_diff1 <= AIT_bytesize))
+ j_sb->s_flag |= cpu_to_le32(JFS_BAD_SAIT); /* changed in the metapage data only; not marked dirty here */
+ }
+
+ /* in release 1, the flag MUST reflect inline log, and group commit */
+ if ((j_sb->s_flag & cpu_to_le32(JFS_INLINELOG)) !=
+ cpu_to_le32(JFS_INLINELOG))
+ j_sb->s_flag |= cpu_to_le32(JFS_INLINELOG);
+ if ((j_sb->s_flag & cpu_to_le32(JFS_GROUPCOMMIT)) !=
+ cpu_to_le32(JFS_GROUPCOMMIT))
+ j_sb->s_flag |= cpu_to_le32(JFS_GROUPCOMMIT);
+ jFYI(0, ("superblock: flag:0x%08x state:0x%08x size:0x%Lx\n",
+ le32_to_cpu(j_sb->s_flag), le32_to_cpu(j_sb->s_state),
+ (unsigned long long) le64_to_cpu(j_sb->s_size)));
+
+ /* validate fs state: a volume that is not clean may only be mounted read-only */
+ if (j_sb->s_state != cpu_to_le32(FM_CLEAN) &&
+ !(sb->s_flags & MS_RDONLY)) {
+ jERROR(1,
+ ("jfs_mount: Mount Failure: File System Dirty.\n"));
+ rc = EINVAL;
+ goto out;
+ }
+
+ sbi->state = le32_to_cpu(j_sb->s_state);
+ sbi->mntflag = le32_to_cpu(j_sb->s_flag);
+
+ /*
+ * JFS always does I/O by 4K pages. Don't tell the buffer cache
+ * that we use anything else (leave s_blocksize alone).
+ */
+ sbi->bsize = bsize;
+ sbi->l2bsize = le16_to_cpu(j_sb->s_l2bsize);
+
+ /*
+ * For now, ignore s_pbsize, l2bfactor. All I/O going through buffer
+ * cache.
+ */
+ sbi->nbperpage = PSIZE >> sbi->l2bsize;
+ sbi->l2nbperpage = L2PSIZE - sbi->l2bsize;
+ sbi->l2niperblk = sbi->l2bsize - L2DISIZE;
+ if (sbi->mntflag & JFS_INLINELOG)
+ sbi->logpxd = j_sb->s_logpxd;
+ sbi->ait2 = j_sb->s_ait2;
+
+ out:
+ release_metapage(mp); /* both the success and the error paths drop the superblock metapage */
+
+ return rc;
+}
+
+
+/*
+ * updateSuper()
+ *
+ * update synchronously superblock if it is mounted read-write.
+ */
+int updateSuper(struct super_block *sb, uint state)
+{
+ int rc;
+ metapage_t *mp;
+ struct jfs_superblock *j_sb;
+
+ /*
+ * Only fsck can fix dirty state
+ */
+ if (JFS_SBI(sb)->state == FM_DIRTY)
+ return 0;
+
+ if ((rc = readSuper(sb, &mp)))
+ return rc;
+
+ j_sb = (struct jfs_superblock *) (mp->data);
+
+ j_sb->s_state = cpu_to_le32(state);
+ JFS_SBI(sb)->state = state;
+
+ if (state == FM_MOUNT) {
+ /* record log's dev_t and mount serial number */
+ j_sb->s_logdev =
+ cpu_to_le32(kdev_t_to_nr(JFS_SBI(sb)->log->dev));
+ j_sb->s_logserial = cpu_to_le32(JFS_SBI(sb)->log->serial);
+ } else if (state == FM_CLEAN) {
+ /*
+ * If this volume is shared with OS/2, OS/2 will need to
+ * recalculate DASD usage, since we don't deal with it.
+ */
+ if (j_sb->s_flag & cpu_to_le32(JFS_DASD_ENABLED))
+ j_sb->s_flag |= cpu_to_le32(JFS_DASD_PRIME);
+ }
+
+ flush_metapage(mp);
+
+ return 0;
+}
+
+
+/*
+ * readSuper()
+ *
+ * Fetch the superblock metapage: primary copy first, replica as fallback.
+ * Stores the metapage in *mpp; returns 0 if one was read, 1 if both failed.
+ */
+int readSuper(struct super_block *sb, metapage_t ** mpp)
+{
+ struct inode *direct = JFS_SBI(sb)->direct_inode;
+ metapage_t *super;
+
+ super = read_metapage(direct, SUPER1_OFF >> sb->s_blocksize_bits, PSIZE, 1);
+ if (!super) /* primary unreadable: try the secondary/replicated superblock */
+ super = read_metapage(direct, SUPER2_OFF >> sb->s_blocksize_bits, PSIZE, 1);
+
+ *mpp = super;
+ return super ? 0 : 1;
+}
+
+
+/*
+ * logMOUNT()
+ *
+ * function: write a MOUNT log record for file system.
+ *
+ * MOUNT record keeps logredo() from processing log records
+ * for this file system past this point in log.
+ * it is harmless if mount fails.
+ *
+ * note: MOUNT record is at aggregate level, not at fileset level,
+ * since log records of previous mounts of a fileset
+ * (e.g., AFTER record of extent allocation) have to be processed
+ * to update block allocation map at aggregate level.
+ */
+static int logMOUNT(struct super_block *sb)
+{
+ lrd_t mount_rec;
+
+ /* Only these five fields are set; the rest of the record is left as is */
+ mount_rec.type = cpu_to_le16(LOG_MOUNT);
+ mount_rec.aggregate = cpu_to_le32(kdev_t_to_nr(sb->s_dev));
+ mount_rec.logtid = 0;
+ mount_rec.backchain = 0;
+ mount_rec.length = 0;
+ lmLog(JFS_SBI(sb)->log, NULL, &mount_rec, NULL);
+
+ return 0;
+}
diff --git a/fs/jfs/jfs_superblock.h b/fs/jfs/jfs_superblock.h
new file mode 100644
index 000000000000..9f78c648aeed
--- /dev/null
+++ b/fs/jfs/jfs_superblock.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#ifndef _H_JFS_SUPERBLOCK
+#define _H_JFS_SUPERBLOCK
+/*
+ * jfs_superblock.h
+ */
+
+/*
+ * The magic number is spelled in ASCII so that a human can read it.
+ */
+#define JFS_MAGIC "JFS1" /* magic word: version 1; 4 chars, cf. s_magic[4] */
+
+#define JFS_VERSION 1 /* version number: version 1 */
+
+#define LV_NAME_SIZE 11 /* MUST BE 11 for OS/2 boot sector */
+
+/*
+ * aggregate superblock
+ *
+ * The name superblock is too close to super_block, so the name has been
+ * changed to jfs_superblock. The utilities are still using the old name.
+ */
+struct jfs_superblock {
+ char s_magic[4]; /* 4: magic number */
+ u32 s_version; /* 4: version number */
+
+ s64 s_size; /* 8: aggregate size in hardware/LVM blocks;
+ * VFS: number of blocks
+ */
+ s32 s_bsize; /* 4: aggregate block size in bytes;
+ * VFS: fragment size
+ */
+ s16 s_l2bsize; /* 2: log2 of s_bsize */
+ s16 s_l2bfactor; /* 2: log2(s_bsize/hardware block size) */
+ s32 s_pbsize; /* 4: hardware/LVM block size in bytes */
+ s16 s_l2pbsize; /* 2: log2 of s_pbsize */
+ s16 pad; /* 2: padding necessary for alignment */
+
+ u32 s_agsize; /* 4: allocation group size in aggr. blocks */
+
+ u32 s_flag; /* 4: aggregate attributes:
+ * see jfs_filsys.h
+ */
+ u32 s_state; /* 4: mount/unmount/recovery state:
+ * see jfs_filsys.h
+ */
+ s32 s_compress; /* 4: > 0 if data compression */
+
+ pxd_t s_ait2; /* 8: first extent of secondary
+ * aggregate inode table
+ */
+
+ pxd_t s_aim2; /* 8: first extent of secondary
+ * aggregate inode map
+ */
+ u32 s_logdev; /* 4: device address of log */
+ s32 s_logserial; /* 4: log serial number at aggregate mount */